### Imports

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import sys
import json
import os
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from terminaltables import AsciiTable
from collections import Counter
from nltk.corpus import wordnet as wn
from collections import defaultdict

os.makedirs("data", exist_ok=True)



### Load dataset

In [2]:
article_db = None

if os.path.exists('data/articles'):
    # Load saved reduced article db
    print ("Loading Article DB...")
    reduced_article_db = pd.read_pickle('data/articles')
    print ("+ Done.")
else:
    # Load full article DB
    lines = []
    with open('data/signalmedia-1m.jsonl', 'r') as f:
        for json_article in tqdm(f.readlines(), desc='Loading articles'):
            article = dict(json.loads(json_article))
            lines.append(article)

    print ("Load as Pandas DF...")
    article_db = pd.DataFrame(lines)
    print ("+ Done.")

    # Remove rows that has NaN values
    article_db.dropna(inplace=True)

Loading articles: 100%|██████████| 1000000/1000000 [00:53<00:00, 18830.42it/s]


Load as Pandas DF...
+ Done.


### Display column names

In [3]:
columns = article_db.columns if article_db is not None else reduced_article_db.columns
# Show column names
print ("Columns:")
for col in columns:
    print ("+ %s" % col)

Columns:
+ content
+ id
+ media-type
+ published
+ source
+ title


### Reduce and balance dataset

In [4]:
def print_media_occ(articles):
    """Count number of occurences of each class and print as a table"""
    num_news = np.sum(articles['blog'] == 0)
    num_blogs = np.sum(articles['blog'] == 1)
    
    table_data = [["Media Type", "Count"]]
    for media_type, count in [("News", num_news), ("Blog", num_blogs)]:
        table_data.append([media_type, count])
    print (AsciiTable(table_data).table)
    
if article_db is not None:
    # One-hot encoding of 'media-type': 'News' = 0, 'Blog = 1
    article_db.loc[article_db['media-type'] == 'News', 'blog'] = 0
    article_db.loc[article_db['media-type'] == 'Blog', 'blog'] = 1

    print_media_occ(article_db)
    
    print ("\nReduce and balance dataset...")
    n_of_each = 10000
    news_articles = article_db[article_db['blog'] == 0].iloc[:n_of_each]
    blog_articles = article_db[article_db['blog'] == 1].iloc[:n_of_each]
    reduced_article_db = pd.concat([news_articles, blog_articles])
    print ("+ Done.\n")
    
    print_media_occ(reduced_article_db)
    
    reduced_article_db.to_pickle("data/articles")
    

+------------+--------+
| Media Type | Count  |
+------------+--------+
| News       | 734488 |
| Blog       | 265512 |
+------------+--------+

Reduce and balance dataset...
+ Done.

+------------+-------+
| Media Type | Count |
+------------+-------+
| News       | 10000 |
| Blog       | 10000 |
+------------+-------+


### Helper methods for feature extraction

In [5]:
lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
sid = SentimentIntensityAnalyzer()

# Lemmatizing (the proper way, accounting for different POS tags)
def penn_to_wn(penn_tag):
    """
    Returns the corresponding WordNet POS tag for a Penn TreeBank POS tag.
    """
    if penn_tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        wn_tag = wn.NOUN
    elif penn_tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        wn_tag = wn.VERB
    elif penn_tag in ['RB', 'RBR', 'RBS']:
        wn_tag = wn.ADV
    elif penn_tag in ['JJ', 'JJR', 'JJS']:
        wn_tag = wn.ADJ
    else:
        wn_tag = None
    return wn_tag

def get_lemmas(tokens):
    """Lemmatize the list of tokens"""
    tagged_tokens = nltk.pos_tag(tokens)
    lemmas = []
    for token, pos in tagged_tokens:
        wn_tag = penn_to_wn(pos)
        lemma = lmtzr.lemmatize(token) if not wn_tag else lmtzr.lemmatize(token, wn_tag)
        lemmas.append(lemma)
    return lemmas

def token_count_in_articles(articles):
    """
    Return occurence count of tokens in all articles.

    :param DataFrame articles: Collection of articles
    :returns int: A count value
    """
    total_count = defaultdict(int)
    for i in tqdm(range(len(articles)), desc='Counting token occurences'):
        text = articles.iloc[i]['content'].lower()
        token_count = get_token_count(text)
        for token, count in token_count.items():
            total_count[token] += count
    return total_count

def get_token_count(text):
    """
    Count occurences of tokens in text.

    :param str text: Text in which method counts tokens
    :returns dict: Dict of tokens and their count
    """
    tokens = nltk.word_tokenize(text)
    token_counts = dict(Counter(tokens))
    return token_counts

### Display discrepancies of token occurences between distributions

In [6]:
news_articles = reduced_article_db[reduced_article_db['blog'] == 0]
blog_articles = reduced_article_db[reduced_article_db['blog'] == 1]
  
token_count_news = token_count_in_articles(news_articles)
token_count_blogs = token_count_in_articles(blog_articles)

# Get all tokens
tokens = set(token_count_news.keys())
tokens.update(list(token_count_blogs.keys()))

# Get the token occurence difference between news and blog articles
diff_count = defaultdict(int)
for token in tqdm(tokens, desc='Determine occurence diff.'):
    diff_count[token] += token_count_blogs.get(token, 0)
    diff_count[token] -= token_count_news.get(token, 0)

# Sort the tokens by the largest occurence difference
sorted_diff_count = sorted(diff_count.items(), key=lambda x: x[1], reverse=True)
first_hundred = sorted_diff_count[:100]
last_hundred = sorted_diff_count[-100::-1]

# Display the tokens with the largest difference 
# (idea is that the count of tokens with a large occurence diff will make good features)
print (AsciiTable([['Tokens with largest occurence diff. between media types']]).table)
print ("- Positive values indicate a higher occurence in blogs than news")
table_data = [['Num.', 'Token', 'Count Diff', 'Token', 'Count Diff']]
for i in range(len(first_hundred)):
    f_token, f_diff = first_hundred[i]
    l_token, l_diff = last_hundred[i]
    table_data.append([i, f_token, f_diff, l_token, l_diff])
print (AsciiTable(table_data).table)


Counting token occurences: 100%|██████████| 10000/10000 [00:52<00:00, 191.75it/s]
Counting token occurences: 100%|██████████| 10000/10000 [00:44<00:00, 225.82it/s]
Determine occurence diff.: 100%|██████████| 232763/232763 [00:00<00:00, 497422.79it/s]


+---------------------------------------------------------+
| Tokens with largest occurence diff. between media types |
+---------------------------------------------------------+
- Positive values indicate a higher occurence in blogs than news
+------+-----------+------------+---------------+------------+
| Num. | Token     | Count Diff | Token         | Count Diff |
+------+-----------+------------+---------------+------------+
| 0    | i         | 17999      | with          | -872       |
| 1    | you       | 16514      | customers     | -867       |
| 2    | ’         | 14686      | told          | -866       |
| 3    | :         | 10121      | group         | -863       |
| 4    | !         | 8657       | our           | -862       |
| 5    | it        | 8016       | rights        | -849       |
| 6    | your      | 7418       | against       | -837       |
| 7    | s         | 6927       | thursday      | -834       |
| 8    | my        | 6512       | australia     | -824       |

In [7]:
# Tokens whose occurence count will act as features
# Selected based on table above
tokens_to_check = ['i', 
                   'you', 
                   'me', 
                   'we', 
                   'my', 
                   'this', 
                   'that', 
                   'it', 
                   'like',
                   '--', 
                   '-', 
                   "''", 
                   '%', 
                   'said',
                   'told',
                   '?', 
                   '!', 
                   '’', 
                   ':']

### Build feature set

In [8]:

lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
sid = SentimentIntensityAnalyzer()

feature_set = []
for i in tqdm(range(reduced_article_db.shape[0]), desc='Building feature set'):
    article = reduced_article_db.iloc[i]
    article_text = article['content']
    
    article_features = {}
    article_features['article_length'] = len(article_text)
    
    # Get token count in the article
    token_counts = get_token_count(article_text.lower())

    # Token count features
    for token in tokens_to_check:
        article_features[token] = token_counts.get(token, 0)
        
    # Sentiment features
    sentiment = sid.polarity_scores(article_text)
    article_features['positivity'] = sentiment['pos']
    article_features['negativity'] = sentiment['neg']
    article_features['neutral'] = sentiment['neu']
    article_features['compound'] = sentiment['compound']
    
    # Target value
    article_features['blog'] = article['blog']
    
    feature_set.append(article_features)
    
dataset = pd.DataFrame(feature_set)

Building feature set: 100%|██████████| 20000/20000 [04:17<00:00, 77.81it/s]


### Save dataset

In [9]:
# Save dataset
dataset.to_pickle('data/dataset')