### Imports

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import sys
import json
import os
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from terminaltables import AsciiTable
from collections import Counter
from nltk.corpus import wordnet as wn
from collections import defaultdict

os.makedirs("data", exist_ok=True)



### Load dataset

In [2]:
article_db = None

if os.path.exists("data/articles"):
    # Load saved reduced article db
    print ("Loading Article DB...")
    reduced_article_db = pd.read_pickle("data/articles")
    print ("+ Done.")
else:
    # Load full article DB and load as Pandas DF
    lines = []
    with open('data/signalmedia-1m.jsonl', 'r') as f:
        for json_article in tqdm(f.readlines(), desc="Loading articles"):
            article = dict(json.loads(json_article))
            lines.append(article)

    print ("Load as Pandas DF...")
    article_db = pd.DataFrame(lines)
    print ("+ Done.")

    # Remove rows that has NaN values
    article_db.dropna(inplace=True)

Loading articles: 100%|██████████| 1000000/1000000 [00:52<00:00, 19072.63it/s]


Load as Pandas DF...
+ Done.


### Display column names

In [3]:
columns = article_db.columns if article_db is not None else reduced_article_db.columns
# Show column names
print ("Columns:")
for col in columns:
    print ("+ %s" % col)

Columns:
+ content
+ id
+ media-type
+ published
+ source
+ title


### Reduce and balance dataset

In [4]:
def print_media_occ(articles):
    """Count number of occurences of each class and print as a table"""
    num_news = np.sum(articles['blog'] == 0)
    num_blogs = np.sum(articles['blog'] == 1)
    
    table_data = [["Media Type", "Count"]]
    for media_type, count in [("News", num_news), ("Blog", num_blogs)]:
        table_data.append([media_type, count])
    print (AsciiTable(table_data).table)
    
if article_db is not None:
    # One-hot encoding of 'media-type': 'News' = 0, 'Blog = 1
    article_db.loc[article_db['media-type'] == 'News', 'blog'] = 0
    article_db.loc[article_db['media-type'] == 'Blog', 'blog'] = 1

    print_media_occ(article_db)
    
    print ("\nReduce and balance dataset...")
    n_of_each = 5000
    # Pick n_of_each articles of each media-type
    news_articles = article_db[article_db['blog'] == 0].iloc[:n_of_each]
    blog_articles = article_db[article_db['blog'] == 1].iloc[:n_of_each]
    # Save as reduced article dataset
    reduced_article_db = pd.concat([news_articles, blog_articles])
    print ("+ Done.\n")
    
    print_media_occ(reduced_article_db)
    
    reduced_article_db.to_pickle("data/articles")
    

+------------+--------+
| Media Type | Count  |
+------------+--------+
| News       | 734488 |
| Blog       | 265512 |
+------------+--------+

Reduce and balance dataset...
+ Done.

+------------+-------+
| Media Type | Count |
+------------+-------+
| News       | 5000  |
| Blog       | 5000  |
+------------+-------+


### Helper methods for feature extraction

In [5]:
lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
sid = SentimentIntensityAnalyzer()


# Lemmatizing (the proper way, accounting for different POS tags)
def penn_to_wn(penn_tag):
    """
    Returns the corresponding WordNet POS tag for a Penn TreeBank POS tag.
    """
    if penn_tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        wn_tag = wn.NOUN
    elif penn_tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        wn_tag = wn.VERB
    elif penn_tag in ['RB', 'RBR', 'RBS']:
        wn_tag = wn.ADV
    elif penn_tag in ['JJ', 'JJR', 'JJS']:
        wn_tag = wn.ADJ
    else:
        wn_tag = None
    return wn_tag

def get_lemmas(tokens):
    """Lemmatize the list of tokens"""
    tagged_tokens = nltk.pos_tag(tokens)
    lemmas = []
    for token, pos in tagged_tokens:
        wn_tag = penn_to_wn(pos)
        lemma = lmtzr.lemmatize(token) if not wn_tag else lmtzr.lemmatize(token, wn_tag)
        lemmas.append(lemma)
    return lemmas

def get_token_count(articles):
    """
    Return a collective count for the tokens in the articles.

    :param DataFrame articles: Collection of articles
    :returns int: A count value
    """
    total_count = defaultdict(int)
    for i in tqdm(range(len(articles)), desc="Counting token occurences"):
        text = articles.iloc[i]['content'].lower()
        tokens = nltk.word_tokenize(text)
        token_count = dict(Counter(tokens))
        for token, count in token_count.items():
            total_count[token] += count
    return total_count

def sum_token_count(token_counts, tokens_to_check):
    """
    For the list tokens_to_check, return the collective count of those tokens
    according to token_counts.

    :param list token_counts: Dictionary with tokens and counts
    :param list tokens_to_check: List of tokens
    :returns int: A count value
    """
    cnt = 0
    for token in tokens_to_check:
        cnt += token_counts.get(token, 0)
    return cnt

### Display discrepancies of token occurences between distributions

In [6]:
news_articles = reduced_article_db[reduced_article_db['blog'] == 0]
blog_articles = reduced_article_db[reduced_article_db['blog'] == 1]
  
token_count_news = get_token_count(news_articles)
token_count_blogs = get_token_count(blog_articles)

# Get all tokens
tokens = list(token_count_news.keys())
tokens.extend(list(token_count_blogs.keys()))
tokens = set(tokens)

# Get the token occurence difference between news and blog articles
diff_count = defaultdict(int)
for token in tqdm(tokens, desc="Determine occurence diff."):
    diff_count[token] += token_count_blogs.get(token, 0)
    diff_count[token] -= token_count_news.get(token, 0)

# Sort the tokens by the largest occurence diff. and display the 200 where
# the difference was the largest
sorted_diff_count = sorted(diff_count.items(), key=lambda x: x[1], reverse=True)
first_hundred = sorted_diff_count[:100]
last_hundred = sorted_diff_count[-100::-1]

# Display the tokens with the largest difference 
# (idea is that the count of tokens with a large occurence diff will make good features)
table_data = [["Num.", "Token", "Count Diff", "Token", "Count Diff"]]
for i, ((f_token, f_diff), (l_token, l_diff)) in enumerate(zip(first_hundred, last_hundred)):
    table_data.append([i, f_token, f_diff, l_token, l_diff])
print (AsciiTable(table_data).table)


Counting token occurences: 100%|██████████| 5000/5000 [00:25<00:00, 196.52it/s]
Counting token occurences: 100%|██████████| 5000/5000 [00:23<00:00, 213.98it/s]
Determine occurence diff.: 100%|██████████| 149710/149710 [00:00<00:00, 450530.58it/s]


+------+-----------+------------+---------------+------------+
| Num. | Token     | Count Diff | Token         | Count Diff |
+------+-----------+------------+---------------+------------+
| 0    | i         | 8628       | results       | -464       |
| 1    | you       | 8619       | thursday      | -462       |
| 2    | ’         | 6995       | cent          | -458       |
| 3    | :         | 4881       | country       | -450       |
| 4    | !         | 4119       | rating        | -446       |
| 5    | it        | 4100       | billion       | -444       |
| 6    | your      | 4002       | average       | -439       |
| 7    | s         | 3305       | rights        | -438       |
| 8    | my        | 3299       | u.s.          | -438       |
| 9    | this      | 2874       | r             | -434       |
| 10   | ?         | 2864       | told          | -433       |
| 11   | that      | 2307       | against       | -432       |
| 12   | so        | 1953       | saturday      | -429 

In [7]:
# Tokens whose occurence count will act as features
# Selected based on table above
tokens_to_check = ["i", 
                   "you", 
                   "me", 
                   "we", 
                   "my", 
                   "this", 
                   "that", 
                   "it", 
                   "like", \
                   "--", 
                   "-", 
                   "''", 
                   "%", 
                   "said", 
                   "?", 
                   "!", 
                   "’", 
                   ":"]

### Build feature set

In [8]:

lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
sid = SentimentIntensityAnalyzer()

feature_set = []

for i in tqdm(range(reduced_article_db.shape[0]), desc="Building feature set"):
    article = reduced_article_db.iloc[i]
    article_text = article["content"]
    
    article_features = {}
    article_features["article_length"] = len(article_text)
    
    # Get token count in the article
    tokens = nltk.word_tokenize(article_text.lower())
    token_counts = dict(Counter(tokens))

    # Token count features
    for token in tokens_to_check:
        # Get occurence in this article
        occurence = token_counts.get(token, 0)
        article_features[token] = occurence
        
    # Sentiment features
    sentiment = sid.polarity_scores(article_text)
    article_features["positivity"] = sentiment['pos']
    article_features["negativity"] = sentiment['neg']
    article_features["neutral"] = sentiment['neu']
    article_features["compound"] = sentiment['compound']
    
    # Target value
    article_features["blog"] = article["blog"]
    
    feature_set.append(article_features)
    
dataset = pd.DataFrame(feature_set)

Building feature set: 100%|██████████| 10000/10000 [02:00<00:00, 82.74it/s]


### Save dataset

In [11]:
# Save dataset
dataset.to_pickle("data/dataset")

In [12]:
print (dataset.shape)

(10000, 24)
