### Imports

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import sys
import json
import os
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from terminaltables import AsciiTable
from collections import Counter
from nltk.corpus import wordnet as wn
from collections import defaultdict



### Load dataset

In [2]:
article_db = None

if os.path.exists("data/articles"):
    # Load saved reduced article db
    print ("Loading Article DB...")
    reduced_article_db = pd.read_pickle("data/articles")
    print ("+ Done.")
else:
    # Load full article DB and load as Pandas DF
    lines = []
    with open('data/signalmedia-1m.jsonl', 'r') as f:
        for json_article in tqdm(f.readlines(), desc="Loading articles"):
            article = dict(json.loads(json_article))
            lines.append(article)

    print ("Load as Pandas DF...")
    article_db = pd.DataFrame(lines)
    print ("+ Done.")

    # Remove rows that has NaN values
    article_db.dropna(inplace=True)

Loading Article DB...
+ Done.


### Display column names

In [3]:
columns = article_db.columns if article_db is not None else reduced_article_db.columns
# Show column names
print ("Columns:")
for col in columns:
    print ("+ %s" % col)

Columns:
+ content
+ id
+ media-type
+ published
+ source
+ title
+ blog


### Reduce and balance dataset

In [4]:
def print_media_occ(articles):
    """Count number of occurences of each class and print as a table"""
    num_news = np.sum(articles['blog'] == 0)
    num_blogs = np.sum(articles['blog'] == 1)
    
    table_data = [["Media Type", "Count"]]
    for media_type, count in [("News", num_news), ("Blog", num_blogs)]:
        table_data.append([media_type, count])
    print (AsciiTable(table_data).table)
    
if article_db is not None:
    # One-hot encoding of 'media-type': 'News' = 0, 'Blog = 1
    article_db.loc[article_db['media-type'] == 'News', 'blog'] = 0
    article_db.loc[article_db['media-type'] == 'Blog', 'blog'] = 1

    print_media_occ(article_db)
    
    print ("\nReduce and balance dataset...")
    n_of_each = 3000
    # Pick n_of_each articles of each media-type
    news_articles = article_db[article_db['blog'] == 0].iloc[:n_of_each]
    blog_articles = article_db[article_db['blog'] == 1].iloc[:n_of_each]
    # Save as reduced article dataset
    reduced_article_db = pd.concat([news_articles, blog_articles])
    print ("+ Done.\n")
    
    print_media_occ(reduced_article_db)
    
    reduced_article_db.to_pickle("data/articles")
    

### Helper methods for feature extraction

In [5]:
lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
sid = SentimentIntensityAnalyzer()


# Lemmatizing (the proper way, accounting for different POS tags)
def penn_to_wn(penn_tag):
    """
    Returns the corresponding WordNet POS tag for a Penn TreeBank POS tag.
    """
    if penn_tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        wn_tag = wn.NOUN
    elif penn_tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        wn_tag = wn.VERB
    elif penn_tag in ['RB', 'RBR', 'RBS']:
        wn_tag = wn.ADV
    elif penn_tag in ['JJ', 'JJR', 'JJS']:
        wn_tag = wn.ADJ
    else:
        wn_tag = None
    return wn_tag

def get_lemmas(tokens):
    """Lemmatize the list of tokens"""
    tagged_tokens = nltk.pos_tag(tokens)
    lemmas = []
    for token, pos in tagged_tokens:
        wn_tag = penn_to_wn(pos)
        lemma = lmtzr.lemmatize(token) if not wn_tag else lmtzr.lemmatize(token, wn_tag)
        lemmas.append(lemma)
    return lemmas

def get_token_count(articles):
    """
    Return a collective count for the tokens in the articles.

    :param articles: Pandas DF of articles
    :returns: A count value
    """
    total_count = defaultdict(int)
    for i in tqdm(range(len(articles)), desc="Counting token occurences"):
        text = articles.iloc[i]['content']
        tokens = nltk.word_tokenize(text)
        token_count = dict(Counter(tokens))
        for token, count in token_count.items():
            total_count[token] += count
    return total_count

def sum_token_count(token_counts, tokens_to_check):
    """
    Return the sum of counts for tokens in tokens_to_check.

    :param token_counts: Dictionary with tokens and counts
    :param tokens_to_check: List of tokens
    :returns: A count value
    """
    cnt = 0
    for token in tokens_to_check:
        cnt += token_counts.get(token, 0)
    return cnt

### Display discrepancies of token occurences between distributions

In [6]:
news_articles = reduced_article_db[reduced_article_db['blog'] == 0]
blog_articles = reduced_article_db[reduced_article_db['blog'] == 1]
  
token_count_news = get_token_count(news_articles)
token_count_blogs = get_token_count(blog_articles)

# Get all tokens
tokens = list(token_count_news.keys())
tokens.extend(list(token_count_blogs.keys()))
tokens = set(tokens)

# Get the token occurence difference between news and blog articles
diff_count = defaultdict(int)
for token in tqdm(tokens, desc="Determine occurence diff."):
    diff_count[token] += token_count_blogs.get(token, 0)
    diff_count[token] -= token_count_news.get(token, 0)

# Sort the tokens by the largest occurence diff. and display the 200 where
# the difference was the largest
sorted_diff_count = sorted(diff_count.items(), key=lambda x: x[1], reverse=True)
first_hundred = sorted_diff_count[:100]
last_hundred = sorted_diff_count[-100::-1]

# Display the tokens with the largest difference 
# (idea is that the count of tokens with a large occurence diff will make good features)
table_data = [["Num.", "Token", "Count Diff", "Token", "Count Diff"]]
for i, ((f_token, f_diff), (l_token, l_diff)) in enumerate(zip(first_hundred, last_hundred)):
    table_data.append([i, f_token, f_diff, l_token, l_diff])
print (AsciiTable(table_data).table)


Counting token occurences: 100%|██████████| 3000/3000 [00:16<00:00, 180.62it/s]
Counting token occurences: 100%|██████████| 3000/3000 [00:13<00:00, 220.07it/s]
Determine occurence diff.: 100%|██████████| 128241/128241 [00:00<00:00, 466269.71it/s]

+------+-----------+------------+-------------+------------+
| Num. | Token     | Count Diff | Token       | Count Diff |
+------+-----------+------------+-------------+------------+
| 0    | I         | 5143       | 2014        | -261       |
| 1    | ’         | 4817       | Australia   | -259       |
| 2    | you       | 4630       | customers   | -255       |
| 3    | :         | 3345       | stock       | -254       |
| 4    | !         | 2367       | August      | -249       |
| 5    | s         | 2342       | across      | -248       |
| 6    | your      | 2268       | which       | -245       |
| 7    | it        | 2211       | management  | -241       |
| 8    | ?         | 1799       | b           | -241       |
| 9    | my        | 1718       | she         | -239       |
| 10   | this      | 1435       | says        | -237       |
| 11   | that      | 1430       | Thursday    | -236       |
| 12   | is        | 1114       | Sunday      | -235       |
| 13   | can       | 109




### Build feature set

In [7]:

lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
sid = SentimentIntensityAnalyzer()

def build_feature_set(article):
    """Turn article into feature set on which the model can train on."""
    article_text = article["content"]

    sample = pd.DataFrame()    
    
    # Length features
    sample.loc[0, "article_length"] = len(article_text)
    
    # Get token count in the article
    tokens = nltk.word_tokenize(article_text.lower())
    token_counts = dict(Counter(tokens))
    
    # A list of tokens to check the occurence of in the article
    tokens_to_check = [
        ("blog_tokens", {"i", "you", "me", "we", "my", "mine", "this", "that", "it", "is", "like"}),
        ("news_char_occ", {"--", "-", "``", ",", "''", ";", "%", "said"}),
        ("blog_char_occ", {"?", "!", "’", ":"}),
    ]

    # Sets of special tokens - Chosen by the token count for each article type above
    for label, token_set in tokens_to_check:
        # Get occurence in this article
        occurence = sum_token_count(token_counts, token_set)
        sample.loc[0, label] = occurence
    
    # Sentiment features
    sentiment = sid.polarity_scores(article_text)
    sample.loc[0, "positivity"] = sentiment['pos']
    sample.loc[0, "negativity"] = sentiment['neg']
    sample.loc[0, "neutral"] = sentiment['neu']
    sample.loc[0, "compound"] = sentiment['compound']
    
    # Target value
    sample.loc[0, "blog"] = article["blog"]
    
    return sample.loc[0]


# Progress bar for progress_apply
tqdm.pandas(desc="Building feature set")

# For each article => extract feature set
dataset = reduced_article_db.progress_apply(lambda x: build_feature_set(x), axis=1)


Building feature set: 6001it [02:12, 45.21it/s]                          


### Save dataset

In [8]:
# Save dataset
dataset.to_pickle("data/dataset")