### Imports

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import sys
import json
import os
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from terminaltables import AsciiTable
from collections import Counter
from nltk.corpus import wordnet as wn
from collections import defaultdict

from nltk.corpus import stopwords
stop_words_en = set(stopwords.words('english'))

PATH_TO_DATA = '../data'

article_db = None
reduced_article_db = None

os.makedirs(PATH_TO_DATA, exist_ok=True)



### Load reduced dataset

In [2]:
if os.path.exists('%s/articles' % PATH_TO_DATA):
    # Load saved reduced article db
    print ("Loading reduced article DB...")
    reduced_article_db = pd.read_pickle('%s/articles' % PATH_TO_DATA)
    print ("+ Done.")

Loading reduced article DB...
+ Done.


### Load full dataset

In [3]:
# Load full article DB
lines = []
with open('%s/signalmedia-1m.jsonl' % PATH_TO_DATA, 'r') as f:
    for json_article in tqdm(f.readlines(), desc='Loading articles'):
        article = dict(json.loads(json_article))
        # Extract article id, content and media type
        lines.append({'id': article['id'], 
                      'content': article['content'], 
                      'media-type': article['media-type']})

print ("Load as Pandas DF...")
article_db = pd.DataFrame(lines)
print ("+ Done.")

# One-hot encoding of 'media-type': 'News' = 0, 'Blog = 1
article_db.loc[article_db['media-type'] == 'News', 'blog'] = 0
article_db.loc[article_db['media-type'] == 'Blog', 'blog'] = 1

# Remove rows that has NaN values
article_db.dropna(inplace=True)

Loading articles: 100%|██████████| 1000000/1000000 [00:58<00:00, 17105.71it/s]


Load as Pandas DF...
+ Done.


### Display column names

In [4]:
# Show column names
print ("Columns:")
for col in article_db.columns:
    print ("+ %s" % col)

Columns:
+ content
+ id
+ media-type
+ blog


### Reduce and balance dataset

In [9]:

def print_media_occ(article_type, articles):
    """Count number of occurences of each class and print as a table"""
    print (article_type)
    num_news = np.sum(articles['blog'] == 0)
    num_blogs = np.sum(articles['blog'] == 1)
    
    table_data = [["Media Type", "Count"]]
    for media_type, count in [("News", num_news), ("Blog", num_blogs)]:
        table_data.append([media_type, count])
    print (AsciiTable(table_data).table)
  

print_media_occ('Article DB', article_db)
print ()

if reduced_article_db is None:
    
    print ("\nReduce and balance dataset...")
    n_of_each = 10000
    news_articles = article_db[article_db['blog'] == 0].iloc[:n_of_each]
    blog_articles = article_db[article_db['blog'] == 1].iloc[:n_of_each]
    reduced_article_db = pd.concat([news_articles, blog_articles])
    print ("+ Done.\n")
    
    print_media_occ(reduced_article_db)
    
    reduced_article_db.to_pickle("data/articles")
    
else:
    print_media_occ('Reduced Article DB', reduced_article_db)
    

Article DB
+------------+--------+
| Media Type | Count  |
+------------+--------+
| News       | 734488 |
| Blog       | 265512 |
+------------+--------+

Reduced Article DB
+------------+-------+
| Media Type | Count |
+------------+-------+
| News       | 10000 |
| Blog       | 10000 |
+------------+-------+


### Helper methods for feature extraction

In [8]:
lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
sid = SentimentIntensityAnalyzer()

# Lemmatizing (the proper way, accounting for different POS tags)
def penn_to_wn(penn_tag):
    """
    Returns the corresponding WordNet POS tag for a Penn TreeBank POS tag.
    """
    if penn_tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        wn_tag = wn.NOUN
    elif penn_tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        wn_tag = wn.VERB
    elif penn_tag in ['RB', 'RBR', 'RBS']:
        wn_tag = wn.ADV
    elif penn_tag in ['JJ', 'JJR', 'JJS']:
        wn_tag = wn.ADJ
    else:
        wn_tag = None
    return wn_tag

def get_lemmas(tokens):
    """Lemmatize the list of tokens"""
    tagged_tokens = nltk.pos_tag(tokens)
    lemmas = []
    for token, pos in tagged_tokens:
        wn_tag = penn_to_wn(pos)
        lemma = lmtzr.lemmatize(token) if not wn_tag else lmtzr.lemmatize(token, wn_tag)
        lemmas.append(lemma)
    return lemmas

def token_count_in_articles(articles):
    """
    Return occurence count of tokens in all articles.

    :param DataFrame articles: Collection of articles
    :returns int: A count value
    """
    total_count = defaultdict(int)
    for i in tqdm(range(len(articles)), desc='Counting token occurences'):
        text = articles.iloc[i]['content'].lower()
        tokens = nltk.word_tokenize(text)
        token_count = get_token_count(tokens)
        for token, count in token_count.items():
            total_count[token] += count
    return total_count

def get_token_count(tokens):
    """
    Count occurences of tokens in text.

    :param list tokens: List of tokens
    :returns dict: Dict of tokens and their count
    """
    token_counts = dict(Counter(tokens))
    return token_counts

def get_spelling_errors(text):
    """
    Count number of spelling errors in text.

    :param str text
    :returns int: Number of spelling errors
    """
    tokens = nltk.word_tokenize(text)
    cnt = 0
    for token in tokens:
        # If spelling error and token is not a stop word
        if not (wn.synsets(token) and token in stop_words_en):
            cnt += 1
    return cnt



### Display discrepancies of token occurences between media types

In [7]:
# Randomly select 40 000 articles of each type and measure token occurence discrepancy

idx = np.random.randint(0, 200000, size=40000)

news_articles = article_db[article_db['media-type'] == 'News'].iloc[idx]
blog_articles = article_db[article_db['media-type'] == 'Blog'].iloc[idx]
  
token_count_news = token_count_in_articles(news_articles)
token_count_blogs = token_count_in_articles(blog_articles)

# Get all tokens
tokens = set(token_count_news.keys())
tokens.update(list(token_count_blogs.keys()))

# Get the token occurence difference between news and blog articles
diff_count = defaultdict(int)
for token in tqdm(tokens, desc='Determine occurence diff.'):
    diff_count[token] += token_count_blogs.get(token, 0)
    diff_count[token] -= token_count_news.get(token, 0)

# Sort the tokens by the largest occurence difference
sorted_diff_count = sorted(diff_count.items(), key=lambda x: x[1], reverse=True)
first_hundred = sorted_diff_count[:100]
last_hundred = sorted_diff_count[-100::-1]

# Display the tokens with the largest difference 
# (idea is that the count of tokens with a large occurence diff will make good features)
print (AsciiTable([['Tokens with largest occurence diff. between media types']]).table)
print ("- Positive values indicate a higher occurence in blogs than news")
table_data = [['Num.', 'Token', 'Count Diff', 'Token', 'Count Diff']]
for i in range(len(first_hundred)):
    f_token, f_diff = first_hundred[i]
    l_token, l_diff = last_hundred[i]
    table_data.append([i, f_token, f_diff, l_token, l_diff])
print (AsciiTable(table_data).table)



Counting token occurences: 100%|██████████| 40000/40000 [03:34<00:00, 186.69it/s]
Counting token occurences: 100%|██████████| 40000/40000 [05:06<00:00, 130.58it/s]
Determine occurence diff.: 100%|██████████| 548931/548931 [00:01<00:00, 341736.67it/s]

+---------------------------------------------------------+
| Tokens with largest occurence diff. between media types |
+---------------------------------------------------------+
- Positive values indicate a higher occurence in blogs than news
+------+-----------+------------+---------------+------------+
| Num. | Token     | Count Diff | Token         | Count Diff |
+------+-----------+------------+---------------+------------+
| 0    | i         | 74279      | markets       | -3313      |
| 1    | you       | 68589      | cent          | -3304      |
| 2    | ’         | 58196      | china         | -3266      |
| 3    | !         | 34868      | former        | -3260      |
| 4    | it        | 33865      | over          | -3254      |
| 5    | :         | 32741      | told          | -3254      |
| 6    | your      | 29974      | securities    | -3246      |
| 7    | my        | 27386      | thursday      | -3242      |
| 8    | s         | 24706      | source        | -3199      |




In [11]:
# Cherry-picked tokens whose occurence count will act as features
tokens_to_check = ['i', 
                   'you', 
                   'me', 
                   'we', 
                   'my', 
                   'this', 
                   'that', 
                   'it', 
                   'like',
                   '--', 
                   '-', 
                   "''", 
                   '%', 
                   'said',
                   'told',
                   '?', 
                   '!', 
                   '’', 
                   ':']

### Build feature set

In [12]:

lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
sid = SentimentIntensityAnalyzer()

feature_set = []
for i in tqdm(range(reduced_article_db.shape[0]), desc='Building feature set'):
    article = reduced_article_db.iloc[i]
    article_text = article['content']
    
    tokens = nltk.word_tokenize(article_text.lower())
    sentences = nltk.sent_tokenize(article_text.lower())
    
    article_features = {}
    article_features['article_length'] = len(article_text)
    # Average token length
    article_features['token_length'] = sum(len(t) for t in tokens) / len(tokens)
    # Spelling errors per token
    article_features['spelling_errors'] = get_spelling_errors(article_text) / len(tokens)
    
    # Get token count in the article
    token_counts = get_token_count(tokens)

    # Token count features
    for token in tokens_to_check:
        article_features[token] = token_counts.get(token, 0)
      
    # Sentiment features, averaged per sentence
    sentiments = {'pos': 0, 'neg': 0, 'neu': 0, 'comp': 0}
    for sentence in sentences:
        sentiment = sid.polarity_scores(sentence)
        sentiments['pos'] += sentiment['pos']
        sentiments['neg'] += sentiment['neg']
        sentiments['neu'] += sentiment['neu']
        sentiments['comp'] += sentiment['compound']
    
    article_features['sent_pos'] = sentiments['pos'] / len(sentences)
    article_features['sent_neg'] = sentiments['neg'] / len(sentences)
    article_features['sent_neu'] = sentiments['neu'] / len(sentences)
    article_features['sent_comp'] = sentiments['comp'] / len(sentences)
    
    # Target value
    article_features['blog'] = article['blog']
    
    feature_set.append(article_features)
    
dataset = pd.DataFrame(feature_set)

Building feature set: 100%|██████████| 20000/20000 [09:52<00:00, 33.75it/s]


### Save dataset

In [13]:
# Save dataset
dataset.to_pickle('%s/dataset' % PATH_TO_DATA)