In [44]:
import numpy as np
import pandas as pd
import codecs
import pickle
import re
from functools import reduce
from nltk import tokenize
from nltk.corpus import stopwords
from nltk import FreqDist

In [47]:
# pickle.load can execute arbitrary code; only load pickles produced by this project
with open('../data/trump_source_sentiment_df.pkl', 'rb') as pkl_file:
    trump_source = pickle.load(pkl_file)

In [48]:
# pickle.load can execute arbitrary code; only load pickles produced by this project
with open('../data/trump_shares_sentiment_df.pkl', 'rb') as pkl_file:
    trump_shares = pickle.load(pkl_file)

In [49]:
# pickle.load can execute arbitrary code; only load pickles produced by this project
with open('../data/clinton_source_sentiment_df.pkl', 'rb') as pkl_file:
    clinton_source = pickle.load(pkl_file)

In [50]:
# pickle.load can execute arbitrary code; only load pickles produced by this project
with open('../data/clinton_shares_sentiment_df.pkl', 'rb') as pkl_file:
    clinton_shares = pickle.load(pkl_file)

In [51]:
#Stack the four loaded dataframes into one - the 'origin' column identifies which frame a row came from
#Row index labels are kept as-is by concat, so they may repeat across the four inputs

frames = [
    trump_source,
    trump_shares,
    clinton_source,
    clinton_shares,
]
df = pd.concat(objs=frames)

## Unpacking Nested Features

In [4]:
#Function to unpack nested features from columns containing dictionaries

def unpack_column(df, input_column, key, output_column):
    """Extract one key from a column of dictionaries into a new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from and write to; it is modified in place.
    input_column : str
        Name of the column whose values are dictionaries.
    key : hashable
        Key to look up in each dictionary.
    output_column : str
        Name of the column that receives the extracted values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``output_column`` added or overwritten.

    Notes
    -----
    Rows whose value has no ``get`` method (e.g. ``None`` or NaN) yield NaN
    for that row only; the other rows keep their extracted values. A
    dictionary that lacks ``key`` yields ``None``, as ``dict.get`` does. If
    ``input_column`` does not exist, ``output_column`` is filled with NaN.
    """
    if input_column not in df.columns:
        # Same fallback as before for a missing input column: an all-NaN output column.
        df[output_column] = np.nan
        return df

    def _lookup(value):
        # Guard each row separately so one malformed value cannot blank the whole column.
        getter = getattr(value, 'get', None)
        return getter(key) if callable(getter) else np.nan

    df[output_column] = df[input_column].apply(_lookup)
    return df

In [5]:
# Pull the 'uri' entry of 'source' and the 'facebook' entry of 'shares' into flat columns
df = unpack_column(df, input_column='source', key='uri', output_column='news_source')
df = unpack_column(df, input_column='shares', key='facebook', output_column='facebook_shares')

## Remove Satirical News Sources

In [11]:
# news_source values treated as satire; rows from these sources are removed in the next cell
satire = [
    'theonion.com',
    'cracked.com',
]

In [12]:
# Keep only rows whose news_source is not in the satire list.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells (df['date_format'] = ..., df['date_format_date'] = ...)
# do not trigger SettingWithCopyWarning.
df = df[~df['news_source'].isin(satire)].copy()

## Word Counts

In [14]:
#Load custom stop words to be used along with NLTK stop words
#Each file is read as a headerless CSV (header=None), so the first row is data and columns are numbered from 0

article_stop_words = pd.read_csv('../settings/stop_words.csv', header=None)
trump_stop_words = pd.read_csv('../settings/trump_title_stop_words.csv', header=None)
clinton_stop_words = pd.read_csv('../settings/clinton_title_stop_words.csv', header=None)

In [15]:
# Replace each loaded stop-word frame with a set built from its first column.
# Not idempotent: once the names hold sets, re-running this cell fails (a set has no .iloc)
# unless the read_csv cell is re-run first.
article_stop_words = set(article_stop_words.iloc[:, 0].tolist())
trump_stop_words = set(trump_stop_words.iloc[:, 0].tolist())
clinton_stop_words = set(clinton_stop_words.iloc[:, 0].tolist())

In [16]:
#Parse 'dateTime' into pandas datetimes; 'date_format' is the column the resample / Grouper calls below index on

df['date_format'] = pd.to_datetime(df['dateTime'])

In [17]:
#Function to create nested dictionary containing word frequencies

def word_frequency(df, origin_column, origin_list, resample_list, dateTime_column, text_column, num_words, stop_words):
    """Count the most frequent words per origin, resample frequency and period.

    For each origin in ``origin_list`` the rows of ``df`` whose
    ``origin_column`` equals that origin are resampled on ``dateTime_column``
    at every frequency in ``resample_list``. The ``text_column`` values of a
    period are joined with single spaces, split into runs of word characters,
    stripped of purely numeric tokens, lower-cased and filtered against
    ``stop_words``.

    Returns
    -------
    dict
        ``{origin: {resample: {'MM/DD/YYYY': {word: count}}}}`` with at most
        ``num_words`` words per period, most frequent first.
    """
    word_splitter = tokenize.RegexpTokenizer(r'\w+')
    result = {}
    for origin in origin_list:
        origin_rows = df.loc[df[origin_column] == origin]
        per_resample = {}
        result[origin] = per_resample
        for resample in resample_list:
            joined_text = origin_rows.set_index(dateTime_column).resample(resample).agg({text_column: ' '.join})
            per_period = {}
            per_resample[resample] = per_period
            for period, row in joined_text.iterrows():
                kept = []
                for token in word_splitter.tokenize(row[text_column]):
                    # The numeric check runs on the raw token, before lower-casing
                    if token.isnumeric():
                        continue
                    lowered = token.lower()
                    if lowered not in stop_words:
                        kept.append(lowered)
                top_words = FreqDist(kept).most_common(num_words)
                per_period[period.strftime('%m/%d/%Y')] = dict(top_words)
    return result

In [18]:
#Origin groups, resample frequencies and final stop-word sets used by word_frequency and the later summary functions

trump_origins = ['trump_source', 'trump_shares']
clinton_origins = ['clinton_source', 'clinton_shares']
article_origins = trump_origins + clinton_origins
#pandas offset aliases: 'W-SAT' = weekly bins anchored on Saturday, 'M' = month-end bins
#The aliases are also used as keys in the dictionaries built by the functions below, so they are kept verbatim
resamples = ['W-SAT', 'M']
#Extend each custom stop-word set with NLTK's English stop words
article_stop_words = article_stop_words | set(stopwords.words('english'))
trump_stop_words = trump_stop_words | set(stopwords.words('english'))
clinton_stop_words = clinton_stop_words | set(stopwords.words('english'))

In [19]:
#Build the article-body word frequency dictionaries (top 20 words per period), one call per candidate
#NOTE(review): article_stop_words is prepared in the cells above but is not passed here; the body counts
#use the per-candidate stop-word sets - confirm that this is intended

article_words = word_frequency(
    df,
    origin_column='origin',
    origin_list=trump_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    text_column='body',
    num_words=20,
    stop_words=trump_stop_words,
)
clinton_article_words = word_frequency(
    df,
    origin_column='origin',
    origin_list=clinton_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    text_column='body',
    num_words=20,
    stop_words=clinton_stop_words,
)

In [20]:
#Combine the dictionaries: merge the Clinton origins into article_words in place so it holds all four origins

article_words.update(clinton_article_words)

In [21]:
#Build the title word frequency dictionaries (top 20 words per period), one call per candidate

title_words = word_frequency(
    df,
    origin_column='origin',
    origin_list=trump_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    text_column='title',
    num_words=20,
    stop_words=trump_stop_words,
)
clinton_title_words = word_frequency(
    df,
    origin_column='origin',
    origin_list=clinton_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    text_column='title',
    num_words=20,
    stop_words=clinton_stop_words,
)

In [22]:
#Combine the dictionaries: merge the Clinton origins into title_words in place so it holds all four origins

title_words.update(clinton_title_words)

## Sentiment Summaries

In [23]:
#Function to create nested dictionary containing mean sentiment scores for each origin and time period (week/month)

def sent_summary(df, origin_column, origin_list, resample_list, dateTime_column, sentiment_column):
    """Average a sentiment column per origin, resample frequency and period.

    For each origin in ``origin_list`` the rows of ``df`` whose
    ``origin_column`` equals that origin are resampled on ``dateTime_column``
    at every frequency in ``resample_list`` and the mean of
    ``sentiment_column`` is taken for each period.

    Returns
    -------
    dict
        ``{origin: {resample: {'MM/DD/YYYY': mean_sentiment}}}``.
    """
    def _period_means(rows, resample):
        # One mean per resample bin, keyed by the bin label formatted as MM/DD/YYYY
        means = rows.set_index(dateTime_column).resample(resample).agg({sentiment_column: 'mean'})
        return {
            period.strftime('%m/%d/%Y'): row[sentiment_column]
            for period, row in means.iterrows()
        }

    summary = {}
    for origin in origin_list:
        origin_rows = df.loc[df[origin_column] == origin]
        summary[origin] = {
            resample: _period_means(origin_rows, resample)
            for resample in resample_list
        }
    return summary

In [24]:
#Mean article sentiment for every origin, per week and per month

article_sentiment = sent_summary(
    df,
    origin_column='origin',
    origin_list=article_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    sentiment_column='articleSentiment',
)

In [25]:
#Mean title sentiment for every origin, per week and per month

title_sentiment = sent_summary(
    df,
    origin_column='origin',
    origin_list=article_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    sentiment_column='titleSentiment',
)

## Source Counts

In [26]:
#Function to create nested dictionary containing counts for the 20 news sources that produced the most articles in each group 

def source_counts(df, origin_column, origin_list, resample_list, dateTime_column, source_column, count_column):
    """Count rows per news source for each origin, resample frequency and period.

    For each origin in ``origin_list`` the rows of ``df`` whose
    ``origin_column`` equals that origin are binned on ``dateTime_column`` at
    every frequency in ``resample_list``. Within a bin, the non-null values of
    ``count_column`` are counted per ``source_column`` value and the 20
    sources with the highest counts are kept.

    Returns
    -------
    dict
        ``{origin: {resample: {'MM/DD/YYYY': {source: count}}}}``.
    """
    source_dict = {}
    for origin in origin_list:
        source_dict[origin] = {}
        origin_rows = df.loc[df[origin_column] == origin]
        for resample in resample_list:
            source_dict[origin][resample] = {}
            grouped = (
                origin_rows[[dateTime_column, source_column, count_column]]
                .set_index(dateTime_column)
                .groupby(pd.Grouper(freq=resample))
            )
            # Each bin becomes a {column: {source: count}} dict.
            # NOTE(review): .apply(list) is kept from the original; it appears to leave
            # the frame's contents unchanged - confirm before removing.
            per_period = grouped.apply(
                lambda x: x.groupby(source_column)
                .count()
                .sort_values([count_column], ascending=False)
                .head(20)
                .apply(list)
                .to_dict()
            )
            # Series.iteritems() was removed in pandas 2.0; items() works on old and new versions.
            for index, item in per_period.items():
                period_counts = {}
                source_dict[origin][resample][index.strftime('%m/%d/%Y')] = period_counts
                # Flatten the per-column dicts into a single {source: count} mapping
                for per_source in item.values():
                    period_counts.update(per_source)
    return source_dict

In [27]:
#Article counts per news source (counted on the 'title' column) for every origin, per week and per month

sources = source_counts(
    df,
    origin_column='origin',
    origin_list=article_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    source_column='news_source',
    count_column='title',
)

## Mean Share Counts

In [28]:
#Function to create nested dictionary containing mean shares for the 20 news sources that produced the most mean shares in each group 

def mean_share_counts(df, origin_column, origin_list, resample_list, dateTime_column, source_column, count_column):
    """Average a share column per news source for each origin, frequency and period.

    For each origin in ``origin_list`` the rows of ``df`` whose
    ``origin_column`` equals that origin are binned on ``dateTime_column`` at
    every frequency in ``resample_list``. Within a bin, the mean of
    ``count_column`` is taken per ``source_column`` value and the 20 sources
    with the highest means are kept.

    Returns
    -------
    dict
        ``{origin: {resample: {'MM/DD/YYYY': {source: mean}}}}``.
    """
    shares_dict = {}
    for origin in origin_list:
        shares_dict[origin] = {}
        origin_rows = df.loc[df[origin_column] == origin]
        for resample in resample_list:
            shares_dict[origin][resample] = {}
            grouped = (
                origin_rows[[dateTime_column, source_column, count_column]]
                .set_index(dateTime_column)
                .groupby(pd.Grouper(freq=resample))
            )
            # Each bin becomes a {column: {source: mean}} dict.
            # NOTE(review): .apply(list) is kept from the original; it appears to leave
            # the frame's contents unchanged - confirm before removing.
            per_period = grouped.apply(
                lambda x: x.groupby(source_column)
                .mean()
                .sort_values([count_column], ascending=False)
                .head(20)
                .apply(list)
                .to_dict()
            )
            # Series.iteritems() was removed in pandas 2.0; items() works on old and new versions.
            for index, item in per_period.items():
                period_means = {}
                shares_dict[origin][resample][index.strftime('%m/%d/%Y')] = period_means
                # Flatten the per-column dicts into a single {source: mean} mapping
                for per_source in item.values():
                    period_means.update(per_source)
    return shares_dict

In [29]:
#Create mean share count nested dictionary (mean Facebook shares per news source)

mean_shares = mean_share_counts(
    df,
    origin_column='origin',
    origin_list=article_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    source_column='news_source',
    count_column='facebook_shares',
)

## Total Share Counts

In [30]:
#Function to create nested dictionary containing total shares for the 20 news sources that produced the most total shares in each group 

def total_share_counts(df, origin_column, origin_list, resample_list, dateTime_column, source_column, count_column):
    """Sum a share column per news source for each origin, frequency and period.

    For each origin in ``origin_list`` the rows of ``df`` whose
    ``origin_column`` equals that origin are binned on ``dateTime_column`` at
    every frequency in ``resample_list``. Within a bin, ``count_column`` is
    summed per ``source_column`` value and the 20 sources with the highest
    totals are kept.

    Returns
    -------
    dict
        ``{origin: {resample: {'MM/DD/YYYY': {source: total}}}}``.
    """
    shares_dict = {}
    for origin in origin_list:
        shares_dict[origin] = {}
        origin_rows = df.loc[df[origin_column] == origin]
        for resample in resample_list:
            shares_dict[origin][resample] = {}
            grouped = (
                origin_rows[[dateTime_column, source_column, count_column]]
                .set_index(dateTime_column)
                .groupby(pd.Grouper(freq=resample))
            )
            # Each bin becomes a {column: {source: total}} dict.
            # NOTE(review): .apply(list) is kept from the original; it appears to leave
            # the frame's contents unchanged - confirm before removing.
            per_period = grouped.apply(
                lambda x: x.groupby(source_column)
                .sum()
                .sort_values([count_column], ascending=False)
                .head(20)
                .apply(list)
                .to_dict()
            )
            # Series.iteritems() was removed in pandas 2.0; items() works on old and new versions.
            for index, item in per_period.items():
                period_totals = {}
                shares_dict[origin][resample][index.strftime('%m/%d/%Y')] = period_totals
                # Flatten the per-column dicts into a single {source: total} mapping
                for per_source in item.values():
                    period_totals.update(per_source)
    return shares_dict

In [31]:
#Create total share count nested dictionary (summed Facebook shares per news source)

total_shares = total_share_counts(
    df,
    origin_column='origin',
    origin_list=article_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    source_column='news_source',
    count_column='facebook_shares',
)

## Share Histograms

In [32]:
from datetime import datetime, timedelta

In [33]:
# Date-only version of 'date_format' (time of day dropped); used below as the index of df_histogram
df['date_format_date'] = df['date_format'].dt.date

In [34]:
# Independent copy of the columns exported for the histogram, indexed by calendar date
df_histogram = (
    df[[
        'date_format_date',
        'origin',
        'title',
        'news_source',
        'facebook_shares',
        'articleSentiment',
        'titleSentiment',
    ]]
    .copy()
    .set_index('date_format_date')
)

In [35]:
# Add a 0-based running row number as the first column.
# Not idempotent: re-running this cell raises because 'row_id' already exists.
df_histogram.insert(loc=0, column='row_id', value=range(len(df_histogram)))

## Export Data for Use in App

In [36]:
# Write the article word-count dictionary with the newest pickle protocol
with open('../data/article_wordcount_dict.pkl', 'wb') as out_file:
    pickle.dump(article_words, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [37]:
# Write the title word-count dictionary with the newest pickle protocol
with open('../data/title_wordcount_dict.pkl', 'wb') as out_file:
    pickle.dump(title_words, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [38]:
# Write the mean article sentiment dictionary with the newest pickle protocol
with open('../data/article_sentiment_dict.pkl', 'wb') as out_file:
    pickle.dump(article_sentiment, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [39]:
# Write the mean title sentiment dictionary with the newest pickle protocol
with open('../data/title_sentiment_dict.pkl', 'wb') as out_file:
    pickle.dump(title_sentiment, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [40]:
# Write the per-source article count dictionary with the newest pickle protocol
with open('../data/source_count_dict.pkl', 'wb') as out_file:
    pickle.dump(sources, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
# Write the per-source mean share dictionary with the newest pickle protocol
with open('../data/mean_share_count_dict.pkl', 'wb') as out_file:
    pickle.dump(mean_shares, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [42]:
# Write the per-source total share dictionary with the newest pickle protocol
with open('../data/total_share_count_dict.pkl', 'wb') as out_file:
    pickle.dump(total_shares, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
# Write the histogram dataframe with the newest pickle protocol
with open('../data/df_histogram.pkl', 'wb') as out_file:
    pickle.dump(df_histogram, out_file, protocol=pickle.HIGHEST_PROTOCOL)