In [4]:
import numpy as np
import pandas as pd
import codecs
import pickle
import re
from functools import reduce
import plotly.plotly as py
import plotly.graph_objs as go
from nltk import tokenize
from nltk.corpus import stopwords
from nltk import FreqDist

In [5]:
# Load the pickled object that the rest of the notebook uses as a pandas DataFrame.
# NOTE(review): pickle.load can run arbitrary code embedded in the file, so this
# path must only ever point at a file produced by a trusted pipeline.
with open('../data/sentiment_df.pkl', 'rb') as handle:
    df = pickle.load(handle)

## Unpacking Nested Features

In [6]:
#Function to unpack nested features from columns containing dictionaries

def unpack_column(df, input_column, key, output_column):
    """Extract one key from a column of dict-like cells into a new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is updated in place and also returned.
    input_column : str
        Column whose cells are expected to expose ``.get(key)``.
    key : hashable
        Key looked up in every cell.
    output_column : str
        Name of the column that receives the extracted values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``output_column`` added or overwritten.

    Notes
    -----
    A cell that cannot be queried (``None``, ``NaN``, a plain string, ...)
    yields NaN for that row only. If ``input_column`` does not exist the
    whole output column is NaN.
    """
    if input_column not in df.columns:
        # Missing input column: keep the historical "all NaN" fallback.
        df[output_column] = np.nan
        return df

    def _lookup(cell):
        # Handle failures per row so a single malformed cell does not blank
        # the entire column.
        try:
            return cell.get(key)
        except (AttributeError, TypeError):
            return np.nan

    df[output_column] = df[input_column].apply(_lookup)
    return df

In [7]:
# Flatten the nested 'uri' and 'facebook' entries into their own columns.
df = unpack_column(df, input_column='source', key='uri', output_column='news_source')
df = unpack_column(df, input_column='shares', key='facebook', output_column='facebook_shares')

## Word Counts

In [8]:
#Load custom stop words to be used along with NLTK stop words
# header=None makes pandas treat the first line of each CSV as data, not as a header row.

article_stop_words = pd.read_csv('../settings/stop_words.csv', header=None)
trump_stop_words = pd.read_csv('../settings/trump_title_stop_words.csv', header=None)
clinton_stop_words = pd.read_csv('../settings/clinton_title_stop_words.csv', header=None)

In [9]:
# Collapse the first column of each stop-word frame into a set.
# NOTE: each name is rebound from a DataFrame to a set, so this cell cannot be
# re-run on its own (a set has no .iloc); re-run the loading cell first.
article_stop_words = set(article_stop_words.iloc[:,0])
trump_stop_words = set(trump_stop_words.iloc[:,0])
clinton_stop_words = set(clinton_stop_words.iloc[:,0])

In [10]:
#Parse the raw 'dateTime' values into a datetime column ('date_format');
#it is the time index used for resample / pd.Grouper in the functions below

df['date_format'] = pd.to_datetime(df['dateTime'])

In [11]:
#Function to create nested dictionary containing word frequencies

def word_frequency(df, origin_column, origin_list, resample_list, dateTime_column, text_column, num_words, stop_words):
    """Count the most common words per origin and time bucket.

    For every origin in ``origin_list`` the matching rows are resampled on
    ``dateTime_column`` at each frequency in ``resample_list``. The
    ``text_column`` strings of a bucket are joined with spaces, split into
    word tokens, stripped of purely numeric tokens, lower-cased and filtered
    against ``stop_words``.

    Returns a nested dict of the form
    ``{origin: {resample: {'MM/DD/YYYY': {word: count}}}}`` that keeps the
    ``num_words`` most frequent words of each bucket.
    """
    word_splitter = tokenize.RegexpTokenizer(r'\w+')
    result = {}
    for origin in origin_list:
        origin_rows = df.loc[df[origin_column] == origin]
        per_frequency = {}
        result[origin] = per_frequency
        for resample in resample_list:
            joined_text = origin_rows.set_index(dateTime_column).resample(resample).agg({text_column: ' '.join})
            per_bucket = {}
            per_frequency[resample] = per_bucket
            for bucket_label, record in joined_text.iterrows():
                # Drop numeric tokens first, then lower-case, then remove stop words.
                lowered = (
                    token.lower()
                    for token in word_splitter.tokenize(record[text_column])
                    if not token.isnumeric()
                )
                kept = [token for token in lowered if token not in stop_words]
                top_words = FreqDist(kept).most_common(num_words)
                per_bucket[bucket_label.strftime('%m/%d/%Y')] = dict(top_words)
    return result

In [12]:
#Create necessary lists and sets to be used in the word_frequency (and later) function(s)

# Origin labels that the functions below match against the 'origin' column.
article_origins = ['trump_source', 'trump_shares', 'clinton_source', 'clinton_shares']
trump_origins = ['trump_source', 'trump_shares']
clinton_origins = ['clinton_source', 'clinton_shares']
# pandas frequency strings passed to resample / pd.Grouper; they also become
# keys of every result dictionary built below.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME' - confirm against the installed version, and remember that renaming it
# also renames the dictionary key.
resamples = ['W-SAT', 'M']
# Extend each custom stop-word set with NLTK's English stop words.
article_stop_words = article_stop_words.union(set(stopwords.words('english')))
trump_stop_words = trump_stop_words.union(set(stopwords.words('english')))
clinton_stop_words = clinton_stop_words.union(set(stopwords.words('english')))

In [13]:
#Create article word frequency nested dictionary (top 20 words of the 'body' text)
# NOTE(review): both calls filter article bodies with the sets loaded from the
# *title* stop-word CSVs, and article_stop_words is not passed to any call in
# the visible cells - confirm that this is intended.

article_words = word_frequency(
    df,
    origin_column='origin',
    origin_list=trump_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    text_column='body',
    num_words=20,
    stop_words=trump_stop_words,
)
clinton_article_words = word_frequency(
    df,
    origin_column='origin',
    origin_list=clinton_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    text_column='body',
    num_words=20,
    stop_words=clinton_stop_words,
)

In [14]:
#Combine the dictionaries
# dict.update mutates article_words in place. The Trump and Clinton origin
# lists share no labels, so no existing key is overwritten.

article_words.update(clinton_article_words)

In [15]:
#Create title word frequency nested dictionary (top 20 words of the 'title' text)

title_words = word_frequency(
    df,
    origin_column='origin',
    origin_list=trump_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    text_column='title',
    num_words=20,
    stop_words=trump_stop_words,
)
clinton_title_words = word_frequency(
    df,
    origin_column='origin',
    origin_list=clinton_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    text_column='title',
    num_words=20,
    stop_words=clinton_stop_words,
)

In [16]:
#Combine the dictionaries
# dict.update mutates title_words in place. The Trump and Clinton origin
# lists share no labels, so no existing key is overwritten.

title_words.update(clinton_title_words)

## Sentiment Summaries

In [17]:
#Function to create nested dictionary containing mean sentiment scores for each origin and time period (week/month)

def sent_summary(df, origin_column, origin_list, resample_list, dateTime_column, sentiment_column):
    """Average a sentiment column per origin and time bucket.

    Rows are filtered to each origin in ``origin_list``, indexed by
    ``dateTime_column`` and resampled at every frequency in
    ``resample_list``; the mean of ``sentiment_column`` is taken per bucket.

    Returns a nested dict of the form
    ``{origin: {resample: {'MM/DD/YYYY': mean_sentiment}}}``.
    """
    summary = {}
    for origin in origin_list:
        origin_rows = df.loc[df[origin_column] == origin]
        summary[origin] = {}
        for resample in resample_list:
            averaged = (
                origin_rows
                .set_index(dateTime_column)
                .resample(resample)
                .agg({sentiment_column: 'mean'})
            )
            summary[origin][resample] = {
                bucket_label.strftime('%m/%d/%Y'): record[sentiment_column]
                for bucket_label, record in averaged.iterrows()
            }
    return summary

In [18]:
#Create article mean sentiment nested dictionary

article_sentiment = sent_summary(
    df,
    origin_column='origin',
    origin_list=article_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    sentiment_column='articleSentiment',
)

In [19]:
#Create title mean sentiment nested dictionary

title_sentiment = sent_summary(
    df,
    origin_column='origin',
    origin_list=article_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    sentiment_column='titleSentiment',
)

## Source Counts

In [20]:
#Function to create nested dictionary containing counts for the 20 news sources that produced the most articles in each group 

def source_counts(df, origin_column, origin_list, resample_list, dateTime_column, source_column, count_column):
    """Count rows per news source for every origin and time bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows.
    origin_column : str
        Column compared against each entry of ``origin_list``.
    origin_list : iterable
        Origin labels to report on.
    resample_list : iterable of str
        pandas frequency strings handed to ``pd.Grouper(freq=...)``.
    dateTime_column : str
        Datetime column used as the time index.
    source_column : str
        Column identifying the news source.
    count_column : str
        Column whose non-null values are counted per source.

    Returns
    -------
    dict
        ``{origin: {resample: {'MM/DD/YYYY': {source: count}}}}`` holding, per
        bucket, the 20 sources with the highest count in descending order.
        A bucket without rows maps to an empty dict.

    Notes
    -----
    Buckets are iterated directly instead of going through
    ``Series.iteritems()``, which no longer exists in pandas 2.0+.
    """
    source_dict = {}
    for origin in origin_list:
        origin_rows = df.loc[df[origin_column] == origin]
        source_dict[origin] = {}
        for resample in resample_list:
            indexed = origin_rows[[dateTime_column, source_column, count_column]].set_index(dateTime_column)
            per_bucket = {}
            for bucket_label, bucket_rows in indexed.groupby(pd.Grouper(freq=resample)):
                ranked = (
                    bucket_rows.groupby(source_column)[count_column]
                    .count()
                    .sort_values(ascending=False)
                    .head(20)
                )
                per_bucket[bucket_label.strftime('%m/%d/%Y')] = ranked.to_dict()
            source_dict[origin][resample] = per_bucket
    return source_dict

In [21]:
#Create source count nested dictionary (articles per news source, counted via 'title')

sources = source_counts(
    df,
    origin_column='origin',
    origin_list=article_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    source_column='news_source',
    count_column='title',
)

## Mean Share Counts

In [22]:
#Function to create nested dictionary containing mean shares for the 20 news sources that produced the most mean shares in each group 

def mean_share_counts(df, origin_column, origin_list, resample_list, dateTime_column, source_column, count_column):
    """Average a numeric column per news source for every origin and time bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows.
    origin_column : str
        Column compared against each entry of ``origin_list``.
    origin_list : iterable
        Origin labels to report on.
    resample_list : iterable of str
        pandas frequency strings handed to ``pd.Grouper(freq=...)``.
    dateTime_column : str
        Datetime column used as the time index.
    source_column : str
        Column identifying the news source.
    count_column : str
        Numeric column that is averaged per source.

    Returns
    -------
    dict
        ``{origin: {resample: {'MM/DD/YYYY': {source: mean}}}}`` holding, per
        bucket, the 20 sources with the highest mean in descending order.
        A bucket without rows maps to an empty dict.

    Notes
    -----
    Buckets are iterated directly instead of going through
    ``Series.iteritems()``, which no longer exists in pandas 2.0+.
    """
    shares_dict = {}
    for origin in origin_list:
        origin_rows = df.loc[df[origin_column] == origin]
        shares_dict[origin] = {}
        for resample in resample_list:
            indexed = origin_rows[[dateTime_column, source_column, count_column]].set_index(dateTime_column)
            per_bucket = {}
            for bucket_label, bucket_rows in indexed.groupby(pd.Grouper(freq=resample)):
                ranked = (
                    bucket_rows.groupby(source_column)[count_column]
                    .mean()
                    .sort_values(ascending=False)
                    .head(20)
                )
                per_bucket[bucket_label.strftime('%m/%d/%Y')] = ranked.to_dict()
            shares_dict[origin][resample] = per_bucket
    return shares_dict

In [23]:
#Create mean share count nested dictionary (mean Facebook shares per news source)

mean_shares = mean_share_counts(
    df,
    origin_column='origin',
    origin_list=article_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    source_column='news_source',
    count_column='facebook_shares',
)

## Total Share Counts

In [24]:
#Function to create nested dictionary containing total shares for the 20 news sources that produced the most total shares in each group 

def total_share_counts(df, origin_column, origin_list, resample_list, dateTime_column, source_column, count_column):
    """Sum a numeric column per news source for every origin and time bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows.
    origin_column : str
        Column compared against each entry of ``origin_list``.
    origin_list : iterable
        Origin labels to report on.
    resample_list : iterable of str
        pandas frequency strings handed to ``pd.Grouper(freq=...)``.
    dateTime_column : str
        Datetime column used as the time index.
    source_column : str
        Column identifying the news source.
    count_column : str
        Numeric column that is summed per source.

    Returns
    -------
    dict
        ``{origin: {resample: {'MM/DD/YYYY': {source: total}}}}`` holding, per
        bucket, the 20 sources with the highest total in descending order.
        A bucket without rows maps to an empty dict.

    Notes
    -----
    Buckets are iterated directly instead of going through
    ``Series.iteritems()``, which no longer exists in pandas 2.0+.
    """
    shares_dict = {}
    for origin in origin_list:
        origin_rows = df.loc[df[origin_column] == origin]
        shares_dict[origin] = {}
        for resample in resample_list:
            indexed = origin_rows[[dateTime_column, source_column, count_column]].set_index(dateTime_column)
            per_bucket = {}
            for bucket_label, bucket_rows in indexed.groupby(pd.Grouper(freq=resample)):
                ranked = (
                    bucket_rows.groupby(source_column)[count_column]
                    .sum()
                    .sort_values(ascending=False)
                    .head(20)
                )
                per_bucket[bucket_label.strftime('%m/%d/%Y')] = ranked.to_dict()
            shares_dict[origin][resample] = per_bucket
    return shares_dict

In [25]:
#Create total share count nested dictionary (summed Facebook shares per news source)

total_shares = total_share_counts(
    df,
    origin_column='origin',
    origin_list=article_origins,
    resample_list=resamples,
    dateTime_column='date_format',
    source_column='news_source',
    count_column='facebook_shares',
)

## Share Histograms

In [26]:
from datetime import datetime, timedelta

In [27]:
# Keep only the calendar date (no time of day) of each parsed timestamp.
df['date_format_date'] = df['date_format'].dt.date

In [28]:
# Independent copy of the selected columns, indexed by calendar date,
# so later changes to df_histogram do not touch df.
df_histogram = df[['date_format_date', 'origin', 'title', 'news_source', 'facebook_shares', 'articleSentiment', 'titleSentiment']].copy().set_index('date_format_date')

In [29]:
# Add a sequential 0..n-1 row_id as the first column.
# NOTE: DataFrame.insert raises if 'row_id' already exists, so re-running this
# cell requires re-creating df_histogram first.
df_histogram.insert(0, 'row_id', range(0, len(df_histogram)))

## Export Data for Use in App

In [30]:
# Write the article word-count dictionary to disk using the highest pickle protocol.
with open('../data/article_wordcount_dict.pkl', 'wb') as f:
    pickle.dump(article_words, f, pickle.HIGHEST_PROTOCOL)   

In [31]:
# Write the title word-count dictionary to disk using the highest pickle protocol.
with open('../data/title_wordcount_dict.pkl', 'wb') as f:
    pickle.dump(title_words, f, pickle.HIGHEST_PROTOCOL)   

In [32]:
# Write the mean article-sentiment dictionary to disk using the highest pickle protocol.
with open('../data/article_sentiment_dict.pkl', 'wb') as f:
    pickle.dump(article_sentiment, f, pickle.HIGHEST_PROTOCOL)   

In [33]:
# Write the mean title-sentiment dictionary to disk using the highest pickle protocol.
with open('../data/title_sentiment_dict.pkl', 'wb') as f:
    pickle.dump(title_sentiment, f, pickle.HIGHEST_PROTOCOL)   

In [34]:
# Write the per-source article-count dictionary to disk using the highest pickle protocol.
with open('../data/source_count_dict.pkl', 'wb') as f:
    pickle.dump(sources, f, pickle.HIGHEST_PROTOCOL)   

In [35]:
# Write the per-source mean-share dictionary to disk using the highest pickle protocol.
with open('../data/mean_share_count_dict.pkl', 'wb') as f:
    pickle.dump(mean_shares, f, pickle.HIGHEST_PROTOCOL)

In [36]:
# Write the per-source total-share dictionary to disk using the highest pickle protocol.
with open('../data/total_share_count_dict.pkl', 'wb') as f:
    pickle.dump(total_shares, f, pickle.HIGHEST_PROTOCOL)

In [37]:
# Write the date-indexed histogram frame to disk using the highest pickle protocol.
with open('../data/df_histogram.pkl', 'wb') as f:
    pickle.dump(df_histogram, f, pickle.HIGHEST_PROTOCOL)

In [38]:
# Display the full nested mean-share dictionary as a visual check (the output is long).
mean_shares

{'trump_source': {'W-SAT': {'01/09/2016': {'usnews.com': 836.8181818181819,
    'thehill.com': 752.2380952380952,
    'washingtonpost.com': 542.2432432432432,
    'nbcnews.com': 431.3529411764706,
    'nytimes.com': 419.62068965517244,
    'usatoday.com': 341.6190476190476,
    'huffingtonpost.com': 254.15384615384616,
    'time.com': 210.9090909090909,
    'bloomberg.com': 206.5,
    'foxnews.com': 159.4,
    'thedailybeast.com': 101.0,
    'cbsnews.com': 85.0,
    'wsj.com': 76.75,
    'nypost.com': 72.33333333333333,
    'news.yahoo.com': 65.16666666666667,
    'cnbc.com': 13.0},
   '01/16/2016': {'nytimes.com': 2251.6315789473683,
    'usatoday.com': 414.8333333333333,
    'thehill.com': 409.5925925925926,
    'bloomberg.com': 272.25,
    'foxnews.com': 239.57142857142858,
    'nypost.com': 151.33333333333334,
    'washingtonpost.com': 133.08928571428572,
    'time.com': 132.57142857142858,
    'thedailybeast.com': 125.0,
    'huffingtonpost.com': 108.04347826086956,
    'nbcnews.c