In [1]:
import numpy as np
import pandas as pd
import codecs
import pickle
import re
from functools import reduce
import plotly.plotly as py
import plotly.graph_objs as go
from nltk import tokenize
from nltk.corpus import stopwords
from nltk import FreqDist

In [3]:
with open('../data/sentiment_df.pkl', 'rb') as handle:
    df = pickle.load(handle)

## Unpacking Nested Features

In [4]:
#Function to unpack nested features from columns containing dictionaries

def unpack_column(df, input_column, key, output_column):
    try:
        df[output_column] = df[input_column].apply(lambda x: x.get(key))
    except:
        df[output_column] = np.NaN
    return df

In [5]:
df = unpack_column(df, 'source', 'uri', 'news_source')
df = unpack_column(df, 'shares', 'facebook', 'facebook_shares')

## Word Counts

In [6]:
#Load custom stop words to be used along with NLTK stop words

article_stop_words = pd.read_csv('../settings/stop_words.csv', header=None)
trump_stop_words = pd.read_csv('../settings/trump_title_stop_words.csv', header=None)
clinton_stop_words = pd.read_csv('../settings/clinton_title_stop_words.csv', header=None)

In [8]:
article_stop_words = set(article_stop_words.iloc[:,0])
trump_stop_words = set(trump_stop_words.iloc[:,0])
clinton_stop_words = set(clinton_stop_words.iloc[:,0])

In [9]:
#Create dateTime column to be used with Grouper

df['date_format'] = pd.to_datetime(df['dateTime'])

In [10]:
#Function to create nested dictionary containing word frequencies

def word_frequency(df, origin_column, origin_list, resample_list, dateTime_column, text_column, num_words, stop_words):
    freq_dict = {}
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    for origin in origin_list:
        freq_dict[origin] = {}
        temp_df = df.loc[df[origin_column] == origin]
        for resample in resample_list:
            freq_dict[origin][resample]={}
            temp_group = temp_df.set_index(dateTime_column).resample(resample).agg({text_column: ' '.join})
            for index, row in temp_group.iterrows():
                freq_dict[origin][resample][index.strftime('%m/%d/%Y')] = {}
                words = tokenizer.tokenize(row[text_column])
                words = [word for word in words if not word.isnumeric()]
                words = [word.lower() for word in words]
                words = [word for word in words if word not in stop_words]
                fdist = FreqDist(words)
                for word, frequency in fdist.most_common(num_words):
                    freq_dict[origin][resample][index.strftime('%m/%d/%Y')][word] = frequency
    return freq_dict

In [11]:
#Create necessary lists and sets to be used in the word_frequency (and later) function(s)

article_origins = ['trump_source', 'trump_shares', 'clinton_source', 'clinton_shares']
trump_origins = ['trump_source', 'trump_shares']
clinton_origins = ['clinton_source', 'clinton_shares']
resamples = ['W-SAT', 'M']
article_stop_words = article_stop_words.union(set(stopwords.words('english')))
trump_stop_words = trump_stop_words.union(set(stopwords.words('english')))
clinton_stop_words = clinton_stop_words.union(set(stopwords.words('english')))

In [62]:
#Create article word frequency nested dictionary

article_words = word_frequency(df, 'origin', trump_origins, resamples, 'date_format', 'body', 20, trump_stop_words)
clinton_article_words = word_frequency(df, 'origin', clinton_origins, resamples, 'date_format', 'body', 20, clinton_stop_words)

In [63]:
#Combine the dictionaries

article_words.update(clinton_article_words)

In [13]:
#Create title word frequency nested dictionary

title_words = word_frequency(df, 'origin', trump_origins, resamples, 'date_format', 'title', 20, trump_stop_words)
clinton_title_words = word_frequency(df, 'origin', clinton_origins, resamples, 'date_format', 'title', 20, clinton_stop_words)

In [14]:
#Combine the dictionaries

title_words.update(clinton_title_words)

## Sentiment Summaries

In [15]:
#Function to create nested dictionary containing mean sentiment scores for each origin and time period (week/month)

def sent_summary(df, origin_column, origin_list, resample_list, dateTime_column, sentiment_column):
    sentiment_dict = {}
    for origin in origin_list:
        sentiment_dict[origin] = {}
        temp_df = df.loc[df[origin_column] == origin]
        for resample in resample_list:
            sentiment_dict[origin][resample]={}
            temp_group = temp_df.set_index(dateTime_column).resample(resample).agg({sentiment_column: 'mean'})
            for index, row in temp_group.iterrows():
                sentiment_dict[origin][resample][index.strftime('%m/%d/%Y')] = row[sentiment_column]
    return sentiment_dict

In [17]:
#Create article mean sentiment nested dictionary

article_sentiment = sent_summary(df, 'origin', article_origins, resamples, 'date_format', 'articleSentiment')

In [18]:
#Create title mean sentiment nested dictionary

title_sentiment = sent_summary(df, 'origin', article_origins, resamples, 'date_format', 'titleSentiment')

## Source Counts

In [19]:
#Function to create nested dictionary containing counts for the 20 news sources that produced the most articles in each group 

def source_counts(df, origin_column, origin_list, resample_list, dateTime_column, source_column, count_column):
    source_dict = {}
    for origin in origin_list:
        source_dict[origin] = {}
        temp_df = df.loc[df[origin_column] == origin]
        for resample in resample_list:
            source_dict[origin][resample]={}
            temp_group = temp_df[[dateTime_column, source_column, count_column]].set_index(dateTime_column).groupby(pd.Grouper(freq = resample))
            temp_series = temp_group.apply(lambda x: x.groupby(source_column).count().sort_values([count_column], ascending=False).head(20).apply(list).to_dict())
            for index, item in temp_series.iteritems():
                source_dict[origin][resample][index.strftime('%m/%d/%Y')]={}
                for k1, v1 in item.items():
                    for k2, v2 in v1.items():
                        source_dict[origin][resample][index.strftime('%m/%d/%Y')][k2] = v2
    return source_dict

In [20]:
#Create source count nested dictionary

sources = source_counts(df, 'origin', article_origins, resamples, 'date_format', 'news_source', 'title')

## Share Counts

In [57]:
def share_counts(df, origin_column, origin_list, resample_list, dateTime_column, source_column, count_column):
    shares_dict = {}
    for origin in origin_list:
        shares_dict[origin] = {}
        temp_df = df.loc[df[origin_column] == origin]
        for resample in resample_list:
            shares_dict[origin][resample]={}
            temp_group = temp_df[[dateTime_column, source_column, count_column]].set_index(dateTime_column).groupby(pd.Grouper(freq = resample))
            temp_group = temp_group.apply(lambda x: x.groupby(source_column).sum())
            temp_series = temp_group.sort_values([count_column], ascending=False).head(20).apply(list).to_dict()
            for index, item in temp_series.iteritems():
                shares_dict[origin][resample][index.strftime('%m/%d/%Y')]={}
                for k1, v1 in item.items():
                    for k2, v2 in v1.items():
                        shares_dict[origin][resample][index.strftime('%m/%d/%Y')][k2] = v2
    return shares_dict

In [58]:
shares = share_counts(df, 'origin', article_origins, resamples, 'date_format', 'news_source', 'facebook_shares')

ValueError: no results

## Export Data for Use in App

In [64]:
with open('../data/article_wordcount_dict.pkl', 'wb') as f:
        pickle.dump(article_words, f, pickle.HIGHEST_PROTOCOL)   

In [24]:
with open('../data/title_wordcount_dict.pkl', 'wb') as f:
        pickle.dump(title_words, f, pickle.HIGHEST_PROTOCOL)   

In [25]:
with open('../data/article_sentiment_dict.pkl', 'wb') as f:
        pickle.dump(article_sentiment, f, pickle.HIGHEST_PROTOCOL)   

In [32]:
with open('../data/title_sentiment_dict.pkl', 'wb') as f:
        pickle.dump(title_sentiment, f, pickle.HIGHEST_PROTOCOL)   

In [27]:
with open('../data/source_count_dict.pkl', 'wb') as f:
        pickle.dump(sources, f, pickle.HIGHEST_PROTOCOL)   

### Testing - compare to original data

In [None]:
df_test = pd.DataFrame()

In [None]:
with open('../data/Trump_2016-01-03_page_3_by_Source.pkl', 'rb') as handle:
    temp_data = pickle.load(handle)
temp_df = pd.DataFrame.from_dict(reduce(dict.get, ['articles', 'results'], temp_data))
df_test = pd.concat([df_test, temp_df])

In [None]:
len(df_test)

In [None]:
df_test = unpack_column(df_test, 'source', 'uri', 'news_source')

In [None]:
pd.Series(df_test.news_source.ravel()).dropna().value_counts()

## Plotly Plot Testing

In [None]:
clinton_source['date_format'] = pd.to_datetime(clinton_source['dateTime'])
clinton_shares['date_format'] = pd.to_datetime(clinton_shares['dateTime'])
trump_source['date_format'] = pd.to_datetime(trump_source['dateTime'])
trump_shares['date_format'] = pd.to_datetime(trump_shares['dateTime'])

In [None]:
df_grouped = df.set_index('date_format').resample('W')['articleSentiment'].mean()#.set_index('date')

In [None]:
clinton_source_grouped.values

In [None]:
trace0 = go.Scatter(
            x = list(sentiment_summaries['clinton_source']['W'].keys()),
            y = list(sentiment_summaries['clinton_source']['W'].values()),
            mode = 'lines',
            name = 'Clinton Source')
trace1 = go.Scatter(
            x = list(sentiment_summaries['clinton_shares']['W'].keys()),
            y = list(sentiment_summaries['clinton_shares']['W'].values()),
            mode = 'lines',
            name = 'Clinton Shares')
trace2 = go.Scatter(
            x = list(sentiment_summaries['trump_source']['W'].keys()),
            y = list(sentiment_summaries['trump_source']['W'].values()),
            mode = 'lines',
            name = 'Trump Source')
trace3 = go.Scatter(
            x = trump_shares_grouped.index,
            y = trump_shares_grouped.values,
            mode = 'lines',
            name = 'Trump Shares')
data = [trace0, trace1, trace2, trace3]
py.iplot(data, filename = 'lines')

In [31]:
trump_clinton = ['clinton_source', 'clinton_shares', 'trump_source', 'trump_shares']
week_month = 'M'
traces = []
for i in trump_clinton:
    traces.append(go.Scatter(
        x = list(title_sentiment[i][week_month].keys()),
        y = list(title_sentiment[i][week_month].values()),
        mode = 'lines',
        name = i)
    )
data = traces
py.iplot(data, filename = 'lines')

In [None]:
trump_shares_grouped = df.loc[df['origin'] == 'trump_shares'].set_index('date_format').resample('W').agg({'articleSentiment': 'mean'})
trump_source_grouped = df.loc[df['origin'] == 'trump_source'].set_index('date_format').resample('W').agg({'articleSentiment': 'mean'})
clinton_shares_grouped = df.loc[df['origin'] == 'clinton_shares'].set_index('date_format').resample('W').agg({'articleSentiment': 'mean'})
clinton_source_grouped = df.loc[df['origin'] == 'clinton_source'].set_index('date_format').resample('W').agg({'articleSentiment': 'mean'})

In [None]:
trace0 = go.Scatter(
            x = clinton_source_grouped.index,
            y = clinton_source_grouped.values,
            mode = 'lines',
            name = 'Clinton Source')
trace1 = go.Scatter(
            x = clinton_shares_grouped.index,
            y = clinton_shares_grouped.values,
            mode = 'lines',
            name = 'Clinton Shares')
trace2 = go.Scatter(
            x = trump_source_grouped.index,
            y = trump_source_grouped.values,
            mode = 'lines',
            name = 'Trump Source')
trace3 = go.Scatter(
            x = trump_shares_grouped.index,
            y = trump_shares_grouped.values,
            mode = 'lines',
            name = 'Trump Shares')
data = [trace0, trace1, trace2, trace3]
py.iplot(data, filename = 'lines')

In [None]:
data = [go.Bar(
            x = list(title_words['clinton_shares']['W']['10/16/2016'].keys()),
            y = list(title_words['clinton_shares']['W']['10/16/2016'].values())
        )]
py.iplot(data, filename = 'bars')

In [None]:
data = [go.Bar(
            x = list(title_words['trump_source']['W']['10/16/2016'].keys()),
            y = list(title_words['trump_source']['W']['10/16/2016'].values())
        )]
py.iplot(data, filename = 'bars')

In [None]:
pd.Series(trump_shares.news_source.ravel()).value_counts()

In [38]:
import json


In [43]:
test_list = ['trump', 'clinton', 'trump_clinton']

In [45]:
test_dict = {}
for i, obj in enumerate(test_list):
    test_dict[i] = obj

In [52]:
test_json = json.dumps(test_list)

In [53]:
test_json[0]

'['

In [55]:
test_unload = json.loads(test_json)

In [56]:
test_unload[0]

'trump'