In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from cleanMDA import extractTable, divide_chunks, pullMDA, getXy
from text_analysis import text_analysis_wordall, text_analysis_sentenceall, get_polarity, low_subjectivity, model_analysis, text_analysis_score
from textblob import TextBlob
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Sentiment Analysis by Word

The following code takes a dataframe containing the top 10 companies from each year, together with their cleaned MDA text, and runs analysis on it. The function `text_analysis_wordall` runs sentiment analysis on the individual words from each year and returns a dataframe that holds, for each year, a list of tuples pairing every word with its polarity and sentiment score, along with the percent change in GDP for that year. This dataframe is pickled and used throughout the final report.

In [2]:
# Load the pickled dataframe holding each company's name and MDA text.
# NOTE: pd.read_pickle can execute arbitrary code while loading; only read
# pickle files produced by this project.
df_sentences = pd.read_pickle('Cleaned_MDA_sentences.pkl')

In [3]:
# Build the per-year frame used for the word-level analysis.
# getXy is imported from cleanMDA; per the original note it combines the
# companies' text for each year and attaches that year's GDP value
# (its implementation is not visible here — TODO confirm).
df = getXy(df_sentences) 

In [4]:
# Run the word-level sentiment analysis (text_analysis_wordall, imported from
# text_analysis) on the per-year MDA frame.
analysis_df = text_analysis_wordall(df)
# Bare last expression: display the resulting frame (one row per year, columns X and y).
analysis_df

Unnamed: 0,X,y
2007,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",1.77857
2008,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",-0.291621
2009,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",-2.77553
2010,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",2.53192
2011,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",1.60145
2012,"[(other, -0.125, 0.375), (certain, 0.214285714...",2.22403
2013,"[(other, -0.125, 0.375), (certain, 0.214285714...",1.67733
2014,"[(other, -0.125, 0.375), (certain, 0.214285714...",2.56919
2015,"[(other, -0.125, 0.375), (certain, 0.214285714...",2.86159
2016,"[(other, -0.125, 0.375), (certain, 0.214285714...",1.48528


In [5]:
# Persist the word-level analysis; later cells reload it from 'word_analysis.pkl'.
analysis_df.to_pickle('word_analysis.pkl')

## Sentiment Analysis by Sentence

The following code takes a dataframe containing the top 10 companies from each year, together with their cleaned MDA text, and runs analysis on it. The function `text_analysis_sentenceall` runs sentiment analysis on the individual sentences from each year and returns a dataframe that holds, for each year, a list of tuples pairing every sentence with its polarity and sentiment score, along with the percent change in GDP for that year. This dataframe is pickled and used throughout the final report.

In [6]:
# Build a fresh per-year frame (text plus GDP, per the original note) for the
# sentence-level analysis.
# NOTE(review): this repeats the getXy call made for the word-level analysis;
# presumably a separate copy is wanted because the analysis functions may
# modify their input — confirm against text_analysis.
df2 = getXy(df_sentences)

In [7]:
# Run the sentence-level sentiment analysis (text_analysis_sentenceall, imported
# from text_analysis) on the per-year MDA frame.
analysis_sentence = text_analysis_sentenceall(df2)
# Bare last expression: display the resulting frame (one row per year, columns X and y).
analysis_sentence

Unnamed: 0,X,y
2007,[( table of index to financial statements fina...,1.77857
2008,[( table of index to financial statements this...,-0.291621
2009,[( table of index to financial statements fina...,-2.77553
2010,[( table of index to financial statements fina...,2.53192
2011,[( table of index to financial statements this...,1.60145
2012,[(management discussion and analysis of financ...,2.22403
2013,[(management discussion and analysis of financ...,1.67733
2014,[(management discussion and analysis of financ...,2.56919
2015,[(management discussion and analysis of financ...,2.86159
2016,[(management discussion and analysis of financ...,1.48528


In [8]:
# Persist the sentence-level analysis to 'sentence_analysis.pkl'.
analysis_sentence.to_pickle('sentence_analysis.pkl')

## Table of the Count of Words

This code creates a dataframe of the word counts at each stage of the sentiment analysis process. The final dataframe is used as a visual in the final report to show how the word count changes during the search for the best model.

In [9]:
# Word count of the raw text, before any sentiment analysis is applied.
# NOTE: pd.read_pickle can execute arbitrary code while loading; only read
# pickle files produced by this project.
words_before = pd.read_pickle('Cleaned_MDA.pkl')
# getXy (from cleanMDA) yields the per-year frame whose 'X' column holds the
# text that is counted below.
words_before = getXy(words_before)
# str.split() with no argument splits on any run of whitespace and discards
# empty strings. The previous split(' ') produced an empty "word" for every
# leading, trailing or repeated space (and never split on newlines or tabs),
# which inflated the raw word count.
count_before = np.array(words_before['X'].map(lambda text: len(text.split())))

In [10]:
# Number of scored words left per year in the saved word-level analysis
# (per the original note, words with polarity = 0 have already been removed).
words_during = pd.read_pickle('word_analysis.pkl')
count_during = np.array(words_during['X'].map(len))

In [11]:
# Number of words per year in the dataset used for the best model:
# reload the word-level analysis and pass it through low_subjectivity
# (imported from text_analysis) before counting the remaining entries.
analysis_word = pd.read_pickle('word_analysis.pkl')
word_polarity_low = low_subjectivity(analysis_word)
count_after = np.array(word_polarity_low['X'].map(len))

In [12]:
# Single-row summary frame: one column per processing stage, each holding the
# total word count across all years at that stage.
counts = pd.DataFrame(
    {
        'Raw Data': [count_before.sum()],
        'Dropped Zeros': [count_during.sum()],
        'Dropped High Subjectivity': [count_after.sum()],
    },
    index=['Word Count'],
)

In [13]:
# Find the number of words dropped at each stage.
# After the transpose each stage is a row and 'Word Count' is a column.
# NOTE: this rebinds `counts`, so re-running this cell on its own transposes
# the frame again and fails; re-run the cell that builds `counts` first.
counts = counts.transpose()
# Use .iloc for positional access: the index holds string stage labels, and
# integer keys with plain [] on such a Series rely on a deprecated positional
# fallback in pandas.
counts['Words Dropped'] = [
    0,  # nothing has been dropped from the raw data
    counts['Word Count'].iloc[0] - counts['Word Count'].iloc[1],
    counts['Word Count'].iloc[1] - counts['Word Count'].iloc[2],
]

In [14]:
# Persist the word-count summary to 'Word_count.pkl'.
counts.to_pickle('Word_count.pkl')

## Positive and Negative Scores and Frequency

This code creates two dataframes that provide a visual representation of the differences between the positive and negative scores. The frequency dataframe (`freq_score`) holds the count of positive and negative scores for each year, and the `sum_score` dataframe holds the sum of the positive scores and the sum of the negative scores for each year, along with the overall total. These dataframes are pickled and used in the final report for visualization.

In [15]:
# Build a fresh per-year frame (text plus GDP, per the original note) to feed
# the positive/negative score analysis.
# NOTE(review): repeats the earlier getXy(df_sentences) calls; presumably a
# separate copy is intentional — confirm.
df_ct = getXy(df_sentences) 

In [16]:
# Score the per-year text with text_analysis_score (imported from text_analysis).
# The following cell reads positions 0-3 of each 'X' entry as positive sum,
# positive frequency, negative sum and negative frequency.
pos_neg = text_analysis_score(df_ct)

In [17]:
# Each entry of pos_neg['X'] is read positionally; per the original notes the
# layout is (positive sum, positive frequency, negative sum, negative frequency).
score_entries = pos_neg['X']

# per-year totals of the positive and negative scores
pos_sum = np.array(score_entries.map(lambda entry: entry[0]))
neg_sum = np.array(score_entries.map(lambda entry: entry[2]))

# per-year frequencies of the positive and negative scores
pos_freq = np.array(score_entries.map(lambda entry: entry[1]))
neg_freq = np.array(score_entries.map(lambda entry: entry[3]))

# per-year overall score: sum of everything get_polarity returns for that year
wordAnalysis = pd.read_pickle("word_analysis.pkl")
word_polarity = get_polarity(wordAnalysis)
summed_word = np.array(word_polarity['X'].map(lambda values: sum(values)))

In [18]:
# Frequency frame: per-year counts of positive and negative scores,
# saved for the final report.
freq_score = pd.DataFrame({'Positive': pos_freq, 'Negative': neg_freq})
freq_score.to_pickle('pos_neg_count.pkl')

In [19]:
# Summed-score frame: per-year totals of the positive scores, the negative
# scores, and the overall score, saved for the final report.
sum_score = pd.DataFrame(
    {
        'Positive': pos_sum,
        'Negative': neg_sum,
        'Total': summed_word,
    }
)
sum_score.to_pickle('pos_neg_sum.pkl')