In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from cleanMDA import extractTable, divide_chunks, pullMDA, getXy
from text_analysis import text_analysis_wordall, text_analysis_sentenceall, get_polarity, low_subjectivity, model_analysis
from textblob import TextBlob
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Sentiment Analysis by Word

In [11]:
# Load the cleaned MDA sentences DataFrame (company name and text) from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df_sentences = pd.read_pickle('Cleaned_MDA_sentences.pkl')

In [12]:
# Combine each company's text per year and attach that year's GDP,
# producing the input for the word-level analysis.
df = getXy(df_sentences) 

In [13]:
# Run the word-level text analysis (text_analysis_wordall) on the combined MDA frame.
# In the displayed result each year's X is a list of (word, number, number) tuples;
# presumably (word, polarity, subjectivity) -- TODO confirm in text_analysis.py.
analysis_df = text_analysis_wordall(df)
analysis_df

Unnamed: 0,X,y
2007,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",1.77857
2008,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",-0.291621
2009,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",-2.77553
2010,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",2.53192
2011,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",1.60145
2012,"[(other, -0.125, 0.375), (certain, 0.214285714...",2.22403
2013,"[(other, -0.125, 0.375), (certain, 0.214285714...",1.67733
2014,"[(other, -0.125, 0.375), (certain, 0.214285714...",2.56919
2015,"[(other, -0.125, 0.375), (certain, 0.214285714...",2.86159
2016,"[(other, -0.125, 0.375), (certain, 0.214285714...",1.48528


In [14]:
# Save the word-level analysis to 'word_analysis.pkl' so it can be reloaded later in the notebook.
analysis_df.to_pickle('word_analysis.pkl')

## Sentiment Analysis by Sentence

In [197]:
# Combine each company's text per year and attach that year's GDP,
# producing the input for the sentence-level analysis.
df2 = getXy(df_sentences)

In [199]:
# Run the sentence-level text analysis (text_analysis_sentenceall) on the combined MDA frame
# and display the result.
analysis_sentence = text_analysis_sentenceall(df2)
analysis_sentence

In [213]:
# Save the sentence-level analysis to 'sentence_analysis.pkl' so it can be reloaded later in the notebook.
analysis_sentence.to_pickle('sentence_analysis.pkl')

## Get just the polarity score from the sentence and word analysis

In [3]:
# Reload the word-level and sentence-level analyses from their pickle files.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
analysis_word = pd.read_pickle('word_analysis.pkl')
analysis_sentence = pd.read_pickle('sentence_analysis.pkl')

In [4]:
# Reduce the word-level analysis to polarity scores with get_polarity.
# In the displayed result each year's X is a plain list of floats and y is unchanged.
word_polarity = get_polarity(analysis_word)
word_polarity

Unnamed: 0,X,y
2007,"[0.1, -0.125, -0.2, -0.05, -0.05, -0.125, -0.1...",1.77857
2008,"[0.1, -0.125, -0.2, 0.7, -0.05, -0.05, -0.125,...",-0.291621
2009,"[0.1, -0.125, -0.2, -0.05, -0.05, -0.125, -0.1...",-2.77553
2010,"[0.1, -0.125, -0.2, -0.05, -0.05, -0.125, -0.1...",2.53192
2011,"[0.1, -0.125, -0.2, -0.05, -0.05, -0.125, -0.1...",1.60145
2012,"[-0.125, 0.21428571428571427, 0.1, 0.375, -0.0...",2.22403
2013,"[-0.125, 0.21428571428571427, 0.1, 0.375, -0.0...",1.67733
2014,"[-0.125, 0.21428571428571427, 0.1, 0.375, -0.0...",2.56919
2015,"[-0.125, 0.21428571428571427, 0.1, 0.375, -0.0...",2.86159
2016,"[-0.125, 0.21428571428571427, 0.1, 0.375, -0.0...",1.48528


In [5]:
# Reduce the sentence-level analysis to polarity scores with get_polarity.
# In the displayed result each year's X is a plain list of floats and y is unchanged.
sentence_polarity = get_polarity(analysis_sentence)
sentence_polarity

Unnamed: 0,X,y
2007,"[-0.0140625, -0.05, -0.05, -0.125, -0.06666666...",1.77857
2008,"[-0.011842105263157895, 0.7, -0.05, -0.05, -0....",-0.291621
2009,"[-0.0140625, -0.05, -0.05, -0.125, -0.06666666...",-2.77553
2010,"[-0.016071428571428573, -0.05, -0.05, -0.125, ...",2.53192
2011,"[-0.013235294117647059, -0.05, -0.05, -0.125, ...",1.60145
2012,"[-0.041666666666666664, 0.027380952380952377, ...",2.22403
2013,"[-0.041666666666666664, 0.027380952380952377, ...",1.67733
2014,"[-0.041666666666666664, 0.027380952380952377, ...",2.56919
2015,"[-0.041666666666666664, 0.027380952380952377, ...",2.86159
2016,"[-0.041666666666666664, 0.027380952380952377, ...",1.48528


# Model Analysis

### Words

In [6]:
# Variant 1: expand every year's polarity list into one column per score
# and fit on the resulting feature matrix.
word_split_Xs = pd.DataFrame(word_polarity['X'].tolist())
word_split = model_analysis(
    np.array(word_split_Xs),
    np.array(word_polarity['y']),
)

# Variant 2: collapse every year's polarity list into a single total
# and fit on that one feature.
summed_word = np.array(word_polarity['X'].map(sum))
Sum_word = model_analysis(
    summed_word,
    np.array(word_polarity['y']),
)



### Sentences

In [7]:
# Variant 1: expand every year's polarity list into one column per score
# and fit on the resulting feature matrix.
sentences_split_Xs = pd.DataFrame(sentence_polarity['X'].tolist())
sentence_split = model_analysis(
    np.array(sentences_split_Xs),
    np.array(sentence_polarity['y']),
)

# Variant 2: collapse every year's polarity list into a single total
# and fit on that one feature.
summed_sentence = np.array(sentence_polarity['X'].map(sum))
Sum_sentence = model_analysis(
    summed_sentence,
    np.array(sentence_polarity['y']),
)



In [18]:
# Pair every model_analysis result with a readable label and print one line per model.
models = [word_split, Sum_word, sentence_split, Sum_sentence]
name = ['word_split', 'Sum_word', 'sentence_split', 'Sum_sentence']
for label, result in zip(name, models):
    # element 2 of a model_analysis result is the value this notebook reports as the score
    print(label, result[2])

word_split -14.00501376563017
Sum_word -1.5412611237579816
sentence_split -6.467280047393677
Sum_sentence -1.8563084104948013


# Final Model

Since Sum_word was the best-performing model in the comparison above, we now try to improve its fit by adjusting the subjectivity and polarity score parameters.

In [19]:
# Reload the word-level analysis from 'word_analysis.pkl' for the final model.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
analysis_word = pd.read_pickle('word_analysis.pkl')

In [20]:
# Apply low_subjectivity to the word-level analysis; the author describes the result as
# the best parameter choice of the dataset for the model.
# NOTE(review): presumably this filters words by subjectivity/polarity thresholds -- confirm in text_analysis.py.
word_polarity_low = low_subjectivity(analysis_word)

In [22]:
# Total the polarity scores for each year, then fit on that single summed feature.
summed_word_low = np.array(word_polarity_low['X'].map(sum))
Sum_word_best = model_analysis(
    summed_word_low,
    np.array(word_polarity_low['y']),
)
# Show the full result tuple, followed by its element 2 (reported as the score).
print(Sum_word_best)
print('Best score: ', Sum_word_best[2])

(XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=-1, nthread=None, objective='reg:linear', random_seed=3,
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=0.7, verbosity=1), XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=-1, nthread=None, objective='reg:linear', random_seed=3,
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=0.7, verbosity=1), 0.48816836575278644, array([2.5680907, 2.2241852], dtype=float32), 0.