In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict
from cleanMDA import extractTable, divide_chunks, pullMDA, getXy
from text_analysis import text_analysis_wordall, text_analysis_sentenceall, get_polarity, low_subjectivity, model_analysis
from textblob import TextBlob
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Sentiment Analysis by Word

In [11]:
# Load the cleaned MDA sentences (company name and text) into a DataFrame.
# NOTE(review): read_pickle can execute code embedded in the file; only load
# pickles produced by a trusted run of this project.
df_sentences = pd.read_pickle('Cleaned_MDA_sentences.pkl')

In [3]:
# function to get the United States GDP rows for 2007-2017 (imported from GitHub)
def get_GDP_visual(df):
    """Return the United States GDP values for 2007-2017, indexed by year.

    Rows whose 'Data Source' column equals 'United States' are selected and
    their columns are relabelled positionally: four metadata columns followed
    by one column per year from 1960 to 2018 (so `df` must have 63 columns).
    Only the 2007-2017 year columns are kept, the frame is transposed so the
    years become the index, and the column labelled 0 is renamed to 'Y'.
    """
    metadata_labels = ['Country Name', 'Country COde', 'Indicator Name', 'Indicator Code']
    all_year_labels = [str(year) for year in range(1960, 2019)]
    kept_year_labels = [str(year) for year in range(2007, 2018)]

    # Positional relabelling: the raw values are re-wrapped with explicit names.
    usa_values = np.array(df[df['Data Source'] == 'United States'])
    usa_frame = pd.DataFrame(usa_values, columns=metadata_labels + all_year_labels)

    # Keep just the years of interest, then flip so each year is a row.
    by_year = usa_frame[kept_year_labels].T
    return by_year.rename(columns={0: 'Y'})

In [4]:
# Read the annual GDP table, then reduce it to the United States values for
# 2007-2017 (years as index, values in column 'Y').
gdp_df = pd.read_csv('gdp_annual.csv')
gdp_USA = get_GDP_visual(gdp_df)

In [13]:
# call the function text_analysis_wordall on the MDA text df
# NOTE(review): `df` is not assigned in any visible cell before this one, so a
# fresh-kernel run raises NameError here — confirm which frame was intended.
analysis_df = text_analysis_wordall(df)
analysis_df

Unnamed: 0,X,y
2007,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",1.77857
2008,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",-0.291621
2009,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",-2.77553
2010,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",2.53192
2011,"[(frequently, 0.1, 0.3), (other, -0.125, 0.375...",1.60145
2012,"[(other, -0.125, 0.375), (certain, 0.214285714...",2.22403
2013,"[(other, -0.125, 0.375), (certain, 0.214285714...",1.67733
2014,"[(other, -0.125, 0.375), (certain, 0.214285714...",2.56919
2015,"[(other, -0.125, 0.375), (certain, 0.214285714...",2.86159
2016,"[(other, -0.125, 0.375), (certain, 0.214285714...",1.48528


In [6]:
def text_analysis_sentence(text, subjective=True, return_all=False):
    """Score the sentiment of each ' . '-separated sentence in `text`.

    Parameters
    ----------
    text : str
        Text whose sentences are separated by ' . ' (space, period, space).
    subjective : bool, default True
        Currently unused. Kept so existing calls keep working; the
        subjectivity/polarity filter it was meant to toggle was never enabled.
    return_all : bool, default False
        When True, return the scores of every sentence. When False, keep the
        historical behaviour of returning only the last sentence's scores.

    Returns
    -------
    tuple or list of tuple
        `(sentence, polarity, subjectivity)` for the last sentence, or a list
        of such tuples (one per sentence, in order) when `return_all` is True.
    """
    scored = []
    for sentence in text.split(' . '):
        sentiment = TextBlob(sentence).sentiment
        scored.append((sentence, sentiment.polarity, sentiment.subjectivity))

    if return_all:
        return scored
    # str.split with a non-empty separator always yields at least one piece,
    # so there is always a last entry to return.
    return scored[-1]

In [7]:
## Sentiment Analysis by Sentence

In [197]:
# combine the company's text for each year and add the GDP of each year for sentence analysis
# The result is the input to text_analysis_sentenceall in the next cell.
df2 = getXy(df_sentences)

In [199]:
# call the function text_analysis_sentenceall on the MDA text df
# (df2 is the per-year frame built by getXy above); the bare name on the last
# line displays the result as the cell output.
analysis_sentence = text_analysis_sentenceall(df2)
analysis_sentence

In [213]:
# Persist the sentence analysis so it can be reloaded with read_pickle later
# instead of being recomputed.
analysis_sentence.to_pickle('sentence_analysis.pkl')

## Get just the polarity score from the sentence and word analysis

In [11]:
# Reload the saved word-level and sentence-level analysis frames.
# NOTE(review): 'sentence_analysis.pkl' is written earlier in this notebook, but
# no visible cell writes 'word_analysis.pkl' — confirm where it is produced.
analysis_word = pd.read_pickle('word_analysis.pkl')
analysis_sentence = pd.read_pickle('sentence_analysis.pkl')

In [12]:
# call the function get_polarity on the word analysis df
# (per the displayed output, X becomes a list of polarity scores per year and
# y is carried through unchanged)
word_polarity = get_polarity(analysis_word)
word_polarity

Unnamed: 0,X,y
2007,"[0.1, -0.125, -0.2, -0.05, -0.05, -0.125, -0.1...",1.77857
2008,"[0.1, -0.125, -0.2, 0.7, -0.05, -0.05, -0.125,...",-0.291621
2009,"[0.1, -0.125, -0.2, -0.05, -0.05, -0.125, -0.1...",-2.77553
2010,"[0.1, -0.125, -0.2, -0.05, -0.05, -0.125, -0.1...",2.53192
2011,"[0.1, -0.125, -0.2, -0.05, -0.05, -0.125, -0.1...",1.60145
2012,"[-0.125, 0.21428571428571427, 0.1, 0.375, -0.0...",2.22403
2013,"[-0.125, 0.21428571428571427, 0.1, 0.375, -0.0...",1.67733
2014,"[-0.125, 0.21428571428571427, 0.1, 0.375, -0.0...",2.56919
2015,"[-0.125, 0.21428571428571427, 0.1, 0.375, -0.0...",2.86159
2016,"[-0.125, 0.21428571428571427, 0.1, 0.375, -0.0...",1.48528


In [16]:
# call the function get_polarity on the sentence analysis df
# (per the displayed output, X becomes a list of polarity scores per year and
# y is carried through unchanged)
sentence_polarity = get_polarity(analysis_sentence)
sentence_polarity

Unnamed: 0,X,y
2007,"[-0.0140625, -0.05, -0.05, -0.125, -0.06666666...",1.77857
2008,"[-0.011842105263157895, 0.7, -0.05, -0.05, -0....",-0.291621
2009,"[-0.0140625, -0.05, -0.05, -0.125, -0.06666666...",-2.77553
2010,"[-0.016071428571428573, -0.05, -0.05, -0.125, ...",2.53192
2011,"[-0.013235294117647059, -0.05, -0.05, -0.125, ...",1.60145
2012,"[-0.041666666666666664, 0.027380952380952377, ...",2.22403
2013,"[-0.041666666666666664, 0.027380952380952377, ...",1.67733
2014,"[-0.041666666666666664, 0.027380952380952377, ...",2.56919
2015,"[-0.041666666666666664, 0.027380952380952377, ...",2.86159
2016,"[-0.041666666666666664, 0.027380952380952377, ...",1.48528


# Model Analysis

### Words

In [17]:
### Sentences

In [18]:
# Model on the individual scores: each polarity score in a year's list becomes
# its own feature column.
sentences_split_Xs = pd.DataFrame(list(sentence_polarity['X']))
sentence_split = model_analysis(
    np.array(sentences_split_Xs),
    np.array(sentence_polarity['y']),
)

# Model on the yearly total: collapse each year's list of scores to its sum.
summed_sentence = np.array([sum(scores) for scores in sentence_polarity['X']])
Sum_sentence = model_analysis(
    summed_sentence,
    np.array(sentence_polarity['y']),
)



In [19]:
# Print the third element of each model_analysis result next to its label.
# NOTE(review): word_split and Sum_word are not assigned in any visible cell
# (the "Words" cell is empty), so this fails on a fresh kernel until they are.
models = [word_split,Sum_word,sentence_split,Sum_sentence]
name = ['word_split','Sum_word','sentence_split','Sum_sentence']
for label, result in zip(name, models):
    print(label, result[2])

ValueError: Found array with dim 3. Estimator expected <= 2.

# Final Model

Since the best model found above was Sum_word, we now check whether it can be fit better by adjusting the subjectivity and polarity thresholds used to filter the word scores.

In [None]:
# TODO: unfinished cell — this line was a bare `df =` with no right-hand side,
# which is a SyntaxError. Assign the intended DataFrame to `df` here.

In [20]:
def text_analysis_word(df, max_subjectivity=0.7, min_polarity=0.0):
    """Collect the polarity of each sufficiently objective, non-neutral word.

    Parameters
    ----------
    df : iterable of str
        The words to score. (The name is historical; any iterable of strings
        works. Previously the body iterated an undefined name `text` instead
        of this argument.)
    max_subjectivity : float, default 0.7
        Words whose subjectivity is at or above this value are skipped.
    min_polarity : float, default 0.0
        Words whose absolute polarity is not strictly greater than this value
        are skipped; the default drops only exactly-neutral words.

    Returns
    -------
    list of float
        Polarity scores of the retained words, in input order.
    """
    score = []
    for word in df:
        sent = TextBlob(word).sentiment
        # eliminate words with high subjectivity
        if sent.subjectivity >= max_subjectivity:
            continue
        # Positive and negative words are both kept; only the magnitude matters.
        if abs(sent.polarity) > min_polarity:
            score.append(sent.polarity)
    return score

In [21]:
# the best parameters of the dataset for the model
word_polarity_low = low_subjectivity(analysis_word)

In [22]:
# get the sum of all the scores for each year
summed_word_low = np.array(word_polarity_low['X'].map(lambda x: sum(x)))
Sum_word_best = model_analysis(summed_word_low,np.array(word_polarity_low['y']))
print(Sum_word_best)
print('Best score: ',Sum_word_best[2])

  if sys.path[0] == '':
