In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
#from cleanMDA import extractTable, divide_chunks, pullMDA
from textblob import TextBlob
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Sentiment Analysis by sentence

In [2]:
# company name and text in dataframe
df_sentences = pd.read_pickle('Cleaned_MDA_sentences.pkl')

In [3]:
# function to get GDP for 2007-2017 (import from GitHut)
def get_GDP_visual(df):
    USA = np.array(df[df['Data Source'] == 'United States'])

    gdp_USA = pd.DataFrame(USA, columns=['Country Name', 'Country COde', 'Indicator Name', 'Indicator Code',
                                                 '1960','1961','1962','1963','1964','1965','1966','1967','1968','1969',
                                                 '1970','1971','1972','1973','1974','1975','1976','1977','1978','1979',
                                                 '1980','1981','1982','1983','1984','1985','1986','1987','1988','1989',
                                                 '1990','1991','1992','1993','1994','1995','1996','1997','1998','1999',
                                                 '2000','2001','2002','2003','2004','2005','2006','2007','2008','2009',
                                                 '2010','2011','2012','2013','2014','2015','2016','2017','2018'])

    gdp_USA = gdp_USA.drop(columns=['Country Name','Country COde','Indicator Name','Indicator Code','1960','1961','1962',
                                   '1963','1964','1965','1966','1967','1968','1969', '1970','1971','1972','1973','1974',
                                   '1975','1976','1977','1978','1979', '1980','1981','1982','1983','1984','1985','1986',
                                   '1987','1988','1989', '1990','1991','1992','1993','1994','1995','1996','1997','1998',
                                   '1999', '2000','2001','2002','2003','2004','2005','2006','2018'])

    gdp_USA_trans = gdp_USA.transpose()
    gdp_USA_trans = gdp_USA_trans.rename(columns={0:'Y'})
    
    return gdp_USA_trans

In [4]:
gdp_df = pd.read_csv('gdp_annual.csv')
gdp_USA = get_GDP_visual(gdp_df)

In [5]:
gdp_USA

Unnamed: 0,Y
2007,1.77857
2008,-0.291621
2009,-2.77553
2010,2.53192
2011,1.60145
2012,2.22403
2013,1.67733
2014,2.56919
2015,2.86159
2016,1.48528


In [6]:
def text_analysis_sentence(text, subjective=True):
    pos = []
    neg = []
    # split for each sentence
    for word in text.split(' . '):
        blob = TextBlob(word)
        sent = blob.sentiment
        # eliminate words with high subjectivity
#         if subjective:
#             if sent.subjectivity < 0.7:
#                 if sent.polarity > 0.25:
#                     pos.append(sent.polarity)
#                 elif sent.polarity < -0.25:
#                     neg.append(sent.polarity)
        
    return word,sent.polarity,sent.subjectivity

In [7]:
# create a df to run an analysis of each company's text
analysis_df = pd.DataFrame(index=['X1','X2','X3','X4','X5','X6','X7','X8','X9','X10'])
log_analysis = pd.DataFrame(index=['X1','X2','X3','X4','X5','X6','X7','X8','X9','X10'])

In [8]:
for year in df_sentences:
    cols = []
    cols_log = []
    for i in range(len(df_sentences)):
        sum_pos = 0
        sum_neg = 0
        # combined the list of text words into a string
        pos, neg = text_analysis_sentence(' '.join(df_sentences[year][i][1]))
        # keep track of the positive and negitive total
        for i in pos:
            sum_pos += i

        for j in neg:
            sum_neg += j

        total = sum_pos+sum_neg
        cols.append(total)
        cols_log.append(np.log(total))
    # put scores in df by year
    analysis_df[year] = cols
    log_analysis[year] = cols_log



In [9]:
analysis_df = analysis_df.transpose()
analysis_df['y'] = np.array(gdp_USA['Y'])

log_analysis = log_analysis.transpose()
log_analysis['y'] = np.array(gdp_USA['Y'])

In [10]:
log_analysis = log_analysis.fillna(0)

In [11]:
# combine all the Xs into one value
comb_Xs = np.array(analysis_df.iloc[:,0:10].sum(axis=1))
single_df = pd.DataFrame(index=df_sentences.columns)
single_df['X'] = comb_Xs
single_df['y'] = analysis_df['y']

In [12]:
kfold = KFold(n_splits=5, shuffle=True, random_state=2)
kf = KFold(n_splits=5, random_state=2, shuffle=True)
#X = np.array(analysis_df.iloc[:,0:10])
#y = np.array(analysis_df['y'])
#X = np.array(single_df['X'])
#y = np.array(single_df['y'])
X = np.array(log_analysis.iloc[:,0:10])
y = np.array(log_analysis['y'])

In [16]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [17]:
# run if combined Xs dataframe
X_train = X_train[:, np.newaxis]
X_test = X_test[:, np.newaxis]

In [18]:
# run analysis on multiple models
xgb_reg = xgb.XGBRegressor(max_depth=4,n_estimators=300, n_jobs=-1, subsample=.7,random_seed=3)
lr = LinearRegression()
reg = BayesianRidge()

In [19]:
for m in [lr,reg,xgb_reg]:
    # model learns from train data
    m.fit(X_train,y_train)
    print(m)
    print(m.score(X_test,y_test))
    y_pred = m.predict(X_test)
    print(y_pred)
    print(mean_absolute_error(y_test, y_pred))
    
y_test

ValueError: Found array with dim 3. Estimator expected <= 2.

# Sentiment Analysis by words

In [None]:
df = 

In [20]:
def text_analysis_word(df):
    score = []
    for word in text:
        blob = TextBlob(word)
        sent = blob.sentiment
        # eliminate words with high subjectivity
        if sent.subjectivity < 0.7:
            if sent.polarity > 0.0:
                score.append(sent.polarity)
            elif sent.polarity < 0.0:
                score.append(sent.polarity)
    return score

In [21]:
word_analysis_df = pd.DataFrame(index=['X1','X2','X3','X4','X5','X6','X7','X8','X9','X10'])
word_log_analysis = pd.DataFrame(index=['X1','X2','X3','X4','X5','X6','X7','X8','X9','X10'])

In [22]:
for year in df_sentences:
    cols = []
    cols_log = []
    for i in range(len(df_sentences)):
        sum_total = 0
        score = text_analysis_word(df_sentences[year][i][1])
        # keep track of the positive and negitive total
        for j in score:
            sum_total += j

        cols.append(sum_total)
        cols_log.append(np.log(sum_total))
        
    # put scores in df by year
    word_analysis_df[year] = cols
    word_log_analysis[year] = cols_log

  if sys.path[0] == '':
