In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics
from sklearn.metrics import PrecisionRecallDisplay

from scipy import stats

import statsmodels.api as sm
import statsmodels.formula.api as smf

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
df = pd.read_csv('Final Data.txt', sep='^', header = None)
df.columns = ['Link', 'Title', 'Author', 'Year', 'Gender', 'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Negative', 'Positive', 'Sadness', 'Surprise', 'Trust', 'Number of Hits','Pos_Title','Neg_Title','Fertility', 'War', 'Recession']
df.head(2)

Unnamed: 0,Link,Title,Author,Year,Gender,Anger,Anticipation,Disgust,Fear,Joy,...,Positive,Sadness,Surprise,Trust,Number of Hits,Pos_Title,Neg_Title,Fertility,War,Recession
0,https://www.gutenberg.org/cache/epub/28561/pg2...,Josephine Makers of History,Abbot Jacob,1841,m,2761,2782,2046,3674,1889,...,7673,2424,1082,3429,16173,0,0,5.527,0.0,1.0
1,https://www.gutenberg.org/files/4667/4667-0.txt,Seven Wives and Seven Prisons,L.A. Abbott,1870,m,1913,1683,1541,2187,1165,...,4493,1931,717,2305,10517,0,1,4.988,0.0,0.75


In [3]:
data_df = df.drop(['Link','Title','Author'], axis=1)
data_df.head(2)

Unnamed: 0,Year,Gender,Anger,Anticipation,Disgust,Fear,Joy,Negative,Positive,Sadness,Surprise,Trust,Number of Hits,Pos_Title,Neg_Title,Fertility,War,Recession
0,1841,m,2761,2782,2046,3674,1889,6044,7673,2424,1082,3429,16173,0,0,5.527,0.0,1.0
1,1870,m,1913,1683,1541,2187,1165,4011,4493,1931,717,2305,10517,0,1,4.988,0.0,0.75


In [4]:
#convert gender to numeric inputs 
data_df['Gender']=data_df['Gender'].map({'m': 1, 'f': 0})
data_df.loc[34,'Gender'] = 1
#This line is needed because there is an error with mapping in my data
data_df.head(2)

copy_df = data_df

In [5]:
#normalize scores using number of hits to account for different book lengths
for b in data_df.columns:
    if (b!='Year' and b!='Gender' and b!='Pos_Title' and b!='Neg_Title' and b!='Fertility' and b!='War' and b!='Recession'):
        data_df[b] = data_df[b]/data_df['Number of Hits']

data_df = data_df.drop(['Number of Hits'], axis=1)
data_df.head(2)

Unnamed: 0,Year,Gender,Anger,Anticipation,Disgust,Fear,Joy,Negative,Positive,Sadness,Surprise,Trust,Pos_Title,Neg_Title,Fertility,War,Recession
0,1841,1.0,0.170717,0.172015,0.126507,0.227169,0.1168,0.373709,0.474433,0.149879,0.066902,0.21202,0,0,5.527,0.0,1.0
1,1870,1.0,0.181896,0.160027,0.146525,0.207949,0.110773,0.381383,0.427213,0.183607,0.068175,0.219169,0,1,4.988,0.0,0.75


In [6]:
#use z-normalization to distribute scores between 0 and 1
Anger = (data_df['Anger']-min(data_df['Anger']))/(max(data_df['Anger'])-min(data_df['Anger']))
Anticipation = (data_df['Anticipation']-min(data_df['Anticipation']))/(max(data_df['Anticipation'])-min(data_df['Anticipation']))
Disgust = (data_df['Disgust']-min(data_df['Disgust']))/(max(data_df['Disgust'])-min(data_df['Disgust']))
Fear = (data_df['Fear']-min(data_df['Fear']))/(max(data_df['Fear'])-min(data_df['Fear']))
Joy = (data_df['Joy']-min(data_df['Joy']))/(max(data_df['Joy'])-min(data_df['Joy']))
Negative = (data_df['Negative']-min(data_df['Negative']))/(max(data_df['Negative'])-min(data_df['Negative']))
Positive = (data_df['Positive']-min(data_df['Positive']))/(max(data_df['Positive'])-min(data_df['Positive']))
Sadness = (data_df['Sadness']-min(data_df['Sadness']))/(max(data_df['Sadness'])-min(data_df['Sadness']))
Surprise = (data_df['Surprise']-min(data_df['Surprise']))/(max(data_df['Surprise'])-min(data_df['Surprise']))
Trust = (data_df['Trust']-min(data_df['Trust']))/(max(data_df['Trust'])-min(data_df['Trust']))

data_df['Anger'] = Anger
data_df['Anticipation'] = Anticipation
data_df['Disgust'] = Disgust
data_df['Fear'] = Fear
data_df['Joy'] = Joy
data_df['Negative'] = Negative
data_df['Positive'] = Positive
data_df['Sadness'] = Sadness
data_df['Surprise'] = Surprise
data_df['Trust'] = Trust

In [7]:
data_df[['Year', 'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Negative', 'Positive', 'Sadness', 'Surprise', 'Trust']].corr()

Unnamed: 0,Year,Anger,Anticipation,Disgust,Fear,Joy,Negative,Positive,Sadness,Surprise,Trust
Year,1.0,-0.063579,-0.095985,-0.187012,-0.165636,-0.166112,-0.020813,-0.10965,-0.221648,0.046936,-0.04633
Anger,-0.063579,1.0,0.020872,0.237444,0.687023,-0.201113,0.643566,-0.605574,0.575091,0.341123,-0.453056
Anticipation,-0.095985,0.020872,1.0,-0.484322,0.230459,0.701798,0.00881,0.245865,0.3873,0.447588,-0.048186
Disgust,-0.187012,0.237444,-0.484322,1.0,0.13141,-0.317911,0.219094,-0.263279,0.175633,-0.28133,0.05636
Fear,-0.165636,0.687023,0.230459,0.13141,1.0,-0.112665,0.57697,-0.538899,0.560329,0.176256,-0.298489
Joy,-0.166112,-0.201113,0.701798,-0.317911,-0.112665,1.0,-0.234097,0.599847,0.228291,0.32849,0.043935
Negative,-0.020813,0.643566,0.00881,0.219094,0.57697,-0.234097,1.0,-0.764656,0.534816,0.120753,-0.323783
Positive,-0.10965,-0.605574,0.245865,-0.263279,-0.538899,0.599847,-0.764656,1.0,-0.326023,-0.056609,0.313909
Sadness,-0.221648,0.575091,0.3873,0.175633,0.560329,0.228291,0.534816,-0.326023,1.0,0.461111,-0.476985
Surprise,0.046936,0.341123,0.447588,-0.28133,0.176256,0.32849,0.120753,-0.056609,0.461111,1.0,-0.554531


In [8]:
#part above is used to attempt OLS - we see that some variables have a relationship with year, 
#but the R^2 vales are really low. This means we need to try multiple regression


In [9]:
mlr = smf.ols(formula="Sadness ~ Year", data=data_df).fit() #0.003
print(mlr.summary())
mlr = smf.ols(formula="Disgust ~ Year", data=data_df).fit() #0.014
print(mlr.summary())
mlr = smf.ols(formula="Joy ~ Year", data=data_df).fit() #0.029
print(mlr.summary())
mlr = smf.ols(formula="Fear ~ Year", data=data_df).fit() #0.030
print(mlr.summary())
mlr = smf.ols(formula="Positive ~ Year", data=data_df).fit() #0.152
print(mlr.summary())

                            OLS Regression Results                            
Dep. Variable:                Sadness   R-squared:                       0.049
Model:                            OLS   Adj. R-squared:                  0.044
Method:                 Least Squares   F-statistic:                     8.783
Date:                Mon, 21 Nov 2022   Prob (F-statistic):            0.00348
Time:                        19:08:03   Log-Likelihood:                 97.527
No. Observations:                 172   AIC:                            -191.1
Df Residuals:                     170   BIC:                            -184.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.8405      0.801      3.545      0.0

In [10]:
mlr = smf.ols(formula="Negative ~ Neg_Title", data=data_df).fit()
print(mlr.summary())

                            OLS Regression Results                            
Dep. Variable:               Negative   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                    0.4792
Date:                Mon, 21 Nov 2022   Prob (F-statistic):              0.490
Time:                        19:08:03   Log-Likelihood:                 80.991
No. Observations:                 172   AIC:                            -158.0
Df Residuals:                     170   BIC:                            -151.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4982      0.012     40.389      0.0

In [11]:
X = data_df[['Neg_Title']]
Y = data_df['Negative']
X = sm.add_constant(X)
fit = sm.OLS(Y, X).fit()
print(fit.pvalues[1])

0.48971440197478266


  x = pd.concat(x[::order], 1)


In [12]:
data_df[['Fertility','War','Recession','Year']].corr()

Unnamed: 0,Fertility,War,Recession,Year
Fertility,1.0,-0.090444,-0.021362,-0.692357
War,-0.090444,1.0,-0.347136,0.079583
Recession,-0.021362,-0.347136,1.0,-0.015846
Year,-0.692357,0.079583,-0.015846,1.0


In [13]:
v_df = data_df[['Year','Gender','Pos_Title','Neg_Title','Fertility','War','Recession']]
output = pd.DataFrame()
output['Predictor'] = v_df.columns
length = len(v_df.loc[0])
Score = np.zeros(length)

for i in range(length):
    Score[i] = variance_inflation_factor(v_df.values, i)

output['Score'] = Score
print(output)

   Predictor      Score
0       Year  15.623402
1     Gender   2.139805
2  Pos_Title   1.354381
3  Neg_Title   1.159299
4  Fertility   5.803117
5        War   1.494486
6  Recession   7.295704


In [14]:
# Without year
v_df = data_df[['Gender','Pos_Title','Neg_Title','Fertility','War','Recession']]
output = pd.DataFrame()
output['Predictor'] = v_df.columns
length = len(v_df.loc[0])
Score = np.zeros(length)

for i in range(length):
    Score[i] = variance_inflation_factor(v_df.values, i)

output['Score'] = Score
print(output)

   Predictor     Score
0     Gender  1.917703
1  Pos_Title  1.335647
2  Neg_Title  1.159286
3  Fertility  3.721952
4        War  1.210203
5  Recession  3.753446


In [15]:
mlr = smf.ols(formula="Positive ~ Gender + Pos_Title + Neg_Title + Fertility + War + Recession", data=data_df).fit()
print(mlr.summary())

                            OLS Regression Results                            
Dep. Variable:               Positive   R-squared:                       0.048
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     1.381
Date:                Mon, 21 Nov 2022   Prob (F-statistic):              0.225
Time:                        19:08:03   Log-Likelihood:                 115.44
No. Observations:                 172   AIC:                            -216.9
Df Residuals:                     165   BIC:                            -194.8
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4637      0.039     11.912      0.0

In [16]:
#the multiple linear regression still sucks - we get rid of 

In [17]:
#logistic regression may not work well bk of multicollinearity --> we may need to do a random forest approach (resolve multicollinearity)

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier #continuous data I don't use this
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics 

In [19]:
features = data_df
labels = np.array(features['Positive'])
features = features.drop(['Anger','Anticipation','Disgust','Fear','Joy','Negative','Positive','Sadness','Surprise','Trust'],axis=1)
feature_list = list(features.columns)
features = np.array(features)
# copy_list = list(copy_df.columns)
# copy = np.array(copy_df)

In [20]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.30)

In [21]:
print(train_features.shape)
print(train_labels.shape)
print(test_features.shape)
print(test_labels.shape)

(120, 7)
(120,)
(52, 7)
(52,)


In [22]:
rf = RandomForestRegressor(n_estimators = 1000)

In [23]:
rf.fit(train_features, train_labels)

RandomForestRegressor(n_estimators=1000)

In [24]:
predictions = rf.predict(test_features)
errors = abs(predictions-test_labels)
print(np.mean(errors))

0.12118291595039596


In [25]:
mape = 100*(errors/test_labels)
accuracy = 100-np.mean(mape)
print(accuracy)

-inf


  mape = 100*(errors/test_labels)


In [26]:
#Now Lets Make a Method that Does this

In [27]:
def random_forest_acc(inp_str,num_trees,test_prop):
    features = data_df
    labels = np.array(features[inp_str])
    features = features.drop(['Anger','Anticipation','Disgust','Fear','Joy','Negative','Positive','Sadness','Surprise','Trust'],axis=1)
    feature_list = list(features.columns)
    features = np.array(features)
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = test_prop)
    rf = RandomForestRegressor(n_estimators = num_trees)
    rf.fit(train_features, train_labels)
    predictions = rf.predict(test_features)
    errors = abs(predictions-test_labels)
    mape = 100*(errors/test_labels)
    accuracy = 100-np.mean(mape)
    return(accuracy)

In [28]:
Anger_ACC = random_forest_acc('Anger',1000,0.3)
print(Anger_ACC)

43.1472768370055


In [29]:
Anticipation_ACC = random_forest_acc('Anticipation',1000,0.3)
print(Anticipation_ACC)

73.64244178432732


In [30]:
Disgust_ACC = random_forest_acc('Disgust',1000,0.3)
print(Disgust_ACC)

-inf


  mape = 100*(errors/test_labels)


In [31]:
Fear_ACC = random_forest_acc('Fear',1000,0.3)
print(Fear_ACC)

61.077068717738015


In [32]:
Joy_ACC = random_forest_acc('Joy',1000,0.3)
print(Joy_ACC)

61.91920377434027


In [33]:
Negative_ACC = random_forest_acc('Negative',1000,0.3)
print(Negative_ACC)

57.19234222501184


In [34]:
Positive_ACC = random_forest_acc('Positive',1000,0.3)
print(Positive_ACC)

76.35363094675813


In [35]:
Sadness_ACC = random_forest_acc('Sadness',1000,0.3)
print(Sadness_ACC)

68.57956564287831


In [36]:
Surprise_ACC = random_forest_acc('Surprise',1000,0.3)
print(Surprise_ACC)

-inf


  mape = 100*(errors/test_labels)


In [37]:
Trust_ACC = random_forest_acc('Trust',1000,0.3)
print(Trust_ACC)

48.537278643200594


In [38]:
def random_forest_acc(inp_str,num_trees,test_prop,df):
    features = df
    labels = np.array(features[inp_str])
    features = features.drop(['Anger','Anticipation','Disgust','Fear','Joy','Negative','Positive','Sadness','Surprise','Trust'],axis=1)
    feature_list = list(features.columns)
    features = np.array(features)
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = test_prop)
    rf = RandomForestRegressor(n_estimators = num_trees)
    rf.fit(train_features, train_labels)
    predictions = rf.predict(test_features)
    errors = abs(predictions-test_labels)
    mn_errors = abs(np.mean(train_labels)-test_labels)
    md_errors = abs(np.median(train_labels)-test_labels)
    mape = 100*(errors/test_labels)
    mn_mape = 100*(mn_errors/test_labels)
    md_mape = 100*(md_errors/test_labels)
    accuracy = 100-np.mean(mape)
    mn_accuracy = 100-np.mean(mn_mape)
    md_accuracy = 100-np.mean(md_mape)
    print('Mean yields: ' + str(mn_accuracy))
    print('Median yields: ' + str(md_accuracy))
    return(accuracy)

In [44]:
Anger_ACC = random_forest_acc('Anger',1000,0.3,copy_df)
print(Anger_ACC)
Anticipation_ACC = random_forest_acc('Anticipation',1000,0.3,copy_df)
print(Anticipation_ACC)
Disgust_ACC = random_forest_acc('Disgust',1000,0.3,copy_df)
print(Disgust_ACC)
Fear_ACC = random_forest_acc('Fear',1000,0.3,copy_df)
print(Fear_ACC)
Joy_ACC = random_forest_acc('Joy',1000,0.3,copy_df)
print(Joy_ACC)
Negative_ACC = random_forest_acc('Negative',1000,0.3,copy_df)
print(Negative_ACC)
Positive_ACC = random_forest_acc('Positive',1000,0.3,copy_df)
print(Positive_ACC)
Sadness_ACC = random_forest_acc('Sadness',1000,0.3,copy_df)
print(Sadness_ACC)
Surprise_ACC = random_forest_acc('Surprise',1000,0.3,copy_df)
print(Surprise_ACC)
Trust_ACC = random_forest_acc('Trust',1000,0.3,copy_df)
print(Trust_ACC)

Compare against91.28647154917442
91.04087554800235
Compare against90.58392885356398
89.9672851847199
Compare against92.53918979403883
91.84568820995733
Compare against90.56909191585802
90.92187371160064
Compare against87.84708891042762
85.39695778747055
Compare against95.33310089956319
94.27817950555938
Compare against96.25404308702794
95.83727578496155
Compare against91.8156628637055
90.86572726776612
Compare against88.10267523996745
88.09763506783753
Compare against90.35035553245908
90.34911409581737
