In [1]:
import pandas as pd
import scipy.sparse as ss
import statsmodels.api as sm

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [2]:
meta = pd.read_pickle('files/creep.pkl')
meta

Unnamed: 0,title,date,subgenre,rating,time,author,polarity,subjectivity,title_length,subgenre_count,num_words,mean_word_freq,num_unique_words,unique_ratio
0,blood magic,2020-07-24,beings entities monsters creatures cryptids ...,8.15,8,Tobias Wade,0.046569,0.501569,2,10,966,0.001789,559,0.578675
1,a diner open 25 hours a day,2020-07-23,abductions kidnappings beings entities scien...,8.08,12,Christopher Maxim,0.033834,0.514034,7,7,1369,0.001193,838,0.612126
2,a shattered life,2020-07-22,madness paranoia mental illness monsters crea...,9.10,14,Matt Dymerski,0.054970,0.531194,3,10,1496,0.001259,794,0.530749
3,always be nice to your neighbors,2020-07-21,deaths murders disappearances,7.45,4,Christine Druga,0.020018,0.402989,6,3,350,0.004367,229,0.654286
4,vantablack a death metal cult,2020-07-20,rites rituals,7.63,13,Christopher Maxim,0.120475,0.501398,5,2,1338,0.001477,677,0.505979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3494,the corner,2008-03-23,locations sites,6.85,2,Author Unknown,0.047026,0.527637,2,2,121,0.009709,103,0.851240
3495,the gallery of henri beauchamp,2008-03-22,rites rituals,8.28,5,Author Unknown,0.148358,0.508518,5,2,543,0.002786,359,0.661142
3496,the grove,2008-03-21,locations sites,7.67,1,Author Unknown,0.343233,0.539098,2,2,89,0.016949,59,0.662921
3497,the abandoned convenience store,2008-03-20,rites rituals,7.49,1,Author Unknown,-0.027165,0.549320,4,2,106,0.011628,86,0.811321


In [3]:
X = meta.loc[:,'polarity':'unique_ratio'].join(meta['time'])
y = meta['rating']

X = (X-X.mean())/X.std()

In [4]:
mod = sm.OLS(y,X).fit()
mod.summary()

0,1,2,3
Dep. Variable:,rating,R-squared (uncentered):,0.003
Model:,OLS,Adj. R-squared (uncentered):,0.0
Method:,Least Squares,F-statistic:,1.059
Date:,"Mon, 26 Oct 2020",Prob (F-statistic):,0.39
Time:,15:42:51,Log-Likelihood:,-12083.0
No. Observations:,3499,AIC:,24180.0
Df Residuals:,3490,BIC:,24240.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
polarity,0.0006,0.130,0.005,0.996,-0.255,0.256
subjectivity,-0.0394,0.130,-0.304,0.761,-0.294,0.215
title_length,0.0624,0.130,0.478,0.633,-0.193,0.318
subgenre_count,-0.0073,0.132,-0.055,0.956,-0.266,0.251
num_words,-0.0017,0.136,-0.013,0.990,-0.268,0.265
mean_word_freq,-0.0787,0.164,-0.479,0.632,-0.401,0.243
num_unique_words,0.2074,0.380,0.545,0.586,-0.538,0.953
unique_ratio,-0.2653,0.204,-1.298,0.194,-0.666,0.135
time,-0.1256,0.360,-0.349,0.727,-0.831,0.580

0,1,2,3
Omnibus:,582.024,Durbin-Watson:,0.037
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1065.565
Skew:,-1.043,Prob(JB):,4.13e-232
Kurtosis:,4.72,Cond. No.,6.82


In [5]:
X = meta[['time', 'mean_word_freq', 'unique_ratio']]
X = (X-X.mean())/X.std()

In [6]:
mod = sm.OLS(y,X).fit()
mod.summary()

0,1,2,3
Dep. Variable:,rating,R-squared (uncentered):,0.003
Model:,OLS,Adj. R-squared (uncentered):,0.002
Method:,Least Squares,F-statistic:,2.978
Date:,"Mon, 26 Oct 2020",Prob (F-statistic):,0.0303
Time:,15:42:56,Log-Likelihood:,-12083.0
No. Observations:,3499,AIC:,24170.0
Df Residuals:,3496,BIC:,24190.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
time,0.0404,0.175,0.230,0.818,-0.304,0.385
mean_word_freq,-0.1084,0.155,-0.699,0.484,-0.412,0.195
unique_ratio,-0.2890,0.199,-1.452,0.147,-0.679,0.101

0,1,2,3
Omnibus:,588.993,Durbin-Watson:,0.037
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1080.881
Skew:,-1.053,Prob(JB):,1.9500000000000002e-235
Kurtosis:,4.726,Cond. No.,2.75


In [7]:
stories = pd.read_pickle('files/corpus.pkl')

In [8]:
# remove common noise words
extras = ['like', 'just', 'said', 'im', 'didnt', 'dont', 'did', 'youre', 'youare', 'werent']
stop_words = text.ENGLISH_STOP_WORDS.union(extras)

In [9]:
vectorizer = TfidfVectorizer(stop_words=stop_words, max_df=0.8) # upper bound to reduce noise from nonsense terms
dtm = vectorizer.fit_transform(stories['story'])
dtm

<3499x62947 sparse matrix of type '<class 'numpy.float64'>'
	with 1833609 stored elements in Compressed Sparse Row format>

In [10]:
# dimension reduction using singular values 
svd = TruncatedSVD(n_components=50)
normalizer = Normalizer(copy=False)
pca = make_pipeline(svd, normalizer)
U = pca.fit_transform(dtm)
y = meta['rating']

In [11]:
svd.explained_variance_ratio_.sum()

0.11160206562195919

In [12]:
U = (U-U.mean())/U.std()
mod = sm.OLS(y,U).fit()
mod.summary()

0,1,2,3
Dep. Variable:,rating,R-squared (uncentered):,0.954
Model:,OLS,Adj. R-squared (uncentered):,0.954
Method:,Least Squares,F-statistic:,1443.0
Date:,"Mon, 26 Oct 2020",Prob (F-statistic):,0.0
Time:,16:03:09,Log-Likelihood:,-6685.8
No. Observations:,3499,AIC:,13470.0
Df Residuals:,3449,BIC:,13780.0
Df Model:,50,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.6402,0.009,189.192,0.000,1.623,1.657
x2,-0.0384,0.021,-1.798,0.072,-0.080,0.003
x3,-0.3682,0.021,-17.251,0.000,-0.410,-0.326
x4,-0.1300,0.026,-4.988,0.000,-0.181,-0.079
x5,0.0225,0.026,0.859,0.390,-0.029,0.074
x6,-0.1457,0.028,-5.150,0.000,-0.201,-0.090
x7,-0.1428,0.028,-5.102,0.000,-0.198,-0.088
x8,0.1228,0.033,3.686,0.000,0.057,0.188
x9,0.3739,0.034,11.066,0.000,0.308,0.440

0,1,2,3
Omnibus:,58.48,Durbin-Watson:,1.881
Prob(Omnibus):,0.0,Jarque-Bera (JB):,104.725
Skew:,0.101,Prob(JB):,1.8199999999999997e-23
Kurtosis:,3.823,Cond. No.,10.1


In [13]:
mod = sm.OLS(y,U[:, 0]).fit()
mod.summary()

0,1,2,3
Dep. Variable:,rating,R-squared (uncentered):,0.933
Model:,OLS,Adj. R-squared (uncentered):,0.933
Method:,Least Squares,F-statistic:,48370.0
Date:,"Mon, 26 Oct 2020",Prob (F-statistic):,0.0
Time:,16:03:17,Log-Likelihood:,-7369.9
No. Observations:,3499,AIC:,14740.0
Df Residuals:,3498,BIC:,14750.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.5869,0.007,219.933,0.000,1.573,1.601

0,1,2,3
Omnibus:,135.371,Durbin-Watson:,1.814
Prob(Omnibus):,0.0,Jarque-Bera (JB):,161.352
Skew:,0.445,Prob(JB):,9.18e-36
Kurtosis:,3.562,Cond. No.,1.0
