In [1]:
import pandas as pd
import numpy as np
from glob2 import glob
from sklearn.decomposition import PCA
from scipy.linalg import norm 
import plotly_express as px
import seaborn as sns
sns.set(style = 'ticks')
%matplotlib inline
import os 
# NOTE(review): hard-coded absolute path into one user's home directory. Every
# relative path read or written below resolves against it, so the notebook only
# runs unmodified on that machine.
os.chdir('/Users/gracelyons/Desktop/MSDS/Capstone/')

## NRC

In [2]:
# Load the NRC lexicon; per the preview below it holds one (word, sentiment) pair
# per row, so a word appears once for every sentiment it is tagged with.
nrc = pd.read_csv('NRC.csv')

In [3]:
nrc.head()

Unnamed: 0,word,sentiment
0,abacus,trust
1,abandon,fear
2,abandon,negative
3,abandon,sadness
4,abandoned,anger


In [4]:
# Distinct sentiment labels present in the lexicon.
sentiments = nrc['sentiment'].unique()

In [5]:
# Split the lexicon into one sub-frame per sentiment, keyed "df_<sentiment>".
# reset_index(drop=True) renumbers the rows from 0 directly, instead of
# materialising the old index as an 'index' column and dropping it again.
d = {
    f"df_{sent}": nrc[nrc.sentiment == sent].reset_index(drop = True)
    for sent in sentiments
}

In [6]:
d['df_trust'].head()

Unnamed: 0,word,sentiment
0,abacus,trust
1,abbot,trust
2,absolution,trust
3,abundance,trust
4,academic,trust


In [8]:
# Token table; per the preview below, one row per token with its speaker,
# position within the transcript, POS tag and normalised term (term_str).
token = pd.read_csv('output files/TOKEN.csv')

In [9]:
token.head()

Unnamed: 0,speaker_id,line_num,sent_num,token_num,pos_tuple,pos,token_str,term_str,term_id
0,201,1,0,0,"('Good', 'JJ')",JJ,Good,good,542
1,201,1,0,1,"('morning', 'NN')",NN,morning,morning,828
2,201,1,0,2,"('class', 'NN')",NN,class,class,231
3,201,1,1,0,"('I', 'PRP')",PRP,I,i,621
4,201,1,1,1,"(""'m"", 'VBP')",VBP,'m,m,764


In [10]:
# Add one 0/1 column per sentiment: 1 when the token's normalised term appears
# in that sentiment's word list, 0 otherwise.
# Series.isin performs a single vectorised membership test per sentiment, replacing
# a row-wise apply that rescanned the whole word array for every token.
for sent in sentiments:
    # dropna: a missing term must never count as a match against a missing word
    sentiment_words = d[f"df_{sent}"].word.dropna()
    token[sent] = token.term_str.isin(sentiment_words).astype(int)

token.head()

Unnamed: 0,speaker_id,line_num,sent_num,token_num,pos_tuple,pos,token_str,term_str,term_id,trust,fear,negative,sadness,anger,surprise,positive,disgust,joy,anticipation
0,201,1,0,0,"('Good', 'JJ')",JJ,Good,good,542,1,0,0,0,0,1,1,0,1,1
1,201,1,0,1,"('morning', 'NN')",NN,morning,morning,828,0,0,0,0,0,0,0,0,0,0
2,201,1,0,2,"('class', 'NN')",NN,class,class,231,0,0,0,0,0,0,0,0,0,0
3,201,1,1,0,"('I', 'PRP')",PRP,I,i,621,0,0,0,0,0,0,0,0,0,0
4,201,1,1,1,"(""'m"", 'VBP')",VBP,'m,m,764,0,0,0,0,0,0,0,0,0,0


In [11]:
# Count sentiment hits per speaker.
# Selecting the sentiment columns before summing keeps the aggregation away from
# the string columns (pos_tuple, pos, token_str, term_str), which depending on the
# pandas version are either silently dropped or concatenated per group. It also
# skips summing the position/id columns that were only dropped afterwards.
token_grouped = token.groupby('speaker_id')[list(sentiments)].sum().reset_index()

In [12]:
token_grouped.head()

Unnamed: 0,speaker_id,trust,fear,negative,sadness,anger,surprise,positive,disgust,joy,anticipation
0,201,19,4,14,4,2,6,30,1,17,18
1,202,21,0,1,0,0,3,32,0,10,13
2,203,28,2,4,6,1,0,47,1,3,8
3,204,17,3,6,4,0,4,27,0,10,16
4,205,17,1,0,0,0,4,33,0,11,10


In [13]:
# Number of tokens per speaker as a bare array, in groupby('speaker_id') order.
lengths = token.groupby('speaker_id').size().to_numpy()

In [14]:
# Attach the per-speaker token counts. `lengths` is matched by position, which
# relies on it and token_grouped both coming from groupby('speaker_id') on `token`.
token_grouped = token_grouped.assign(length = lengths)

In [15]:
token_grouped.head()

Unnamed: 0,speaker_id,trust,fear,negative,sadness,anger,surprise,positive,disgust,joy,anticipation,length
0,201,19,4,14,4,2,6,30,1,17,18,514
1,202,21,0,1,0,0,3,32,0,10,13,673
2,203,28,2,4,6,1,0,47,1,3,8,673
3,204,17,3,6,4,0,4,27,0,10,16,533
4,205,17,1,0,0,0,4,33,0,11,10,569


In [16]:
# Normalise each speaker's sentiment counts by that speaker's total token count,
# producing one "<sentiment>_norm" column per sentiment.
# A single column-wise division replaces the per-row apply. A zero count already
# divides to 0.0, so no special case is needed (length is a group size, hence >= 1).
token_grouped_normalized = (
    token_grouped[list(sentiments)]
    .div(token_grouped['length'], axis = 0)
    .add_suffix('_norm')
)

In [17]:
# Carry the speaker id across so later tables can be joined on it.
token_grouped_normalized = token_grouped_normalized.assign(speaker_id = token_grouped['speaker_id'])

In [19]:
# Read only the id and assignment columns from the survey workbook, renamed to
# the names used in the rest of this notebook.
survey_columns = {'studyid': 'speaker_id', 'assignment': 'mindfulness'}
survey = pd.read_excel('survey_data.xlsx', usecols = list(survey_columns)).rename(columns = survey_columns)

In [20]:
# Attach each speaker's mindfulness label from the survey.
# The join key is named explicitly rather than left to whichever column names
# the two frames happen to share, so an accidental extra shared column cannot
# silently change the join.
token_grouped_normalized = token_grouped_normalized.merge(survey, on = 'speaker_id')

In [22]:
# Hold-out lookup keyed by studyid. The merge in the next cell keeps its `holdout`
# flag and discards its `mindfulness` and `300s` columns.
ho_lookup = pd.read_csv('holdout_samples_lookup.csv')

In [23]:
# Join the hold-out flag onto each speaker, then tidy up: discard the lookup's
# duplicate id/label columns and '300s', restore the un-suffixed label name,
# and index the result by speaker.
token_grouped_normalized = (
    token_grouped_normalized
    .merge(ho_lookup, left_on = 'speaker_id', right_on = 'studyid')
    .drop(columns = ['studyid', 'mindfulness_y', '300s'])
    .rename(columns = {'mindfulness_x': 'mindfulness'})
    .set_index('speaker_id')
)

In [24]:
token_grouped_normalized.head()

Unnamed: 0_level_0,trust_norm,fear_norm,negative_norm,sadness_norm,anger_norm,surprise_norm,positive_norm,disgust_norm,joy_norm,anticipation_norm,mindfulness,holdout
speaker_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
201,0.036965,0.007782,0.027237,0.007782,0.003891,0.011673,0.058366,0.001946,0.033074,0.035019,1,0
202,0.031204,0.0,0.001486,0.0,0.0,0.004458,0.047548,0.0,0.014859,0.019316,0,0
203,0.041605,0.002972,0.005944,0.008915,0.001486,0.0,0.069837,0.001486,0.004458,0.011887,1,1
204,0.031895,0.005629,0.011257,0.007505,0.0,0.007505,0.050657,0.0,0.018762,0.030019,0,0
205,0.029877,0.001757,0.0,0.0,0.0,0.00703,0.057996,0.0,0.019332,0.017575,1,0


In [49]:
# Write the per-speaker feature table to disk (the speaker_id index is written too).
# NOTE(review): this cell's execution count (49) is out of sequence with its
# neighbours — confirm the saved file matches a clean top-to-bottom run.
token_grouped_normalized.to_csv('output files/nrc_normalized.csv')

In [25]:
# Split on the holdout flag: rows with holdout == 0 are used for fitting, rows
# with holdout == 1 for evaluation. Features are every column except the label
# and the flag itself.
non_feature_cols = ['mindfulness', 'holdout']
train_rows = token_grouped_normalized[token_grouped_normalized.holdout == 0]
test_rows = token_grouped_normalized[token_grouped_normalized.holdout == 1]

X = train_rows.drop(columns = non_feature_cols)
y = train_rows.mindfulness
X_test = test_rows.drop(columns = non_feature_cols)
y_test = test_rows.mindfulness

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [29]:
# L2-penalised logistic regression fitted on the training speakers.
# NOTE(review): the features are small unscaled rates (roughly 0.00-0.07 in the
# preview above) and the evaluation below predicts a single class for every
# held-out speaker. The penalty may be dominating the fit — consider
# standardising the features. TODO confirm.
log_reg_params = dict(penalty = 'l2', solver = 'newton-cg', C = 10, random_state = 0)
log_reg = LogisticRegression(**log_reg_params).fit(X, y)

In [30]:
log_reg.coef_

array([[-1.26523655,  0.32247171,  0.31524263,  0.22859735,  0.01031457,
        -0.50609621, -1.12443453,  0.06477791, -0.87841044, -0.90028629]])

In [31]:
log_reg.intercept_

array([0.0675756])

In [32]:
# Predicted class labels for the held-out speakers.
pred = log_reg.predict(X_test)

In [33]:
# Convert the predicted labels to a plain list. predict() already returns class
# labels, so the rounding leaves the values unchanged.
prediction = [round(label) for label in pred]

In [34]:
accuracy_score(y_test, prediction)

0.5

In [35]:
confusion_matrix(y_test, prediction)

array([[9, 0],
       [9, 0]])

In [36]:
# Per-class precision / recall / F1 on the held-out speakers.
# zero_division = 0 states explicitly that an undefined precision (a class that is
# never predicted) is reported as 0, instead of emitting one undefined-metric
# warning per averaging mode as seen in the recorded output.
print(classification_report(y_test, log_reg.predict(X_test), zero_division = 0))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         9
           1       0.00      0.00      0.00         9

    accuracy                           0.50        18
   macro avg       0.25      0.50      0.33        18
weighted avg       0.25      0.50      0.33        18



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
import statsmodels.api as sm

In [38]:
# The same regression fitted with statsmodels, to get per-coefficient inference.
# NOTE(review): X is passed as-is and the summary below lists no constant term, so
# this model appears to be fitted without an intercept, unlike the scikit-learn
# fit which reports one. Confirm that is intended before comparing the two.
logit_spec = sm.Logit(endog = y, exog = X)
log_reg_sm = logit_spec.fit(method = 'newton')

Optimization terminated successfully.
         Current function value: 0.632186
         Iterations 7


In [39]:
print(log_reg_sm.summary())

                           Logit Regression Results                           
Dep. Variable:            mindfulness   No. Observations:                   71
Model:                          Logit   Df Residuals:                       61
Method:                           MLE   Df Model:                            9
Date:                Sun, 26 Mar 2023   Pseudo R-squ.:                 0.08677
Time:                        18:00:05   Log-Likelihood:                -44.885
converged:                       True   LL-Null:                       -49.150
Covariance Type:            nonrobust   LLR p-value:                    0.4818
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
trust_norm          -59.0415     36.648     -1.611      0.107    -130.871      12.788
fear_norm            66.6047     77.933      0.855      0.393     -86.141     219.350
negative_norm        95.

In [40]:
# Model output for the held-out speakers; presumably probabilities of
# mindfulness == 1, since the next cell rounds them into 0/1 labels.
pred_sm = log_reg_sm.predict(X_test)

In [41]:
# Round the statsmodels outputs into 0/1 labels.
# NOTE(review): this rebinds `prediction`, replacing the scikit-learn predictions
# stored under the same name earlier in the notebook.
prediction = [round(prob) for prob in pred_sm]

In [42]:
# Side-by-side dump of the true labels and the statsmodels predictions.
# The second label previously read 'predicitions' (typo in the emitted text).
print('actual values', list(y_test.values))
print('predictions', prediction)

actual values [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0]
predicitions [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1]


In [43]:
# Confusion matrix for the statsmodels predictions (rows = actual, columns = predicted).
cm_sm = confusion_matrix(y_true = y_test, y_pred = prediction)
cm_sm

array([[7, 2],
       [7, 2]])

In [44]:
accuracy_score(y_test,prediction)

0.5

In [45]:
log_reg_sm.summary()

0,1,2,3
Dep. Variable:,mindfulness,No. Observations:,71.0
Model:,Logit,Df Residuals:,61.0
Method:,MLE,Df Model:,9.0
Date:,"Sun, 26 Mar 2023",Pseudo R-squ.:,0.08677
Time:,18:00:09,Log-Likelihood:,-44.885
converged:,True,LL-Null:,-49.15
Covariance Type:,nonrobust,LLR p-value:,0.4818

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
trust_norm,-59.0415,36.648,-1.611,0.107,-130.871,12.788
fear_norm,66.6047,77.933,0.855,0.393,-86.141,219.350
negative_norm,95.7275,82.113,1.166,0.244,-65.212,256.667
sadness_norm,-106.6012,90.683,-1.176,0.240,-284.336,71.134
anger_norm,-195.7209,156.926,-1.247,0.212,-503.290,111.848
surprise_norm,32.5013,41.784,0.778,0.437,-49.394,114.397
positive_norm,22.2140,19.902,1.116,0.264,-16.794,61.222
disgust_norm,202.3760,186.705,1.084,0.278,-163.559,568.311
joy_norm,-6.7827,47.740,-0.142,0.887,-100.351,86.786


In [46]:
log_reg_sm.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.087
Dependent Variable:,mindfulness,AIC:,109.7704
Date:,2023-03-26 18:00,BIC:,132.3972
No. Observations:,71,Log-Likelihood:,-44.885
Df Model:,9,LL-Null:,-49.15
Df Residuals:,61,LLR p-value:,0.48176
Converged:,1.0000,Scale:,1.0
No. Iterations:,7.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
trust_norm,-59.0415,36.6483,-1.6110,0.1072,-130.8708,12.7878
fear_norm,66.6047,77.9328,0.8546,0.3927,-86.1409,219.3502
negative_norm,95.7275,82.1134,1.1658,0.2437,-65.2118,256.6669
sadness_norm,-106.6012,90.6827,-1.1755,0.2398,-284.3361,71.1337
anger_norm,-195.7209,156.9259,-1.2472,0.2123,-503.2901,111.8483
surprise_norm,32.5013,41.7842,0.7778,0.4367,-49.3943,114.3969
positive_norm,22.2140,19.9022,1.1162,0.2644,-16.7936,61.2216
disgust_norm,202.3760,186.7048,1.0839,0.2784,-163.5586,568.3106
joy_norm,-6.7827,47.7399,-0.1421,0.8870,-100.3511,86.7857
