## Conversation Gone Awry

In [4]:
import os

import numpy as np
import pandas as pd
from scipy import stats

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut
from sklearn.feature_selection import f_classif, SelectPercentile

from collections import defaultdict
from functools import partial
from multiprocessing import Pool

from convokit import download
from convokit.prompt_types import PromptTypeWrapper
from convokit import PolitenessStrategies
from convokit import Corpus, Speaker, Utterance
from convokit import Classifier

import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
import warnings
warnings.filterwarnings('ignore')

#### Load dataset

In [6]:
# OPTION 1: DOWNLOAD CORPUS 
# UNCOMMENT THESE LINES TO DOWNLOAD CORPUS
# DATA_DIR = ''
#AWRY_ROOT_DIR = download('conversations-gone-awry-corpus', data_dir=DATA_DIR)

# OPTION 2: READ PREVIOUSLY-DOWNLOADED CORPUS FROM DISK
# UNCOMMENT THIS LINE AND REPLACE WITH THE DIRECTORY WHERE THE TENNIS-CORPUS IS LOCATED
DATA_DIR = 'E:\EPFL\Courses\Ada\Project P4\conversations-gone-awry-corpus'
AWRY_ROOT_DIR = download('conversations-gone-awry-corpus', data_dir=DATA_DIR)

awry_corpus = Corpus(AWRY_ROOT_DIR)
awry_corpus.load_info('utterance',['parsed'])

awry_corpus = awry_corpus.filter_conversations_by(lambda convo: convo.meta['annotation_year'] == '2018')

Dataset already exists at E:\EPFL\Courses\Ada\Project P4\conversations-gone-awry-corpus\conversations-gone-awry-corpus


In [7]:
awry_corpus.print_summary_stats()

Number of Speakers: 2010
Number of Utterances: 6363
Number of Conversations: 1168


In [8]:
utterances = awry_corpus.get_utterances_dataframe()

In [9]:
utterances.sample(10)

Unnamed: 0_level_0,timestamp,text,speaker,reply_to,conversation_id,meta.is_section_header,meta.comment_has_personal_attack,meta.toxicity,meta.parsed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
320261990.20239.20239,1255720000.0,"Réginald, first of all, mea culpa. I hadn't n...",Rivertorch,320251126.19827.19814,320251126.19814.19814,False,False,0.041406,"[{'rt': 1, 'toks': [{'tok': ' ', 'tag': '', 'd..."
202186098.20704.20684,1206920000.0,"and your text is klunky, sorry.",72.0.180.2,202186098.20684.20684,202186098.20684.20684,False,False,0.20241,"[{'rt': 3, 'toks': [{'tok': 'and', 'tag': 'CC'..."
143878980.125.125,1184120000.0,"Friday did as much as he can, due to the curre...",Zscout370,143878851.103.103,143877368.8.8,False,False,0.0239587,"[{'rt': 1, 'toks': [{'tok': 'Friday', 'tag': '..."
303464344.31333.31333,1248230000.0,"I see no reason how ""T.N.A"" is widely known ei...",Truco,303455587.31141.31141,303378306.30420.30420,False,False,0.0169346,"[{'rt': 1, 'toks': [{'tok': 'I', 'tag': 'PRP',..."
397617990.24140.24120,1290140000.0,"Hi, I noticed is edit warring on the wiki_lin...",Aeonx,397617990.24120.24120,397617990.24120.24120,False,False,0.388942,"[{'rt': 3, 'toks': [{'tok': 'Hi', 'tag': 'UH',..."
62531862.7260.7260,1152270000.0,"The ""Old"" onscreen character of Shawn Michaels...",Da Main Event,46687240.5546.5546,46687240.5546.5546,False,False,0.233057,"[{'rt': 11, 'toks': [{'tok': 'The', 'tag': 'DT..."
162520051.1295.1295,1191610000.0,"To re-iterate what ''Metros'' said, the sectio...",Kralizec!,162513218.1202.1202,159674239.677.485,False,False,0.110268,"[{'rt': 16, 'toks': [{'tok': 'To', 'tag': 'TO'..."
193649667.12093.12093,1203850000.0,\n== Edit warring on [WIKI_LINK: 9/11 conspira...,Ice Cold Beer,,193649667.12093.12093,True,False,0.0,"[{'rt': 1, 'toks': [{'tok': '\n', 'tag': '', '..."
228520950.6187.6187,1217300000.0,I can't work out whether you are an idiot or w...,Matilda,228468167.5015.5005,228468167.5005.5005,False,True,0.826419,"[{'rt': 3, 'toks': [{'tok': 'I', 'tag': 'PRP',..."
80315439.464.464,1160350000.0,"I agree, great work. I, for some reason, wasn'...",LiquidGhoul,80287913.451.432,80287913.432.432,False,False,0.00942597,"[{'rt': 1, 'toks': [{'tok': 'I', 'tag': 'PRP',..."


In [10]:
"""
cnt = 0
for utt in awry_corpus.iter_utterances():
    cnt += 1
    print(ps.transform_utterance(utt))
    if cnt > 2:
        break
"""

'\ncnt = 0\nfor utt in awry_corpus.iter_utterances():\n    cnt += 1\n    print(ps.transform_utterance(utt))\n    if cnt > 2:\n        break\n'

#### Predict Politeness: Use wiki_corpus for training, then apply the classifier on awry_corpus to get the prediction score

In [11]:
# Downloading the wikipedia portion of annotated data
wiki_corpus = Corpus(download("wikipedia-politeness-corpus"))

Dataset already exists at C:\Users\Siran\.convokit\downloads\wikipedia-politeness-corpus


In [12]:
wiki_corpus.print_summary_stats()

Number of Speakers: 1
Number of Utterances: 4353
Number of Conversations: 4353


In [13]:
ps = PolitenessStrategies()
awry_corpus = ps.transform(awry_corpus, markers=True)
wiki_corpus = ps.transform(wiki_corpus, markers=True)

In [14]:
test_ids = awry_corpus.get_utterance_ids()

In [15]:
train_corpus = Corpus(utterances=[utt for utt in wiki_corpus.iter_utterances()])
test_corpus = Corpus(utterances=[utt for utt in awry_corpus.iter_utterances()])
print("train size = {}, test size = {}".format(len(train_corpus.get_utterance_ids()),
                                               len(test_corpus.get_utterance_ids())))

train size = 4353, test size = 6363


In [16]:
clf = Classifier(obj_type="utterance", 
                        pred_feats=["politeness_strategies"], 
                        labeller=lambda utt: utt.meta['Binary'] == 1)
clf.fit(train_corpus)

Initialized default classification model (standard scaled logistic regression).


<convokit.classifier.classifier.Classifier at 0x26414761a60>

In [17]:
test_pred = clf.transform(test_corpus)

In [18]:
pred_df = clf.summarize(test_pred)
pred_df.head(20)

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
132662410.11977.11977,0,0.009742
410838569.61940.61940,0,0.019028
236161634.27143.27131,0,0.020131
444898053.3389.3389,0,0.020926
418695946.3459.3449,0,0.020926
36146600.3862.3862,0,0.021219
606915191.4713.4713,0,0.022217
408191172.17962.17962,0,0.022496
344941369.12789.12789,0,0.023541
123265296.5701.5701,0,0.024656


In [19]:
utterances.loc['473786014.2964.2956'].text

' I dont want to seem rude or anything, but is there a reason external_link has gone unanswered and several have been resolved below? This editor and his suspected sockpuppets are continuing to commit rogue edits and action needs to be taken. Its getting frustrating to work on the page in question. Thanks,'

In [20]:
utt_with_attack = utterances[utterances['meta.comment_has_personal_attack']==True]
utt_attack_ids = utt_with_attack.index
print(len(utt_attack_ids))

584


In [21]:
test_utters_df = test_corpus.get_utterances_dataframe()
test_utters_df.sample(3)

Unnamed: 0_level_0,timestamp,text,speaker,reply_to,conversation_id,meta.is_section_header,meta.comment_has_personal_attack,meta.toxicity,meta.parsed,meta.politeness_strategies,meta.politeness_markers,meta.prediction,meta.pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
291906834.15472.15472,1243140000.0,==Vandalism==\n,Sherzo,,291906834.15472.15472,True,False,0.0,"[{'rt': 2, 'toks': [{'tok': '=', 'tag': 'SYM',...","{'feature_politeness_==Please==': 0, 'feature_...","{'politeness_markers_==Please==': [], 'politen...",0,0.0833594
324569832.859.859,1257660000.0,== [WIKI_LINK: List of Morphoses productions|M...,Robertgreer,,324569832.859.859,True,False,0.0,"[{'rt': 3, 'toks': [{'tok': '=', 'tag': 'NFP',...","{'feature_politeness_==Please==': 0, 'feature_...","{'politeness_markers_==Please==': [], 'politen...",0,0.0833594
75698786.4387.4387,1158240000.0,I was surprised that so many people were willi...,Steel,75697745.4315.4315,75692435.4172.4172,False,False,0.128427,"[{'rt': 1, 'toks': [{'tok': 'i', 'tag': 'PRP',...","{'feature_politeness_==Please==': 0, 'feature_...","{'politeness_markers_==Please==': [], 'politen...",0,0.0948002


In [22]:
avg_polite_ness = 0
for u_id in utt_attack_ids:
    pred_score = test_utters_df.loc[u_id]['meta.pred_score']
    avg_polite_ness += pred_score

print(avg_polite_ness / len(utt_attack_ids))

0.18280225979968742


In [23]:
utt_without_attack = utterances[utterances['meta.comment_has_personal_attack']==False]
utt_non_attack_ids = utt_without_attack.index
print(len(utt_non_attack_ids))

5779


In [24]:
avg_polite_ness = 0
for u_id in utt_non_attack_ids:
    pred_score = test_utters_df.loc[u_id]['meta.pred_score']
    avg_polite_ness += pred_score

print(avg_polite_ness / len(utt_non_attack_ids))

0.222397808980296


In [25]:
pred_1 = pred_df[pred_df['prediction']==1]
utt_pos_ids = pred_1.index
print(len(utt_pos_ids))

755


In [26]:
cnt = 0
for u_id in utt_pos_ids:
    if test_utters_df.loc[u_id]['meta.comment_has_personal_attack'] == True:
        cnt += 1
        print(test_utters_df.loc[u_id]['meta.pred_score'])
        print(test_utters_df.loc[u_id].text)
print("{} utterence has personal attack but was predicted to be polite.".format(cnt))

0.5194615104243733
What a pathetic joke. False claims of harassment. You pinged me somewhere in the vicinity of six times after I asked you to stop. My first request reads- 'Don't ping me every time. '''This page is on my watch list.''' Thank you.' So besides turning down a polite request, you '''now lie''' to another User. As I said, a pathetic joke., is the complaint department really on 
0.5528330006161977
Lord, I apologize!!!! Even though I'm an atheist and I have a Buddhist girlfriend... Uncle Tech OK, what I was doing was NOT vandalism! Unblock me!! I have a history of good edits. Unblock me!!!! Unblock me, or at least hear me out Seriously, this site isn't the same without me... unblock me!!!!!!!!!!!! Is anybody listening or caring? unblock me unblock me seriously... UNBLOCK ME!!!!!!!!!!!!!! wow...... unblock me unblock me unblock me FUCKING UNBLOCK ME!!!!!!!!! 
0.5565269776610304
Sorry, I have some decorating to do, so I can't start on the picture right now. If you believe I ha

In [27]:
"""
utterance_ids = awry_corpus.get_utterance_ids()
rows = []

cnt = 0
for uid in utterance_ids:
    cnt += 1
    print(awry_corpus.get_utterance(uid).meta)
    if cnt > 2:
        break
    #rows.append(awry_corpus.get_utterance(uid).meta["Normalized Score"])
#politeness_strategies = pd.DataFrame(rows, index=utterance_ids)
"""

'\nutterance_ids = awry_corpus.get_utterance_ids()\nrows = []\n\ncnt = 0\nfor uid in utterance_ids:\n    cnt += 1\n    print(awry_corpus.get_utterance(uid).meta)\n    if cnt > 2:\n        break\n    #rows.append(awry_corpus.get_utterance(uid).meta["Normalized Score"])\n#politeness_strategies = pd.DataFrame(rows, index=utterance_ids)\n'

#### Sentiment

In [28]:
utterances.loc['176824876.14917.14917'].text

' Read the citations to the SurveyUSA polls.  They give the exact language, which only specifies those five candidates.  If Ron Paul\'s name was mentioned, he might poll more than they are showing, because people are more inclined to choose one of the available options.\n::::*How does a "minor" candidate become a first-tier candidate if some of the polls don\'t even specify him as an option?  We aren\'t talking about 1-2% any more.  Paul has clearly risen above the "margin of error" argument at this point.  We should question the results of polls that do not even list him as an option, or allow him to be voted for (as the recording I\'ve linked to shows).  At the very least, we should put an asterisk beside polls which do not allow all primary candidates to be voted for.  "Other" is clearly unreasonable as an option. -'

#### Read in sentiments for each utterance

In [29]:
sents = pd.read_csv('awry_sentiment.csv', index_col='u_id')
sents.head()

Unnamed: 0_level_0,numSentence,numWords,totSentiment,avgSentiment
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
146743638.12652.12652,1,4,2.0,2.0
146743638.12667.12652,3,70,4.0,1.333333
146842219.12874.12874,4,86,8.0,2.0
143890867.11926.11926,1,7,2.0,2.0
143890867.11944.11926,4,19,6.0,1.5


In [30]:
for conv in awry_corpus.iter_conversations():
    spk_list = conv.get_chronological_speaker_list()
    spks = []
    for item in spk_list:
        spk = item.id
        spks.append(spk)
#    print(set(spks))


#### Prediction

In [91]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import seaborn as sn

In [128]:
data_pol = pd.read_csv('pred_politeness.csv')
data_pol.columns=['pre_polite_1','pre_polite_2','attack_p']
data_pol['attack_p']=data_pol['attack_p'].astype(int)
data_pol = data_pol.rename_axis("Num_con")
data_pol

Unnamed: 0_level_0,pre_polite_1,pre_polite_2,attack_p
Num_con,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.295689,0.737203,1
1,0.234452,0.106919,0
2,0.962891,0.083359,1
3,0.917265,0.189851,0
4,0.126228,0.132515,1
...,...,...,...
1163,0.125389,0.148742,1
1164,0.350113,0.188946,1
1165,0.083359,0.067104,0
1166,0.278233,0.081918,1


In [129]:
data_sen = pd.read_csv('pred_sentiment.csv')
data_sen.columns=['score_sen_1','score_sen_2','attack_s']
data_sen['attack_s']=data_sen['attack_s'].astype(int)
data_sen = data_sen.rename_axis("Num_con")
data_sen

Unnamed: 0_level_0,score_sen_1,score_sen_2,attack_s
Num_con,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.125000,0.166667,1
1,1.000000,0.500000,0
2,0.083333,1.000000,1
3,0.200000,0.200000,0
4,1.000000,0.125000,1
...,...,...,...
1163,0.200000,0.055556,1
1164,0.250000,0.166667,1
1165,1.000000,0.333333,0
1166,0.200000,0.250000,1


In [130]:
data_tal = pd.read_csv('pred_talkativeness.csv')
data_tal.columns=['numword_1','numword_2','attack_t']
data_tal['attack_t']=data_tal['attack_t'].astype(int)
data_tal = data_tal.rename_axis("Num_con")
data_tal

Unnamed: 0_level_0,numword_1,numword_2,attack_t
Num_con,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,122.0,137.0,1
1,17.0,36.0,0
2,122.0,4.0,1
3,44.0,129.0,0
4,4.0,163.0,1
...,...,...,...
1163,110.0,169.0,1
1164,91.0,144.0,1
1165,6.0,37.0,0
1166,69.0,48.0,1


In [131]:
data_uni = pd.concat([data_pol,data_sen,data_tal],axis=1)
data_uni = data_uni.drop(['attack_p','attack_s'],axis=1)
data_uni = data_uni.rename(columns={'attack_t':'attack'})
data_uni

Unnamed: 0_level_0,pre_polite_1,pre_polite_2,score_sen_1,score_sen_2,numword_1,numword_2,attack
Num_con,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.295689,0.737203,0.125000,0.166667,122.0,137.0,1
1,0.234452,0.106919,1.000000,0.500000,17.0,36.0,0
2,0.962891,0.083359,0.083333,1.000000,122.0,4.0,1
3,0.917265,0.189851,0.200000,0.200000,44.0,129.0,0
4,0.126228,0.132515,1.000000,0.125000,4.0,163.0,1
...,...,...,...,...,...,...,...
1163,0.125389,0.148742,0.200000,0.055556,110.0,169.0,1
1164,0.350113,0.188946,0.250000,0.166667,91.0,144.0,1
1165,0.083359,0.067104,1.000000,0.333333,6.0,37.0,0
1166,0.278233,0.081918,0.200000,0.250000,69.0,48.0,1


##### 1. Politeness

In [133]:
from statsmodels.formula.api import logit
data_log_p = logit(formula='attack_p ~ pre_polite_1+pre_polite_2', data=data_pol).fit()
print(data_log_p.summary())

Optimization terminated successfully.
         Current function value: 0.691273
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:               attack_p   No. Observations:                 1168
Model:                          Logit   Df Residuals:                     1165
Method:                           MLE   Df Model:                            2
Date:                Sat, 12 Dec 2020   Pseudo R-squ.:                0.002704
Time:                        16:26:14   Log-Likelihood:                -807.41
converged:                       True   LL-Null:                       -809.60
Covariance Type:            nonrobust   LLR p-value:                    0.1120
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.1640      0.099      1.663      0.096      -0.029       0.357
pre_polite_1    -0.3481

In [144]:
X_p = data_pol[['pre_polite_1','pre_polite_2']]
y_p = np.array(data_pol['attack_p'])
X_p.shape, y_p.shape

((1168, 2), (1168,))

In [145]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict

clf = LogisticRegression(solver='liblinear', random_state=0)

In [146]:
scores_p = cross_val_score(clf, X_p, y_p, cv=5)
scores_p

array([0.57692308, 0.4957265 , 0.50854701, 0.53648069, 0.54506438])

In [203]:
def bootstrap_CI(data, nbr_draws):
    means = np.zeros(nbr_draws)
    data = np.array(data)

    for n in range(nbr_draws):
        indices = np.random.randint(0, len(data), len(data))
        data_tmp = data[indices] 
        means[n] = np.nanmean(data_tmp)
        
    dis = np.nanpercentile(means, 97.5)-np.nanpercentile(means, 2.5)
    CI = dis.astype(float)

    return CI

In [231]:
print("Accuracy of politeness: %0.2f%% (+/- %0.3f)" % (np.mean(scores_p)*100, bootstrap_CI(scores_p, 1000)))

Accuracy of politeness: 53.25% (+/- 0.046)


##### 2. Sentiment

In [210]:
data_log_s = logit(formula='attack_s ~ score_sen_1+score_sen_2', data=data_sen).fit()
print(data_log_s.summary())

Optimization terminated successfully.
         Current function value: 0.691150
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:               attack_s   No. Observations:                 1168
Model:                          Logit   Df Residuals:                     1165
Method:                           MLE   Df Model:                            2
Date:                Sat, 12 Dec 2020   Pseudo R-squ.:                0.002882
Time:                        18:38:36   Log-Likelihood:                -807.26
converged:                       True   LL-Null:                       -809.60
Covariance Type:            nonrobust   LLR p-value:                   0.09700
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.1045      0.133      0.784      0.433      -0.157       0.366
score_sen_1     0.2135    

In [211]:
X_s = data_sen[['score_sen_1','score_sen_2']]
y_s = np.array(data_sen['attack_s'])
X_s.shape, y_s.shape

((1168, 2), (1168,))

In [212]:
scores_s = cross_val_score(clf, X_s, y_s, cv=5)
scores_s

array([0.52564103, 0.48290598, 0.48717949, 0.53218884, 0.55364807])

In [230]:
print("Accuracy of sensitiveness: %0.2f%% (+/- %0.3f)" % (np.mean(scores_s)*100, bootstrap_CI(scores_s, 1000)))

Accuracy of sensitiveness: 51.63% (+/- 0.047)


##### 3. Talkativeness

In [155]:
data_log_t = logit(formula='attack_t ~ numword_1+numword_2', data=data_tal).fit()
print(data_log_s.summary())

Optimization terminated successfully.
         Current function value: 0.692528
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:               attack_s   No. Observations:                 1168
Model:                          Logit   Df Residuals:                     1165
Method:                           MLE   Df Model:                            2
Date:                Sat, 12 Dec 2020   Pseudo R-squ.:                0.002882
Time:                        16:44:53   Log-Likelihood:                -807.26
converged:                       True   LL-Null:                       -809.60
Covariance Type:            nonrobust   LLR p-value:                   0.09700
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.1045      0.133      0.784      0.433      -0.157       0.366
score_sen_1     0.2135    

In [172]:
X_t = data_tal[['numword_1','numword_2']]
y_t = np.array(data_tal['attack_t'])
X_t.shape, y_t.shape

((1168, 2), (1168,))

In [173]:
scores_t = cross_val_score(clf, X_t, y_t, cv=5)
scores_t

array([0.52991453, 0.46153846, 0.51282051, 0.5193133 , 0.50643777])

In [237]:
print("Accuracy of talkativeness: %0.2f%% (+/- %0.3f)" % (np.mean(scores_t)*100, bootstrap_CI(scores_t, 1000)))

Accuracy of talkativeness: 50.60% (+/- 0.039)


##### 4. Unitivity

In [175]:
data_log_u = logit(formula='attack ~ pre_polite_1+pre_polite_2+score_sen_1+score_sen_2+numword_1+numword_2', data=data_uni).fit()
print(data_log_u.summary())

Optimization terminated successfully.
         Current function value: 0.687183
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                 attack   No. Observations:                 1168
Model:                          Logit   Df Residuals:                     1161
Method:                           MLE   Df Model:                            6
Date:                Sat, 12 Dec 2020   Pseudo R-squ.:                0.008604
Time:                        16:53:33   Log-Likelihood:                -802.63
converged:                       True   LL-Null:                       -809.60
Covariance Type:            nonrobust   LLR p-value:                   0.03041
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.7499      0.267      2.805      0.005       0.226       1.274
pre_polite_1    -0.3254

In [176]:
X_u = data_uni[['pre_polite_1','pre_polite_2','score_sen_1','score_sen_2','numword_1','numword_2']]
y_u = np.array(data_uni['attack'])
X_u.shape, y_u.shape

((1168, 6), (1168,))

In [178]:
scores_u = cross_val_score(clf, X_u, y_u, cv=5)
scores_u

array([0.53846154, 0.50854701, 0.4957265 , 0.55364807, 0.54077253])

In [233]:
print("Accuracy of politeness: %0.2f%% (+/- %0.3f)" % (np.mean(scores_p)*100, bootstrap_CI(scores_p, 1000)))
print("Accuracy of sensitiveness: %0.2f%% (+/- %0.3f)" % (np.mean(scores_s)*100, bootstrap_CI(scores_s, 1000)))
print("Accuracy of talkativeness: %0.2f%% (+/- %0.3f)" % (np.mean(scores_t)*100, bootstrap_CI(scores_t, 1000)))
print("Accuracy across all selected features: %0.2f%% (+/- %0.3f)" % (np.mean(scores_u)*100, bootstrap_CI(scores_u, 1000)))

Accuracy of politeness: 53.25% (+/- 0.049)
Accuracy of sensitiveness: 51.63% (+/- 0.046)
Accuracy of talkativeness: 50.60% (+/- 0.040)
Accuracy across all selected features: 52.74% (+/- 0.039)
