## Conversations Gone Awry

In [1]:
import os

import numpy as np
import pandas as pd
from scipy import stats

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut
from sklearn.feature_selection import f_classif, SelectPercentile

from collections import defaultdict
from functools import partial
from multiprocessing import Pool

from convokit import download
from convokit.prompt_types import PromptTypeWrapper
from convokit import PolitenessStrategies
from convokit import Corpus, Speaker, Utterance
from convokit import Classifier

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings('ignore')

#### Load dataset

In [4]:
# OPTION 1: DOWNLOAD CORPUS
# UNCOMMENT THESE LINES TO DOWNLOAD CORPUS
# DATA_DIR = ''
# AWRY_ROOT_DIR = download('conversations-gone-awry-corpus', data_dir=DATA_DIR)

# OPTION 2: READ PREVIOUSLY-DOWNLOADED CORPUS FROM DISK
# Point AWRY_ROOT_DIR at the directory that holds the conversations-gone-awry corpus,
# either through the AWRY_ROOT_DIR environment variable or by editing the default below.
# The default is a raw string so the Windows backslashes are never read as escape sequences.
AWRY_ROOT_DIR = os.environ.get(
    'AWRY_ROOT_DIR',
    r'E:\EPFL_Courses\ADA\milestone-p3_data\conversations-gone-awry-corpus',
)

awry_corpus = Corpus(AWRY_ROOT_DIR)
# Load the 'parsed' utterance field in addition to what Corpus() read by default.
awry_corpus.load_info('utterance', ['parsed'])

# Keep only conversations whose 'annotation_year' metadata is the string '2018'.
awry_corpus = awry_corpus.filter_conversations_by(lambda convo: convo.meta['annotation_year'] == '2018')

In [5]:
# Print the speaker / utterance / conversation counts of awry_corpus.
awry_corpus.print_summary_stats()

Number of Speakers: 2010
Number of Utterances: 6363
Number of Conversations: 1168


In [9]:
# Display the speakers of awry_corpus as a DataFrame (one row per speaker id).
awry_corpus.get_speakers_dataframe()

Unnamed: 0_level_0,vectors
id,Unnamed: 1_level_1
Sirex98,[]
2005,[]
WilyD,[]
H20h0us391,[]
Billzilla,[]
...,...
Seregain,[]
Gabr-el,[]
Raspor,[]
FeloniousMonk,[]


In [15]:
# For each conversation, print the set of distinct speaker ids taking part in it.
for conv in awry_corpus.iter_conversations():
    speaker_ids = {speaker.id for speaker in conv.get_chronological_speaker_list()}
    print(speaker_ids)

{'2005', 'Sirex98'}
{'2005', 'WilyD'}
{'H20h0us391', 'Billzilla'}
{'The Evil Spartan', 'AuburnPilot', 'Billzilla'}
{'Morton devonshire', 'Captainktainer', 'Commodore Sloat'}
{'Commodore Sloat', 'RonCram'}
{'Rueben lys', 'Jaiiaf', 'Idleguy', 'Kazimostak'}
{'Idleguy', '129.100.224.185'}
{'Can.u.spel', 'Alex32165', 'Greswik'}
{'Nautilogos', 'Dr. Blofeld', 'Greswik'}
{'Ferociouslettuce', 'Extransit'}
{'Orangemike', 'Wangtopgun'}
{'134.173.61.53', 'Skomorokh', 'TallNapoleon', 'SteveWolfer'}
{'Stevewunder', 'SteveWolfer', 'Snowded', 'ChildofMidnight'}
{'Filll', 'Hrafn', 'NCdave'}
{'Hrafn', 'BigBang616'}
{'Achilton', 'Hrafn', 'WLU'}
{'Hrafn', 'Mike0001'}
{'Milowent', 'SRMach5B'}
{'Milowent', 'JamesBWatson', 'Otterathome'}
{'Daicaregos', 'Malleus Fatuorum'}
{'Eric Corbett', 'Dougatwiki'}
{'Jasonbres', 'Acps110'}
{'AndyTheCop', 'Jasonbres'}
{'ConfuciusOrnis', 'Nv8200pa', 'Orangemarlin'}
{'ConfuciusOrnis', 'ZayZayEM'}
{'ViriiK', 'Alan.ca', 'Ryulong', 'Colin Keigher'}
{'Ryulong', 'Zaphnathpaaneah

In [8]:
# Display the conversations of awry_corpus as a DataFrame; conversation metadata appears as meta.* columns.
awry_corpus.get_conversations_dataframe()

Unnamed: 0_level_0,vectors,meta.page_title,meta.page_id,meta.pair_id,meta.conversation_has_personal_attack,meta.verified,meta.pair_verified,meta.annotation_year,meta.split
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
146743638.12652.12652,[],User talk:2005,1003212,143890867.11926.11926,False,True,True,2018,train
143890867.11926.11926,[],User talk:2005,1003212,146743638.12652.12652,True,True,True,2018,train
127296808.516.516,[],User talk:Billzilla,10051321,144643838.1236.1236,False,True,True,2018,train
144643838.1236.1236,[],User talk:Billzilla,10051321,127296808.516.516,True,True,True,2018,train
66813686.23567.23567,[],Talk:Niger uranium forgeries,1005730,68000691.25417.25417,False,True,True,2018,train
...,...,...,...,...,...,...,...,...,...
278765498.27104.27104,[],User talk:Grundle2600,10902498,275409490.16844.16844,True,True,True,2018,train
345472052.2707.2707,[],User talk:Aunt Entropy,15801810,236755381.13326.13326,False,True,True,2018,test
236755381.13326.13326,[],User talk:Aunt Entropy,15801810,345472052.2707.2707,True,True,True,2018,test
99357960.32952.32952,[],User talk:Raspor,8071956,98071983.10061.9817,False,True,True,2018,test


In [6]:
# All utterances of awry_corpus as a DataFrame indexed by utterance id;
# the sentiment cells further down read utterance text from this frame.
utterances = awry_corpus.get_utterances_dataframe()

In [7]:
# Peek at 10 random utterances; no random_state is passed, so the rows differ between runs.
utterances.sample(10)

Unnamed: 0_level_0,timestamp,text,speaker,reply_to,conversation_id,meta.is_section_header,meta.comment_has_personal_attack,meta.toxicity,meta.parsed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
116610974.14189.14189,1174420000.0,"Hi Itaq, Do not vandalize content on Wikipedia...",Giordaano,116605439.14082.14054,116605439.14054.14054,False,False,0.119089,"[{'rt': 0, 'toks': [{'tok': 'Hi', 'tag': 'UH',..."
399183585.5117.5117,1290880000.0,"**The article is on my watchlist, which is why...",Baseball Bugs,399183053.4932.4932,399172725.4008.4008,False,False,0.388942,"[{'rt': 0, 'toks': [{'tok': '*', 'tag': 'NFP',..."
492928306.46993.46993,1337200000.0,Bullshit. Dahliarose has restored at least on...,Charlesdrakew,492927701.46914.46914,492922820.46400.46400,False,True,0.820106,"[{'rt': 1, 'toks': [{'tok': ' ', 'tag': '', 'd..."
44899832.36398.36398,1143000000.0,You have just admited that you did this to ma...,Zeq,44873805.36322.36322,44828642.35766.35766,False,False,0.048639,"[{'rt': 4, 'toks': [{'tok': ' ', 'tag': '', 'd..."
24164771.13558.13558,1127840000.0,This article is nothing but a bunch of 's opin...,Howrealisreal,23619937.12859.12924,23619937.12859.12924,False,True,0.630584,"[{'rt': 2, 'toks': [{'tok': 'This', 'tag': 'DT..."
521278464.7241.7241,1351990000.0,The don't-intersperse-on-talk-pages policy is...,Scheinwerfermann,521171825.6632.6632,521109525.5453.5453,False,False,0.255673,"[{'rt': 12, 'toks': [{'tok': ' ', 'tag': '', '..."
270328244.28512.28512,1234480000.0,In this case it's not really about facts.. it'...,Spanneraol,270311808.28417.28417,270177839.28034.28034,False,False,0.0339964,"[{'rt': 13, 'toks': [{'tok': 'In', 'tag': 'IN'..."
321002856.2975.2975,1256050000.0,The whole thing needs to be removed per wiki_l...,Benjiboi,320926790.13705.13705,320926675.13624.13460,False,False,0.30627,"[{'rt': 3, 'toks': [{'tok': 'The', 'tag': 'DT'..."
286787641.8951.8951,1240980000.0,I think this could be solved by saying that Ca...,Awickert,286761180.8636.8636,286761180.8636.8636,False,False,0.11616,"[{'rt': 1, 'toks': [{'tok': 'I', 'tag': 'PRP',..."
658888428.902.902,1429820000.0,"There is something that heavily concerns me, ...",Dandtiks69,658888258.158.158,658888258.158.158,False,False,0.111868,"[{'rt': 2, 'toks': [{'tok': ' ', 'tag': '', 'd..."


In [None]:
"""
cnt = 0
for utt in awry_corpus.iter_utterances():
    cnt += 1
    print(ps.transform_utterance(utt))
    if cnt > 2:
        break
"""

#### Predict politeness: train a classifier on wiki_corpus, then apply it to awry_corpus to get a prediction score for each utterance

In [7]:
# Download the Wikipedia portion of the annotated politeness data
# ("wikipedia-politeness-corpus") and load it as a Corpus.
wiki_corpus = Corpus(download("wikipedia-politeness-corpus"))

Dataset already exists at C:\Users\Think\.convokit\downloads\wikipedia-politeness-corpus


In [8]:
# Print the speaker / utterance / conversation counts of wiki_corpus.
wiki_corpus.print_summary_stats()

Number of Speakers: 1
Number of Utterances: 4353
Number of Conversations: 4353


In [9]:
# Annotate both corpora with the PolitenessStrategies transformer.
# NOTE(review): markers=True presumably also stores the matched markers per utterance
# (a meta.politeness_markers column shows up in later output) — confirm against the convokit docs.
ps = PolitenessStrategies()
awry_corpus = ps.transform(awry_corpus, markers=True)
wiki_corpus = ps.transform(wiki_corpus, markers=True)

In [10]:
# Ids of all utterances in awry_corpus.
# NOTE(review): test_ids is not referenced again in the visible part of the notebook.
test_ids = awry_corpus.get_utterance_ids()

In [11]:
# Rebuild standalone corpora from the annotated utterances:
# wiki_corpus supplies the training set, awry_corpus the set to be scored.
train_utterances = list(wiki_corpus.iter_utterances())
train_corpus = Corpus(utterances=train_utterances)

test_utterances = list(awry_corpus.iter_utterances())
test_corpus = Corpus(utterances=test_utterances)

n_train = len(train_corpus.get_utterance_ids())
n_test = len(test_corpus.get_utterance_ids())
print("train size = {}, test size = {}".format(n_train, n_test))

train size = 4353, test size = 6363


In [12]:
def _has_positive_binary_label(utt):
    """Return True when the utterance's 'Binary' metadata equals 1."""
    return utt.meta['Binary'] == 1


# Utterance-level classifier over the politeness-strategy features,
# trained on train_corpus with the 'Binary' metadata as the target.
clf = Classifier(
    obj_type="utterance",
    pred_feats=["politeness_strategies"],
    labeller=_has_positive_binary_label,
)
clf.fit(train_corpus)

Initialized default classification model (standard scaled logistic regression).


<convokit.classifier.classifier.Classifier at 0x248165ad970>

In [13]:
# Apply the fitted classifier to test_corpus.
# NOTE(review): later output shows meta.prediction / meta.pred_score on test_corpus utterances,
# so transform presumably annotates the corpus it is given — confirm.
test_pred = clf.transform(test_corpus)

In [14]:
# Tabulate the predictions per utterance id and show the first 20 rows.
pred_df = clf.summarize(test_pred)
pred_df.head(20)

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
132662410.11977.11977,0,0.009742
410838569.61940.61940,0,0.019028
236161634.27143.27131,0,0.020131
444898053.3389.3389,0,0.020926
418695946.3459.3449,0,0.020926
36146600.3862.3862,0,0.021219
606915191.4713.4713,0,0.022217
408191172.17962.17962,0,0.022496
344941369.12789.12789,0,0.023541
123265296.5701.5701,0,0.024656


In [20]:
# Utterances of test_corpus as a DataFrame (read by the score-averaging cells below); show 3 random rows.
test_utters_df = test_corpus.get_utterances_dataframe()
test_utters_df.sample(3)

Unnamed: 0_level_0,timestamp,text,speaker,reply_to,conversation_id,meta.is_section_header,meta.comment_has_personal_attack,meta.toxicity,meta.parsed,meta.politeness_strategies,meta.politeness_markers,meta.prediction,meta.pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
272953472.1207.1207,1235500000.0,== Could you have a look please ==\n,BigDunc,,272953472.1207.1207,True,False,0.0,"[{'rt': 9, 'toks': [{'tok': '=', 'tag': 'NFP',...","{'feature_politeness_==Please==': 1, 'feature_...","{'politeness_markers_==Please==': [[('please',...",1,0.503996
92960315.5680.5680,1165620000.0,\n== Do you delete this file? ==,Jouvenel,,92960315.5680.5680,True,False,0.0,"[{'rt': 5, 'toks': [{'tok': '\n', 'tag': '', '...","{'feature_politeness_==Please==': 0, 'feature_...","{'politeness_markers_==Please==': [], 'politen...",0,0.13428
80105638.13140.13140,1160260000.0,I specified in talk. This article was just cr...,Amoruso,80104056.13083.13083,80103228.13014.13014,False,False,0.0386518,"[{'rt': 2, 'toks': [{'tok': ' ', 'tag': '', 'd...","{'feature_politeness_==Please==': 0, 'feature_...","{'politeness_markers_==Please==': [], 'politen...",0,0.0804226


In [21]:
# Inspect the text of one specific utterance, looked up by its id.
test_utters_df.loc['473786014.2964.2956'].text

' I dont want to seem rude or anything, but is there a reason external_link has gone unanswered and several have been resolved below? This editor and his suspected sockpuppets are continuing to commit rogue edits and action needs to be taken. Its getting frustrating to work on the page in question. Thanks,'

In [22]:
# Utterances whose comment_has_personal_attack flag is True, and how many there are.
attack_mask = test_utters_df['meta.comment_has_personal_attack'] == True
utt_with_attack = test_utters_df.loc[attack_mask]
utt_attack_ids = utt_with_attack.index
print(len(utt_attack_ids))

584


In [23]:
# Mean classifier pred_score over the utterances flagged as personal attacks.
# One label-based selection replaces the per-id `.loc[u_id][col]` loop, and
# avg_polite_ness now actually holds the average rather than the running sum.
# skipna=False keeps the loop's behaviour of yielding nan if any score is missing;
# an empty selection prints nan instead of raising ZeroDivisionError.
attack_scores = test_utters_df.loc[utt_attack_ids, 'meta.pred_score'].astype(float)
avg_polite_ness = attack_scores.mean(skipna=False)

print(avg_polite_ness)

0.18280225979968698


In [24]:
# Utterances whose comment_has_personal_attack flag is False, and how many there are.
non_attack_mask = test_utters_df['meta.comment_has_personal_attack'] == False
utt_without_attack = test_utters_df.loc[non_attack_mask]
utt_non_attack_ids = utt_without_attack.index
print(len(utt_non_attack_ids))

5779


In [25]:
# Mean classifier pred_score over the utterances NOT flagged as personal attacks.
# One label-based selection replaces the per-id `.loc[u_id][col]` loop, and
# avg_polite_ness now actually holds the average rather than the running sum.
# skipna=False keeps the loop's behaviour of yielding nan if any score is missing;
# an empty selection prints nan instead of raising ZeroDivisionError.
non_attack_scores = test_utters_df.loc[utt_non_attack_ids, 'meta.pred_score'].astype(float)
avg_polite_ness = non_attack_scores.mean(skipna=False)

print(avg_polite_ness)

0.22239780898029585


In [26]:
# Rows of pred_df whose predicted class is 1, and how many there are.
is_predicted_positive = pred_df['prediction'] == 1
pred_1 = pred_df.loc[is_predicted_positive]
utt_pos_ids = pred_1.index
print(len(utt_pos_ids))

755


In [None]:
# Among utterances predicted as class 1, list those that are nevertheless flagged
# as personal attacks: print each one's pred_score and text, then the total count.
cnt = 0
for u_id in utt_pos_ids:
    row = test_utters_df.loc[u_id]  # fetch the row once instead of three separate lookups
    if not (row['meta.comment_has_personal_attack'] == True):
        continue
    cnt += 1
    print(row['meta.pred_score'])
    print(row.text)
print("{} utterence has personal attack but was predicted to be polite.".format(cnt))

In [None]:
"""
utterance_ids = awry_corpus.get_utterance_ids()
rows = []

cnt = 0
for uid in utterance_ids:
    cnt += 1
    print(awry_corpus.get_utterance(uid).meta)
    if cnt > 2:
        break
    #rows.append(awry_corpus.get_utterance(uid).meta["Normalized Score"])
#politeness_strategies = pd.DataFrame(rows, index=utterance_ids)
"""

#### Sentiment

In [84]:
# Client used by stanford_sentiment below. The URL targets localhost port 9000, so a
# CoreNLP server presumably has to be running there before the sentiment cells are executed.
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

# Function; Output = # sentences, # words, total sentimentValue, avg. sentimentValue, sentimentHist
def stanford_sentiment(text_str):
    """Annotate ``text_str`` with the sentiment annotator of the module-level ``nlp`` client.

    Parameters
    ----------
    text_str : str
        Raw text to annotate.

    Returns
    -------
    tuple
        ``(numSentence, numWords, totSentiment, avgSentiment, freq)``:

        * ``numSentence`` -- number of sentences in the reply,
        * ``numWords`` -- number of whitespace-separated tokens in ``text_str``,
        * ``totSentiment`` -- sum of the per-sentence ``sentimentValue``,
        * ``avgSentiment`` -- their mean (``nan`` when the reply has no sentences),
        * ``freq`` -- histogram counts of the values over the bin edges 0..6 (6 bins).

    Raises
    ------
    ValueError
        If the reply is not a parsed JSON object with a ``"sentences"`` key.
    """
    res = nlp.annotate(text_str,
                       properties={
                           'annotators': 'sentiment',
                           'outputFormat': 'json',
                           'timeout': 40000,
                       })

    # NOTE(review): pycorenlp appears to hand back the raw response text when the reply
    # is not valid JSON (e.g. on a server-side timeout); indexing that string with
    # "sentences" would fail with an opaque error, so reject it explicitly.
    if not isinstance(res, dict) or "sentences" not in res:
        raise ValueError(
            "CoreNLP did not return a JSON annotation: {!r}".format(str(res)[:200])
        )

    sentences = res["sentences"]
    numSentence = len(sentences)
    numWords = len(text_str.split())

    # One sentiment value per sentence, kept as floats like the original zeros() buffer.
    arraySentVal = np.array([int(s["sentimentValue"]) for s in sentences], dtype=float)

    # sum of sentiment values
    totSentiment = arraySentVal.sum()

    # avg. of sentiment values; guard the empty case so np.mean never sees an empty array
    avgSentiment = arraySentVal.mean() if numSentence else float("nan")

    # frequency of sentimentValue; edges 0..6 give 6 bins, the last one closed on both sides
    bins = [0, 1, 2, 3, 4, 5, 6]
    freq = np.histogram(arraySentVal, bins)[0]    # getting freq. only w/o bins

    return (numSentence, numWords, totSentiment, avgSentiment, freq)

In [11]:
# Utterance ids of awry_corpus; the sentiment-scoring loop iterates over these.
awry_ids = awry_corpus.get_utterance_ids()

In [20]:
# Score every utterance with stanford_sentiment and write the results to awry_sentiment.csv.
col_names = ['u_id', 'numSentence', 'numWords', 'totSentiment', 'avgSentiment']
sentiment_rows = []   # one [uid, ...] record per successfully scored utterance
failed_uids = []      # ids whose annotation raised, kept for a later retry/inspection

print("Total utterance: ", len(awry_ids))
for cnt, uid in enumerate(awry_ids, start=1):
    if cnt % 100 == 0:
        print(cnt)
    try:
        numSentence, numWords, totSentiment, avgSentiment, freq = stanford_sentiment(utterances.loc[uid].text)
    except Exception as err:
        # `except Exception` (not a bare except) so Ctrl-C can still stop this long loop,
        # and the reason for the failure is reported instead of being discarded.
        failed_uids.append(uid)
        print("error where uid =", uid, "->", repr(err))
        continue
    sentiment_rows.append([uid, numSentence, numWords, totSentiment, avgSentiment])

# Build the frame once from the collected records instead of growing it row by row.
df = pd.DataFrame(sentiment_rows, columns=col_names)

outputFile = 'awry_sentiment.csv'
df.to_csv(outputFile, encoding='utf-8', index=False)

Total utterance:  6363
error where uid = 146860774.13072.13072
error where uid = 127772860.903.903
100
200
error where uid = 370760170.48365.48343
300
error where uid = 203503939.64709.64709
error where uid = 203507415.65166.65166
error where uid = 203516212.65308.65308
error where uid = 316041793.7429.7429
400
error where uid = 188454964.16448.16448
500
600
700
800
error where uid = 128818551.16576.16567
900
1000
1100
1200
error where uid = 448205040.3440.3432
1300
1400
1500
1600
error where uid = 263284935.3024.3024
error where uid = 320164597.6282.6282
1700
error where uid = 240444038.11636.11624
error where uid = 240849934.12703.12703
error where uid = 276926669.1597.1597
1800
1900
2000
error where uid = 38034311.3048.3048
error where uid = 38043447.3278.3278
error where uid = 38163583.3365.3365
2100
2200
2300
2400
error where uid = 67570639.698.698
2500
error where uid = 380375467.1864.1864
error where uid = 380377424.2080.2080
2600
2700
2800
2900
error where uid = 691647753.5281.

In [85]:
# Spot-check stanford_sentiment on a single utterance, looked up by its id.
stanford_sentiment(utterances.loc['34591966.61268.61268'].text)

1 Negative
1 Negative
1 Negative
2 Neutral
2 Neutral
1 Negative
1 Negative
1 Negative
1.25


(8, 141, 10.0, 1.25, array([0, 6, 2, 0, 0, 0], dtype=int64))

In [86]:
# Show the raw text of utterance 34591966.61268.61268.
utterances.loc['34591966.61268.61268'].text

'The parts I deleted were an unsubstantiated rant about ethnicity in Iran in the human rights section. It was little more than propaganda with no attempt to point to secondary material and anyway the English was appalling. I decided to make references to reports by HRW, Amnesty and UNCHR (I don\'t think these are pan-Arab or anti-Persian organisations), but also added a "criticism" section countering human rights allegations. I believe I summarised many of the points made before. But if you would like to expand on this and perhaps give reference to other secondary material substantiating the arguments, then go ahead. You, Zereshk, have made a point about the importance of secondary material to counter \'s analysis of Khuzestan\'s history. I believe I have met the standards of proof you are demanding. But this does not seem to be good enough. '

#### Read in sentiments for each utterance

In [27]:
# Read the per-utterance sentiment table from awry_sentiment.csv, indexed by utterance id (u_id).
sents = pd.read_csv('awry_sentiment.csv', index_col='u_id')
sents.head()

Unnamed: 0_level_0,numSentence,numWords,totSentiment,avgSentiment
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
146743638.12652.12652,1,4,2.0,2.0
146743638.12667.12652,3,70,4.0,1.333333
146842219.12874.12874,4,86,8.0,2.0
143890867.11926.11926,1,7,2.0,2.0
143890867.11944.11926,4,19,6.0,1.5
