In [129]:
import csv
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# %matplotlib inline
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import RepeatedStratifiedKFold
# import sklearn
import re

### Read in files
#### Make sure current working directory is AbusiveLanguageAnalysis

In [130]:
# Load both tab-separated inputs in one pass: first the comment table,
# then the per-worker annotation table.
df_comments, df_annotations = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        './data/aggression_annotated_comments.tsv',
        './data/aggression_annotations.tsv',
    )
)

## Bunch of EDA
#### Adjust the threshold for what counts as aggression

In [131]:
# Cutoff applied to a comment's mean 'aggression' annotation: values strictly
# greater than this are labeled 1 (aggressive), everything else 0.
threshold = 0.5

In [132]:
# Preview the first rows of the comments table.
df_comments.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,True,article,random,train
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,True,article,random,train
3,89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev
4,93890,This page will need disambiguation.,2002,True,article,random,train


In [133]:
# Preview the first rows of the annotations table (one row per rev_id / worker_id pair).
df_annotations.head()

Unnamed: 0,rev_id,worker_id,aggression,aggression_score
0,37675,1362,1.0,-1.0
1,37675,2408,0.0,1.0
2,37675,1493,0.0,0.0
3,37675,1439,0.0,0.0
4,37675,170,0.0,0.0


### Join the annotations with the comments to get both the text ('comment' column) and the outcome ('aggression' column)
Also, label each comment as aggressive (1) or not (0) by comparing its mean annotation to the threshold

In [134]:
# Average the annotators' 'aggression' votes per comment (rev_id).
# The aggregation must read from df_annotations: `df` is first created by this
# cell, so grouping on `df` raises NameError on a fresh kernel (Restart & Run All).
mean_aggression = df_annotations.groupby('rev_id')['aggression'].mean()

# Attach the mean vote to each comment row; the Series is indexed by rev_id,
# so it is matched against the 'rev_id' column of df_comments.
df = df_comments.join(mean_aggression, on='rev_id')

# Binarize: 1 when the mean vote is strictly above the threshold, else 0.
# Comments without any annotation (NaN mean) compare False and become 0.
df['aggression'] = (df['aggression'] > threshold).astype(int)

# Show a small preview rather than rendering the whole frame.
df.head(10)

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,aggression
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,True,article,random,train,0
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,True,article,random,train,0
3,89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev,0
4,93890,This page will need disambiguation.,2002,True,article,random,train,0
5,102817,NEWLINE_TOKEN-NEWLINE_TOKENNEWLINE_TOKENImport...,2002,True,user,random,train,0
6,103624,I removed the following:NEWLINE_TOKENNEWLINE_T...,2002,True,article,random,train,0
7,111032,`:If you ever claimed in a Judaic studies prog...,2002,True,article,random,dev,0
8,120283,NEWLINE_TOKENNEWLINE_TOKENNEWLINE_TOKENMy apol...,2002,True,article,random,dev,0
9,128532,"`Someone wrote:NEWLINE_TOKENMore recognizable,...",2002,False,article,random,train,0


#### Remove the NEWLINE_TOKEN and TAB_TOKEN placeholders, emoticons, and stray punctuation

In [135]:
# Emoticons such as :) ;-D =P
_emoticon_pattern = re.compile(r"((?::|;|=)(?:-)?(?:\)|D|P))")
# Runs of leftover quote / markup / punctuation characters
_symbol_pattern = re.compile(r"[\"`~=:<>@*-]+")


def _scrub_comment(raw_text):
    """Return one comment with placeholder tokens, emoticons and symbol runs removed."""
    # Placeholder tokens become single spaces so neighbouring words stay separated.
    spaced = raw_text.replace("NEWLINE_TOKEN", " ").replace("TAB_TOKEN", " ")
    # Emoticons go first: the symbol pattern would otherwise eat their ':' / '=' prefix.
    without_emoticons = _emoticon_pattern.sub("", spaced)
    return _symbol_pattern.sub("", without_emoticons)


df['comment'] = df['comment'].apply(_scrub_comment)
df

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,aggression
0,37675,This is not creative. Those are the dictiona...,2002,True,article,random,train,0
1,44816,the term standard model is itself less NPOV...,2002,True,article,random,train,0
2,49851,"True or false, the situation as of March 200...",2002,True,article,random,train,0
3,89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev,0
4,93890,This page will need disambiguation.,2002,True,article,random,train,0
5,102817,Important note for all sysops There is a bu...,2002,True,user,random,train,0
6,103624,I removed the following All names of early Po...,2002,True,article,random,train,0
7,111032,If you ever claimed in a Judaic studies progra...,2002,True,article,random,dev,0
8,120283,"My apologies I'm English, I watch cricket,...",2002,True,article,random,dev,0
9,128532,"Someone wrote More recognizable, perhaps, is a...",2002,False,article,random,train,0


Rename the `comment` column to `text`

In [136]:
# Only 'comment' actually changes name; columns absent from the mapping are
# left untouched by rename, so the identity entries were redundant.
df = df.rename(columns={'comment': 'text'})

# Show a small preview rather than rendering the whole frame.
df.head(10)

Unnamed: 0,rev_id,text,year,logged_in,ns,sample,split,aggression
0,37675,This is not creative. Those are the dictiona...,2002,True,article,random,train,0
1,44816,the term standard model is itself less NPOV...,2002,True,article,random,train,0
2,49851,"True or false, the situation as of March 200...",2002,True,article,random,train,0
3,89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev,0
4,93890,This page will need disambiguation.,2002,True,article,random,train,0
5,102817,Important note for all sysops There is a bu...,2002,True,user,random,train,0
6,103624,I removed the following All names of early Po...,2002,True,article,random,train,0
7,111032,If you ever claimed in a Judaic studies progra...,2002,True,article,random,dev,0
8,120283,"My apologies I'm English, I watch cricket,...",2002,True,article,random,dev,0
9,128532,"Someone wrote More recognizable, perhaps, is a...",2002,False,article,random,train,0


In [137]:
# Write the cleaned, labeled frame to disk; index=False leaves the row index out of the file.
df.to_csv("./data/cleaned_data.csv", index=False)

In [138]:
# Class balance: count the non-null entries of every column for each aggression label.
rows_by_label = df.groupby('aggression')
rows_by_label.agg('count')

Unnamed: 0_level_0,rev_id,text,year,logged_in,ns,sample,split
aggression,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,101082,101082,101082,101082,101082,101082,101082
1,14782,14782,14782,14782,14782,14782,14782
