In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [2]:
df1 = pd.read_csv('attack_data/attack_annotated_comments.tsv',sep='\t')
df2 = pd.read_csv('attack_data/attack_annotations.tsv',sep='\t')

In [3]:
df1.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,False,article,random,train
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,False,article,random,train
3,89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev
4,93890,This page will need disambiguation.,2002,True,article,random,train


In [4]:
df2.head()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,1362,0.0,0.0,0.0,0.0,0.0
1,37675,2408,0.0,0.0,0.0,0.0,0.0
2,37675,1493,0.0,0.0,0.0,0.0,0.0
3,37675,1439,0.0,0.0,0.0,0.0,0.0
4,37675,170,0.0,0.0,0.0,0.0,0.0


In [5]:
df2.describe()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
count,1365217.0,1365217.0,1365217.0,1365217.0,1365217.0,1365217.0,1365217.0
mean,299974500.0,1339.286,0.007524811,0.1117822,0.03264756,0.03231794,0.1669595
std,198421400.0,1053.387,0.08641871,0.3150985,0.1777125,0.1768432,0.3729399
min,37675.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,126867700.0,445.0,0.0,0.0,0.0,0.0,0.0
50%,269997600.0,1079.0,0.0,0.0,0.0,0.0,0.0
75%,459291100.0,2062.0,0.0,0.0,0.0,0.0,0.0
max,699897200.0,4052.0,1.0,1.0,1.0,1.0,1.0


In [6]:
grouped = df2.groupby('rev_id').sum().reset_index()

In [7]:
grouped.head()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,9721,0.0,0.0,0.0,0.0,0.0
1,44816,20234,0.0,0.0,0.0,0.0,0.0
2,49851,26474,0.0,0.0,0.0,0.0,0.0
3,89320,26738,0.0,2.0,0.0,2.0,4.0
4,93890,8010,0.0,0.0,0.0,0.0,0.0


In [8]:
df = pd.merge(df1,grouped,left_index=True,right_index=True,how='left',on='rev_id')

In [9]:
df.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train,9721,0.0,0.0,0.0,0.0,0.0
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,False,article,random,train,20234,0.0,0.0,0.0,0.0,0.0
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,False,article,random,train,26474,0.0,0.0,0.0,0.0,0.0
3,89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev,26738,0.0,2.0,0.0,2.0,4.0
4,93890,This page will need disambiguation.,2002,True,article,random,train,8010,0.0,0.0,0.0,0.0,0.0


In [10]:
df.drop(columns=['year','logged_in','ns','sample','split','worker_id'],inplace=True)

In [11]:
rev_id_counts = {}
previous = 0
count = 0
for rev in df2['rev_id']:
    if rev == previous:
        count += 1
        previous = rev
    else:
        rev_id_counts[rev] = count
        count = 0
        previous = rev

In [12]:
len(rev_id_counts)

115864

In [13]:
df2[df2['rev_id'] == 37675].shape[0]

10

In [14]:
rev_id_counts[37675] = 10

In [15]:
df['num_reviews'] = df['rev_id'].map(rev_id_counts)

In [16]:
df.head()

Unnamed: 0,rev_id,comment,quoting_attack,recipient_attack,third_party_attack,other_attack,attack,num_reviews
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,0.0,0.0,0.0,0.0,0.0,10
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,0.0,0.0,0.0,0.0,0.0,9
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",0.0,0.0,0.0,0.0,0.0,8
3,89320,"Next, maybe you could work on being less cond...",0.0,2.0,0.0,2.0,4.0,9
4,93890,This page will need disambiguation.,0.0,0.0,0.0,0.0,0.0,8


In [17]:
def remove_junk(comment):
    if 'NEWLINE_TOKEN' in comment:
        comment = comment.replace('NEWLINE_TOKEN','')
    for char in comment.lower():
        if char not in '1234567890abcdefghijklmnopqrstuvwxyz ':
            comment = comment.replace(char,'')
    return comment

In [18]:
df['comment'] = df['comment'].apply(remove_junk)

In [19]:
df.head()

Unnamed: 0,rev_id,comment,quoting_attack,recipient_attack,third_party_attack,other_attack,attack,num_reviews
0,37675,This is not creative Those are the dictionary...,0.0,0.0,0.0,0.0,0.0,10
1,44816,the term standard model is itself less NPOV t...,0.0,0.0,0.0,0.0,0.0,9
2,49851,True or false the situation as of March 2002 w...,0.0,0.0,0.0,0.0,0.0,8
3,89320,Next maybe you could work on being less conde...,0.0,2.0,0.0,2.0,4.0,9
4,93890,This page will need disambiguation,0.0,0.0,0.0,0.0,0.0,8


In [20]:
df['comment'].value_counts()[:5]

Thank you for experimenting with  Wikipedia Your test worked and it has been reverted or removed Please use the sandbox for any other tests you may want to do Take a look at the welcome page to learn more about contributing to our encyclopedia       21
Please do not add nonsense to Wikipedia It is considered vandalism If you would like to experiment use the sandbox Thank you                                                                                                                              11
                                                                                                                                                                                                                                                          10
Thank you for experimenting with  Wikipedia Your test worked and it has been reverted or removed Please use the sandbox for any other tests you may want to do Take a look at the welcome page to learn more about contributing to our encycloped

In [21]:
df['comment'].value_counts().index.to_list()[:5]

['Thank you for experimenting with  Wikipedia Your test worked and it has been reverted or removed Please use the sandbox for any other tests you may want to do Take a look at the welcome page to learn more about contributing to our encyclopedia   ',
 'Please do not add nonsense to Wikipedia It is considered vandalism If you would like to experiment use the sandbox Thank you    ',
 '   ',
 'Thank you for experimenting with  Wikipedia Your test worked and it has been reverted or removed Please use the sandbox for any other tests you may want to do Take a look at the welcome page to learn more about contributing to our encyclopedia  ',
 '    ']

In [22]:
def whitespace(comment):
    return comment.strip()

In [23]:
df['comment'] = df['comment'].apply(whitespace)

In [24]:
df['comment'].value_counts()[:5]

Thank you for experimenting with  Wikipedia Your test worked and it has been reverted or removed Please use the sandbox for any other tests you may want to do Take a look at the welcome page to learn more about contributing to our encyclopedia    48
Preceding unsigned comment added by                                                                                                                                                                                                                    31
Please do not add nonsense to Wikipedia It is considered vandalism If you would like to experiment use the sandbox Thank you                                                                                                                           30
                                                                                                                                                                                                                                                       25


In [25]:
weird_scores = df['attack'][df['comment'] == "Thank you for experimenting with  Wikipedia Your test worked and it has been reverted or removed Please use the sandbox for any other tests you may want to do Take a look at the welcome page to learn more about contributing to our encyclopedia"].to_list()

In [26]:
sum(weird_scores)/len(weird_scores) #why are these comments scored at all??

0.125

In [27]:
df[df['comment'] == ''].shape[0]

25

In [28]:
df = df[df['comment'] != '']

In [29]:
df.head()

Unnamed: 0,rev_id,comment,quoting_attack,recipient_attack,third_party_attack,other_attack,attack,num_reviews
0,37675,This is not creative Those are the dictionary...,0.0,0.0,0.0,0.0,0.0,10
1,44816,the term standard model is itself less NPOV th...,0.0,0.0,0.0,0.0,0.0,9
2,49851,True or false the situation as of March 2002 w...,0.0,0.0,0.0,0.0,0.0,8
3,89320,Next maybe you could work on being less condes...,0.0,2.0,0.0,2.0,4.0,9
4,93890,This page will need disambiguation,0.0,0.0,0.0,0.0,0.0,8


In [42]:
def lower(comment):
    return comment.lower()

In [43]:
df['comment'] = df['comment'].apply(lower)

In [46]:
def tokenize(comment):
    tokenizer = RegexpTokenizer('\w+')
    return tokenizer.tokenize(comment)

In [47]:
df['tokens'] = df['comment'].apply(tokenize)

In [48]:
df.head()

Unnamed: 0,rev_id,comment,quoting_attack,recipient_attack,third_party_attack,other_attack,attack,num_reviews,tokens
0,37675,this is not creative those are the dictionary...,0.0,0.0,0.0,0.0,0.0,10,"[this, is, not, creative, those, are, the, dic..."
1,44816,the term standard model is itself less npov th...,0.0,0.0,0.0,0.0,0.0,9,"[the, term, standard, model, is, itself, less,..."
2,49851,true or false the situation as of march 2002 w...,0.0,0.0,0.0,0.0,0.0,8,"[true, or, false, the, situation, as, of, marc..."
3,89320,next maybe you could work on being less condes...,0.0,2.0,0.0,2.0,4.0,9,"[next, maybe, you, could, work, on, being, les..."
4,93890,this page will need disambiguation,0.0,0.0,0.0,0.0,0.0,8,"[this, page, will, need, disambiguation]"


In [None]:
#next cvec