In [1]:
import pandas as pd
import numpy as np
from eda_functions import merge_with_target, split_data
import warnings

In [2]:
# Load the annotated comment dump (tab-separated) and preview the first rows.
df_comments = pd.read_csv(
    filepath_or_buffer='../data/aggression_annotated_comments.tsv',
    sep='\t',
)
df_comments.head(n=3)

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,True,article,random,train
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,True,article,random,train


In [3]:
# Load the per-worker annotation rows (tab-separated) and preview the first rows.
df_scores = pd.read_csv(
    filepath_or_buffer='../data/aggression_annotations.tsv',
    sep='\t',
)
df_scores.head(n=3)

Unnamed: 0,rev_id,worker_id,aggression,aggression_score
0,37675,1362,1.0,-1.0
1,37675,2408,0.0,1.0
2,37675,1493,0.0,0.0


In [4]:
df = merge_with_target(df_comments, df_scores, 'aggression')

In [5]:
df.head()

Unnamed: 0_level_0,comment,target
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1
37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,0
44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,0
49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",0
89320,"Next, maybe you could work on being less cond...",0
93890,This page will need disambiguation.,0


In [40]:
def split_data(data, pct_positive, test_size=None, train_size=None, random_state=None):
    """Split ``data`` into a random test set and a class-balanced training set.

    The test set is a plain random sample of ``data``. The training set is
    drawn from the rows left over, so that a ``pct_positive`` share of it has
    ``target == 1`` and the remainder has ``target == 0``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'comment'`` column and a ``'target'`` column of 0/1
        labels. Test rows are removed from the training pool by index label.
    pct_positive : float
        Share of positive rows wanted in the training set, in ``[0, 1]``.
    test_size : int, optional
        Number of test rows. When omitted (or 0), 30% of ``data`` is used.
    train_size : int, optional
        Number of training rows. When omitted, the largest size that both
        classes can supply at the requested ratio is used. A larger request
        triggers a warning and is reduced to that maximum.
    random_state : optional
        Forwarded to every ``sample`` call (test draw, positive draw,
        negative draw and the final shuffle) so a seeded call is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``X_*`` items are the
        ``'comment'`` Series and the ``y_*`` items are the ``'target'`` Series.

    Raises
    ------
    ValueError
        If ``pct_positive`` is outside ``[0, 1]``.
    """
    if not 0 <= pct_positive <= 1:
        raise ValueError(f'pct_positive must be between 0 and 1, got {pct_positive}')

    if test_size:
        num_test_samples = int(test_size)
    else:
        num_test_samples = int(0.3 * data.shape[0])

    df_test = data.sample(num_test_samples, random_state=random_state)
    X_test = df_test['comment']
    y_test = df_test['target']

    # Everything not drawn for testing is the pool the training set comes from.
    df_train = data.drop(df_test.index)
    df_train_pos = df_train[df_train['target'] == 1]
    df_train_neg = df_train[df_train['target'] == 0]
    num_pos_available = df_train_pos.shape[0]
    num_neg_available = df_train_neg.shape[0]

    # The largest training set is bounded by whichever class runs out first at
    # the requested ratio. A class that contributes no rows (ratio 0 or 1)
    # imposes no bound, which also avoids dividing by zero.
    size_limits = []
    if pct_positive > 0:
        size_limits.append(int(num_pos_available / pct_positive))
    if pct_positive < 1:
        size_limits.append(int(num_neg_available / (1 - pct_positive)))
    max_train_size = min(size_limits)

    if not train_size:
        train_size = max_train_size
    elif train_size > max_train_size:
        warnings.warn(f'train_size of {train_size} exceeds the amount of training data available for the given parameters. Resetting train size to {max_train_size}')
        train_size = max_train_size
    else:
        train_size = int(train_size)

    # Clamp to the pool sizes so float rounding in the ratio arithmetic can
    # never request more rows than a class actually has.
    num_pos_samples = min(int(pct_positive * train_size), num_pos_available)
    num_neg_samples = min(train_size - num_pos_samples, num_neg_available)

    df_train = pd.concat([
        df_train_pos.sample(num_pos_samples, random_state=random_state),
        df_train_neg.sample(num_neg_samples, random_state=random_state),
    ]).sample(frac=1, random_state=random_state)  # shuffle so classes are interleaved

    X_train = df_train['comment']
    y_train = df_train['target']

    return (X_train, X_test, y_train, y_test)
    
    
    
    

In [6]:
X_train, X_test, y_train, y_test = split_data(df, 0.5, 5000, 10000)

In [8]:
y_train.value_counts(normalize=True)

0    0.5
1    0.5
Name: target, dtype: float64

In [6]:
np.ceil(12.334)

13.0

In [9]:
from sklearn.model_selection import train_test_split
# Plain random split over the whole frame, for comparison with split_data.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the values
# produced by the earlier split_data call.
X = df['comment']
y = df['target']
# Seeded so the split is identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rev_id
315199003    0
42332429     1
68812825     0
312843300    0
221002603    0
            ..
56258388     0
374547245    0
556305974    0
142760283    0
312866440    0
Name: target, Length: 86898, dtype: int32

In [11]:
def merge_with_target(df_comments, df_annotations, target_col_name, threshold=0.5):
    """Attach a binary ``target`` label to each comment.

    The annotations are averaged per ``rev_id`` over ``target_col_name``; a
    comment is labelled 1 when that mean is at least ``threshold`` and 0
    otherwise. Only the ``rev_id`` and ``comment`` columns of ``df_comments``
    are kept, and the merge uses pandas' default join, so comments whose
    ``rev_id`` has no annotation rows do not appear in the result.

    Returns a DataFrame with the columns ``rev_id``, ``comment`` and ``target``.
    """
    mean_score = df_annotations.groupby('rev_id')[target_col_name].mean()
    labels = (mean_score >= threshold).astype(int)
    labels = labels.rename('target')

    comments = df_comments[['rev_id', 'comment']]
    # labels is indexed by rev_id, so join the column against that index.
    return comments.merge(labels, left_on='rev_id', right_index=True)

In [12]:
df = merge_with_target(df_comments, df_scores, 'aggression')

In [9]:
df.head()

Unnamed: 0,rev_id,comment,target
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,0
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,0
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",0
3,89320,"Next, maybe you could work on being less cond...",0
4,93890,This page will need disambiguation.,0


In [14]:
kwargs = {'arg1': 656, 'arg2': 'Hi there'}
kwargs.get('arg3', 'Not here')

'Not here'

In [15]:
'arg1' in kwargs

True

In [10]:
df.drop([2, 3]).head()

Unnamed: 0,rev_id,comment,target
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,0
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,0
4,93890,This page will need disambiguation.,0
5,102817,NEWLINE_TOKEN-NEWLINE_TOKENNEWLINE_TOKENImport...,0
6,103624,I removed the following:NEWLINE_TOKENNEWLINE_T...,0


In [19]:
# Eyeball a few comments labelled aggressive (target == 1).
# Only the comment text is needed, so iterate that Series directly instead of
# building a full row per iteration; the seed keeps the sample repeatable.
aggressive_comments = df.loc[df['target'] == 1, 'comment']
for ix, comment in aggressive_comments.sample(10, random_state=42).items():
    print(f'{ix}: {comment}')
    print()

84003: NEWLINE_TOKENNEWLINE_TOKENI suppose, you probably spent all your money on Star wars books so had no money for condoms. It all makes sense now, your not stupid just ignorantly poor.

82191: NEWLINE_TOKENNEWLINE_TOKEN== See you ==NEWLINE_TOKENNEWLINE_TOKENLousy cunt

28832: NEWLINE_TOKENNEWLINE_TOKEN== A little song ==NEWLINE_TOKENNEWLINE_TOKENFucky fucky fucky fuckNEWLINE_TOKENFucky fucky fucky fuckNEWLINE_TOKENFucky fucky fucky fucky fucky fuckNEWLINE_TOKENFucky fucky fucky fuckNEWLINE_TOKENFucky fucky fucky fuckNEWLINE_TOKENFucky fucky fucky fucky fucky fuckNEWLINE_TOKENFuck Fuck Fuck Fuck Fuck Fuck Fuck Fuck Fuck FuckNEWLINE_TOKENFuck Fuck Fuck Fuck Fuck Fuck Fuck Fuck Fuck FuckNEWLINE_TOKENFucky fucky fucky fuckNEWLINE_TOKENFucky fucky fucky fuckNEWLINE_TOKENFucky fucky fucky fucky fucky fuckNEWLINE_TOKENFucky fucky fucky fuckNEWLINE_TOKENFucky fucky fucky fuckNEWLINE_TOKENFucky fucky fucky fucky fucky fuck

45163: NEWLINE_TOKENNEWLINE_TOKENdo you arrogant bastards have nothi

In [16]:
df_comments.shape[0]

115864