In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import os
import sys, os
sys.path.insert(0, os.path.abspath('..'))
import data_generation.diff_utils
import data_generation.mwdiff.mwdiffs_to_tsv
import numpy as np

In [2]:
# Load the annotation table of every split and stack them into one frame.
# Each row is tagged with the name of the split it was read from.
out_dir = "../../data/figshare"
in_dir = "../../data/annotations/split"
splits = ["train", "dev", "test"]

dfs = []
for split in splits:
    annotations_path = os.path.join(in_dir, split, 'annotations.tsv')
    dfs.append(pd.read_csv(annotations_path, sep='\t').assign(split=split))
df = pd.concat(dfs)
df.shape

(1368958, 23)

In [11]:
# rename workers
# Give every distinct `_worker_id` a sequential integer `anon_id`, numbered in
# order of first appearance in `df`, and attach it to each annotation row.
df_workers = df[['_worker_id']].drop_duplicates()
# .assign builds a new frame, which avoids SettingWithCopyWarning from writing
# a column into the result of a slice.
df_workers = df_workers.assign(anon_id=range(len(df_workers)))
# Drop any `anon_id` left over from a previous run of this cell. Without this,
# a second run would merge in a duplicate column (anon_id_x / anon_id_y) and
# later selections of 'anon_id' would raise a KeyError.
df = (
    df
    .drop(columns=['anon_id'], errors='ignore')
    .merge(df_workers, how='inner', on='_worker_id')
)
df.shape

(1368958, 24)

In [13]:
# get set of labeled comments
# Keep the first annotation row seen for each rev_id; .copy() so the column
# assignments below act on an independent frame.
df_comments = df.drop_duplicates(subset=['rev_id']).copy()
# NOTE(review): a missing (NaN) user_id also compares != 0 and is therefore
# flagged as logged in — confirm anonymous edits are encoded as user_id 0.
df_comments['logged_in'] = df_comments['user_id'] != 0
# Vectorised year extraction instead of a per-row lambda.
df_comments['year'] = pd.to_datetime(df_comments['rev_timestamp']).dt.year

df_comments['diff'] = df_comments['diff'].apply(data_generation.mwdiff.mwdiffs_to_tsv.replace_special_chars)

# fix legacy special token issues
# Bare TAB / NEWLINE markers are renamed to TAB_TOKEN / NEWLINE_TOKEN. The
# negative lookahead skips markers that already carry the _TOKEN suffix, so
# they are not turned into TAB_TOKEN_TOKEN / NEWLINE_TOKEN_TOKEN.
# Double quotes are replaced with backticks as before.
df_comments['diff'] = (
    df_comments['diff']
    .str.replace(r'TAB(?!_TOKEN)', 'TAB_TOKEN', regex=True)
    .str.replace(r'NEWLINE(?!_TOKEN)', 'NEWLINE_TOKEN', regex=True)
    .str.replace('"', '`', regex=False)
)

# apply latest version of clean and filter
df_comments = data_generation.diff_utils.clean_and_filter(df_comments)
# clean and filter drops some comments, so drop associated labels
df = df.merge(df_comments[['rev_id']], how='inner', on='rev_id')

In [14]:
# Rename columns, keep only the columns listed in `order` (in that order),
# and sort the comments by rev_id. The renamed `timestamp` column is not in
# `order`, so it is dropped by the selection.
order = ['rev_id', 'comment', 'year', 'logged_in', 'ns', 'sample', 'split']
column_names = {'clean_diff': 'comment', 'rev_timestamp': 'timestamp'}
df_comments = (
    df_comments
    .rename(columns=column_names)
    .loc[:, order]
    .sort_values('rev_id')
)
df_comments.shape

(115864, 7)

In [15]:
# Build one label table per annotation question.

def _labels_for(label_column):
    """Return the rev_id / worker_id / `label_column` rows of `df`, sorted by rev_id.

    The `anon_id` column is exposed under the name `worker_id`.
    """
    return (
        df[['rev_id', 'anon_id', label_column]]
        .rename(columns={'anon_id': 'worker_id'})
        .sort_values('rev_id')
    )


df_attack_labels = _labels_for('attack')
df_aggression_labels = _labels_for('aggression')

In [16]:
# save dfs
# Write everything under `out_dir` (set in the data-loading cell) instead of
# repeating the path literal. Create the directory first so a fresh checkout
# does not fail on the first to_csv call.
os.makedirs(out_dir, exist_ok=True)

# The same comment table is written once per label set.
for comments_file in ('attack_annotated_comments.tsv', 'aggression_annotated_comments.tsv'):
    df_comments.to_csv(os.path.join(out_dir, comments_file), sep='\t', index=False)

df_attack_labels.to_csv(os.path.join(out_dir, 'attack_annotations.tsv'), sep='\t', index=False)
df_aggression_labels.to_csv(os.path.join(out_dir, 'aggression_annotations.tsv'), sep='\t', index=False)

In [17]:
# Re-read the saved attack comments file and show its (rows, columns).
attack_comments_path = os.path.join("../../data/figshare", 'attack_annotated_comments.tsv')
pd.read_csv(attack_comments_path, sep='\t').shape

(115864, 7)

In [19]:
# Re-read the saved attack labels and show the shape after keeping one row
# per distinct rev_id.
attack_labels_path = os.path.join("../../data/figshare", 'attack_annotations.tsv')
pd.read_csv(attack_labels_path, sep='\t').drop_duplicates(subset='rev_id').shape

(115864, 3)