In [1]:
from statsmodels.stats import inter_rater as irr
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
test_data = pd.read_csv('a3_train_final.tsv', sep='\t',  names=['Y', 'comment'])
train_data = pd.read_csv('a3_test_final.tsv', sep='\t',  names=['Y', 'comment'])

In [3]:
# remove all emojis
import re
test_data['comment'] = test_data['comment'].str.replace('[^\w\s#@/:%.,_-]', '', flags=re.UNICODE)
train_data['comment'] = train_data['comment'].str.replace('[^\w\s#@/:%.,_-]', '', flags=re.UNICODE)

In [4]:
def spit_col(df):
    split_df = df['Y'].str.split('/', n=30, expand=True)
    cc = split_df.mode(axis=1).iloc[:,0]
    return cc

In [5]:
cc = spit_col(test_data)

In [6]:
new_df = pd.DataFrame(cc)
df_temp = pd.DataFrame(test_data['comment'])
df2 = new_df.join(df_temp)
df = df2.rename(columns={0: 'Y'})
df

Unnamed: 0,Y,comment
0,-1,It is easier to fool a million people than it...
1,0,NATURAL IMMUNITY protected us since evolutio...
2,-1,NATURAL IMMUNITY protected us since evolutio...
3,1,The bigest sideffect of vaccines is fewer dea...
4,-1,Unvaccinated people are more likely to become...
...,...,...
26192,0,no vaccine
26193,-1,
26194,0,keep your I already know 3 people who have b...
26195,0,"JUST BECAUSE ITS SAFE, DOESNT MEAN IT DOESNT ..."


In [7]:
df_y = df['Y']
df_y.astype('int')

0       -1
1        0
2       -1
3        1
4       -1
        ..
26192    0
26193   -1
26194    0
26195    0
26196    0
Name: Y, Length: 26197, dtype: int32

In [8]:
def run_sequence(lst) -> list():
    l = []
    for i in lst:
        if i == '1' or i == 1:
            l.append(list([1,0]))
        elif i == '0' or i == 0:
            l.append(list([1,0]))
        elif i == '-1' or i == -1:
            l.append(list([1,1]))
    return l  

def sort_list(lst) -> list():
    yy = run_sequence(list(lst))
    from operator import itemgetter
    bb = sorted(yy, key=itemgetter(1), reverse=False)
    return bb

def get_krippendorff(lst) -> float:
    l = sort_list(lst)
    import krippendorff
    res = krippendorff.alpha(l)
    return res

In [10]:
from collections import Counter
#Import train data
df = pd.read_csv('a3_train_final.tsv', sep='\t', names=['Y','Comment'])

#Review documentation of anotations
#Counter(df['Y'])

## Assign the notations to either pro = '1' or anti = '0'
## If there is not 2 anotations we will exclude these beacuse these has not been going thrpugh the same control as the rest
## Additioanlly we will only mark in each ot the classes 1 or 0 if there are only anotations in one direction due to inconsistency of the reporting of anotations
## If a comment has been anotated e.g ('0/1/0') that was the case 21 times we will label this as -1 
proList = ['1/1','1/1/1/1','1/1/1','1/1/1/1/1','1/1/1/1/1/1','1/1/1/1/1/1/1/1/1']
antiList = ['0/0','0/0/0','0/0/0/0','0/0/0/0/0','0/0/0/0/0/0/0/0/0','0/0/0/0/0/0','0/0/0/0/0/0/0','0/0/0/0/0/0/0/0','0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0','0/0/0/0/0/0/0/0/0/0/0/0/0', '0/0/0/0/0/0/0/0/0/0/0/0/0/0','0/0/0/0/0/0/0/0/0/0/0/0','0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0','0/0/0/0/0/0/0/0/0/0','0/0/0/0/0/0/0/0/0/0/0','0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0/0','0/0/0/0/0/0/0/0/0/0/0/0/0/0/0']

df['Y'] = np.where((df.Y.isin(proList)), 1,
          np.where((df.Y.isin(antiList)),0 , -1))


print(Counter(df['Y']))

train = df[df['Y'] != '-1']
train

Counter({1: 10791, 0: 10491, -1: 4915})


Unnamed: 0,Y,Comment
0,-1,It is easier to fool a million people than it...
1,0,NATURAL IMMUNITY protected us since evolutio...
2,-1,NATURAL IMMUNITY protected us since evolutio...
3,-1,The bigest sideffect of vaccines is fewer dea...
4,-1,Unvaccinated people are more likely to become...
...,...,...
26192,0,🙏no vaccine
26193,-1,🚩🚩🚩🚩🚩
26194,0,🤣 keep your 💩 I already know 3 people who have...
26195,0,"🤣🤣🤣 ""JUST BECAUSE IT'S SAFE, DOESN'T MEAN IT D..."


In [None]:
#Import train data
df = pd.read_csv('a3_train_final.tsv', sep='\t', names=['Y','Comment'])

df['Y'] = np.where((df.Y.isin(proList)), 1,
          np.where((df.Y.isin(antiList)),0 , -1))

In [11]:
ddl = list(train.Y)

In [12]:
alpha2 = get_krippendorff(ddl)
print(alpha2)

0.6840387109776054


In [13]:
alpha = get_krippendorff(list(df_y))
print(alpha)

0.8318907811008063
