In [4]:
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
test_data = pd.read_csv('a3_train_final.tsv', sep='\t',  names=['Y', 'comment'])
train_data = pd.read_csv('a3_test_final.tsv', sep='\t',  names=['Y', 'comment'])

In [6]:
test_data.head(5)

Unnamed: 0,Y,comment
0,0/-1,It is easier to fool a million people than it...
1,0/0,NATURAL IMMUNITY protected us since evolutio...
2,0/-1,NATURAL IMMUNITY protected us since evolutio...
3,1/1/1/-1,The bigest sideffect of vaccines is fewer dea...
4,1/-1,Unvaccinated people are more likely to become...


In [7]:
# remove all emojis
import re
test_data['comment'] = test_data['comment'].str.replace('[^\w\s#@/:%.,_-]', '', flags=re.UNICODE)
train_data['comment'] = train_data['comment'].str.replace('[^\w\s#@/:%.,_-]', '', flags=re.UNICODE)

In [8]:
test_data.head(5)

Unnamed: 0,Y,comment
0,0/-1,It is easier to fool a million people than it...
1,0/0,NATURAL IMMUNITY protected us since evolutio...
2,0/-1,NATURAL IMMUNITY protected us since evolutio...
3,1/1/1/-1,The bigest sideffect of vaccines is fewer dea...
4,1/-1,Unvaccinated people are more likely to become...


In [9]:
def spit_col(df):
    split_df = df['Y'].str.split('/', n=30, expand=True)
    cc = split_df.mode(axis=1).iloc[:,0]
    return cc

In [10]:
cc = spit_col(test_data)

In [11]:
new_df = pd.DataFrame(cc)
df_temp = pd.DataFrame(test_data['comment'])
df2 = new_df.join(df_temp)
df = df2.rename(columns={0: 'Y'})
df

Unnamed: 0,Y,comment
0,-1,It is easier to fool a million people than it...
1,0,NATURAL IMMUNITY protected us since evolutio...
2,-1,NATURAL IMMUNITY protected us since evolutio...
3,1,The bigest sideffect of vaccines is fewer dea...
4,-1,Unvaccinated people are more likely to become...
...,...,...
26192,0,no vaccine
26193,-1,
26194,0,keep your I already know 3 people who have b...
26195,0,"JUST BECAUSE ITS SAFE, DOESNT MEAN IT DOESNT ..."


In [12]:
X_train, X_eval, Y_train, Y_eval = train_test_split(df['comment'], df['Y'], test_size=0.2, random_state=12345)

In [13]:
def train_document_classifier(X, Y):
    pipeline = make_pipeline( TfidfVectorizer(), LinearSVC() )
    pipeline.fit(X, Y)
    return pipeline

In [14]:
clf_comments = train_document_classifier(X_train, Y_train)

In [15]:
bb_acc = accuracy_score(Y_eval, clf_comments.predict(X_eval))
bb_acc

0.7270992366412213

In [16]:
test_data['Y']

0            0/-1
1             0/0
2            0/-1
3        1/1/1/-1
4            1/-1
           ...   
26192         0/0
26193        0/-1
26194         0/0
26195         0/0
26196         0/0
Name: Y, Length: 26197, dtype: object

In [17]:
test_data.head()

Unnamed: 0,Y,comment
0,0/-1,It is easier to fool a million people than it...
1,0/0,NATURAL IMMUNITY protected us since evolutio...
2,0/-1,NATURAL IMMUNITY protected us since evolutio...
3,1/1/1/-1,The bigest sideffect of vaccines is fewer dea...
4,1/-1,Unvaccinated people are more likely to become...


In [18]:
train_data.head()

Unnamed: 0,Y,comment
0,1,I dont know whats in it. As if they know whats...
1,1,"In the September time frame, unvaccinated peop..."
2,0,It is a pity that people are dying from the va...
3,0,"The benefits outweigh the risks. is MARKETING,..."
4,0,Vaccines are saving lives meanwhile romania be...


# ----------------------------------

In [33]:
df2 = pd.DataFrame()

In [34]:
#df2 = test_data['Y'].str.replace("/"," ")
df2 = test_data['Y'].replace({'/':' '}, regex=True)

In [39]:
df2['Y'] = pd.to_numeric(test_data['Y'], errors='coerce')
print(type(df2))
df2 = df2.dropna() 
df2.head()

<class 'pandas.core.series.Series'>


0        0 -1
1         0 0
2        0 -1
3    1 1 1 -1
4        1 -1
Name: Y, dtype: object

In [598]:
df2

0            0 -1
1             0 0
2            0 -1
3        1 1 1 -1
4            1 -1
           ...   
26191         0 0
26192         0 0
26193        0 -1
26194         0 0
26195         0 0
Name: Y, Length: 26196, dtype: object

In [77]:
df3 = pd.DataFrame(test_data)
#df3['Y'] = df3["Y"].str.replace("/"," ")

df3['NUM'] = pd.to_numeric(df3['Y'].str.replace('/', ' '), errors='ignore')
df3['Num'] = pd.to_numeric(df3['Y'], errors='coerce').astype('Int64')

print(type(df2))
df2 = df2.dropna() 


<class 'pandas.core.frame.DataFrame'>


In [69]:
df3

Unnamed: 0,Y,comment,X,Num,NUM
0,0 -1,It is easier to fool a million people than it...,0 -1,,0 -1
1,0 0,NATURAL IMMUNITY protected us since evolutio...,0 0,,0 0
2,0 -1,NATURAL IMMUNITY protected us since evolutio...,0 -1,,0 -1
3,1 1 1 -1,The bigest sideffect of vaccines is fewer dea...,1 1 1 -1,,1 1 1 -1
4,1 -1,Unvaccinated people are more likely to become...,1 -1,,1 -1
...,...,...,...,...,...
26192,0 0,no vaccine,0 0,,0 0
26193,0 -1,,0 -1,,0 -1
26194,0 0,keep your I already know 3 people who have b...,0 0,,0 0
26195,0 0,"JUST BECAUSE ITS SAFE, DOESNT MEAN IT DOESNT ...",0 0,,0 0


In [74]:
print(df3['NUM'].dtypes)
print(df3['Num'].dtypes)
print(df3['X'].dtypes)
print(df3['Y'].dtypes)

object
Int64
object
object


In [82]:
df3['Num'] = pd.to_numeric(df3['NUM'], errors='coerce').astype('Int64')

In [84]:
df3['Num']

0        <NA>
1        <NA>
2        <NA>
3        <NA>
4        <NA>
         ... 
26192    <NA>
26193    <NA>
26194    <NA>
26195    <NA>
26196    <NA>
Name: Num, Length: 26197, dtype: Int64

In [506]:
df2.to_numpy()

In [None]:
lst = []
for i in range(len(df)):
    temp_s = []
    tmp = []
    temp_s = df[i]
    for t in range(len(temp_s)):
        if t != ' ':
            s = int(temp[t])
            tmp.append(s)
        else:
            tmp += ' '
    

# --------------------------------------

In [26]:
#dd = pd.DataFrame(test_data)
dd = test_data['Y'].str.split('/', n=30, expand=True)
dd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0,-1,,,,,,,,,...,,,,,,,,,,
1,0,0,,,,,,,,,...,,,,,,,,,,
2,0,-1,,,,,,,,,...,,,,,,,,,,
3,1,1,1,-1,,,,,,,...,,,,,,,,,,
4,1,-1,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26192,0,0,,,,,,,,,...,,,,,,,,,,
26193,0,-1,,,,,,,,,...,,,,,,,,,,
26194,0,0,,,,,,,,,...,,,,,,,,,,
26195,0,0,,,,,,,,,...,,,,,,,,,,


In [27]:
dd = dd.to_numpy()
dd 

array([['0', '-1', None, ..., None, None, None],
       ['0', '0', None, ..., None, None, None],
       ['0', '-1', None, ..., None, None, None],
       ...,
       ['0', '0', None, ..., None, None, None],
       ['0', '0', None, ..., None, None, None],
       ['0', '0', None, ..., None, None, None]], dtype=object)

In [28]:
type(dd)     

numpy.ndarray

In [29]:
a =  dd[dd != np.array(None)]

In [None]:
for i in range(len(dd)):
    t = dd[i]
    for j in range(len(t)):
        print(dd[j])
        

['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' '1' '-1' None None None None None None None None None None None
 None None None None None None None None]
['1' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None 

['1' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '0' '-1' '0' None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None N

['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' '1' '-1' None None None None None None None None None None N

['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None N

 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' '1' '-1' None None None None None None None None None None None
 None None None None None None None None]
['1' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None Non

['1' '1' '1' '-1' None None None None None None None None None None None
 None None None None None None None None]
['1' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '0' '-1' '0' None None None None None None None None None None No

 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' '1' '-1' None None None None None None None None None None None
 None None None None None None None None]
['1' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None Non

['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' '1' '-1' None None None None None None None None None None None
 None None None None None None None None]
['1' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None 

['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' '1' '-1' None None None None None None None None None None None
 None None None None None None None None]
['1' '-1' None None None None None None None None None None None None 

['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' '1' '-1' None None None None None None None None None None None
 None None None None None None None None]
['1' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None 

 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '0' '-1' '0' None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '1' None None None None 

['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['1' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None N

['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '-1' None None None None None None None None None None None None None
 None None None None None None None None]
['0' '0' None None None None None None None None None None None None N

# -------------------------------------

In [283]:
split_df = test_data['Y'].replace('/', ',')
split_df 

0                                                     0/-1
1                                                      0/0
2                                                     0/-1
3                                                 1/1/1/-1
4                                                     1/-1
                               ...                        
26192                                                  0/0
26193                                                 0/-1
26194                                                  0/0
26195                                                  0/0
26196    0                                             ...
Name: Y, Length: 26197, dtype: object

In [None]:
# NOPE!
new_df = pd.DataFrame()
lst = []
ind = 0

# index get out of range, using the i variable to get to the correct index
for index, columnData in df_arr.iteritems():
    a = 0

    for i in range(len(columnData)):
        t = []
        temp = 0
        print(columnData[i])
        if columnData[i].isdigit() == True:
            temp = int(columnData[i])
        t.append(temp)
    lst += t
    c = c
    ind+=1

In [224]:
nan_array = np.isnan(aa)
not_nan_array = ~ nan_array
array2 = array1[not_nan_array]

print(array2)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [382]:
df_arr = test_data['Y']
df_arr

0                                                     0/-1
1                                                      0/0
2                                                     0/-1
3                                                 1/1/1/-1
4                                                     1/-1
                               ...                        
26192                                                  0/0
26193                                                 0/-1
26194                                                  0/0
26195                                                  0/0
26196    0                                             ...
Name: Y, Length: 26197, dtype: object

In [200]:
# NOPE
tt = []
for index, columnData in df_arr.iteritems():
    #print(columnData)
    temp = columnData
    #print(temp)
    temp = temp.replace('/', ',')
    tt += t


In [284]:
aa = split_df.to_numpy()
aa

array(['0/-1', '0/0', '0/-1', ..., '0/0', '0/0',
       0                                                     0/-1
       1                                                      0/0
       2                                                     0/-1
       3                                                 1/1/1/-1
       4                                                     1/-1
                                      ...
       26192                                                  0/0
       26193                                                 0/-1
       26194                                                  0/0
       26195                                                  0/0
       26196    0            0/-1
       1             0/0
       2         ...
       Name: Y, Length: 26197, dtype: object                     ],
      dtype=object)

In [165]:
#data['Y'] = data['Y'].str.replace('/',',')
#df['column name'] = df['column name'].str.replace('old character','new character')

In [380]:
#df['coloum'] = df['coloum'].replace(['value_1','valu_2'],'new_value')
#df_arr = test_data[['Y']].iloc[:,0].replace('/', ' ')

df['Y'] = df['Y'].replace('/',' ')
df_arr

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [308]:
aa = test_data[['Y']]
aa

Unnamed: 0,Y
0,0/-1
1,0/0
2,0/-1
3,1/1/1/-1
4,1/-1
...,...
26192,0/0
26193,0/-1
26194,0/0
26195,0/0


In [115]:
df = test_data['Y'].replace({'/':' '}, regex=True)
df

0            0 -1
1             0 0
2            0 -1
3        1 1 1 -1
4            1 -1
           ...   
26192         0 0
26193        0 -1
26194         0 0
26195         0 0
26196         0 0
Name: Y, Length: 26197, dtype: object

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [318]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,Y
6493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


In [441]:
df2 = test_data["Y"].str.replace("[/]"," ")
df2['Y'] = pd.to_numeric(df2[Y])
print(type(df2))
df2 = df2.dropna() 
df2

<class 'pandas.core.series.Series'>


0            0 -1
1             0 0
2            0 -1
3        1 1 1 -1
4            1 -1
           ...   
26191         0 0
26192         0 0
26193        0 -1
26194         0 0
26195         0 0
Name: Y, Length: 26196, dtype: object

In [451]:
df2 = test_data["Y"].str.replace("[/]"," ")
df = df2.drop(df[df[Y].convert_objects(convert_numeric=True).isnull()].index)

AttributeError: 'numpy.ndarray' object has no attribute 'convert_objects'

In [445]:
lst = []
for i in range(len(df2)):
    tt = []
    tt = int(df2[i].to_numeric())

AttributeError: 'str' object has no attribute 'to_numeric'

In [443]:
df = df2.loc[(pd.to_numeric(df2['Y'],errors='coerce'))]

KeyError: 'Y'

In [422]:
lst = []
for i in range(len(df2)):
    tmp = []
    

In [110]:
df = pd.DataFrame(df2)
df = df[['Y']].to_numpy()


In [111]:
lst = []
for i in range(len(df)):
    temp = df[i]
    

In [361]:
y = df.astype(np.float)
y

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = df.astype(np.float)


ValueError: could not convert string to float: '0 -1'

installing statsmodels
```
python -m pip install statsmodels 
```

In [113]:
from statsmodels.stats import inter_rater as irr

In [122]:
irr.aggregate_raters(ddf)

ValueError: object of too small depth for desired array

In [162]:
dats, cats = irr.aggregate_raters(df_arr)

In [163]:
irr.fleiss_kappa(dats, method='fleiss')

  p_rat = (table2.sum(1) - n_rat) / (n_rat * (n_rat - 1.))


nan

In [164]:
giro = np.array(df_arr).transpose()
giro

array([['0/-1', '0/0', '0/-1', ..., '0/0', '0/0', '0/0']], dtype=object)

In [128]:
df['Y'] = test_data['Y'].replace({'/':' '}, regex=True)
#df['X']



In [133]:
ddf = df[['Y']].to_numpy()
ddf

array([0            0 -1
       1             0 0
       2            0 -1
       3        1 1 1 -1
       4            1 -1
                  ...
       26192         0 0
       26193        0 -1
       26194         0 0
       26195         0 0
       26196         0 0
       Name: Y, Length: 26197, dtype: object], dtype=object)

In [2]:
import krippendorff

In [3]:
import krippendorff

arr = [[1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0]]    
res = krippendorff.alpha(arr)
res

0.6805555555555556

### With parameters

In [None]:
params = {'smooth_idf' : 'bool',
         }

In [None]:
parameter_grid = {'max_depth': [1, 2, 3, 4, 5], 
                  'max_features': [1, 2, 3, 4, 5],
                 'random_state':[0, 1, 2, 3, 4, 5],
                  "min_samples_leaf": np.linspace(0.001, 0.03, 5)
                 }

In [None]:
def train_document_classifier(X, Y):
    pipeline = make_pipeline( TfidfVectorizer(), LinearSVC() )
    pipeline.fit(X, Y)
    return pipeline