In [None]:
import re
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Both files are header-less TSVs with two columns: the annotator labels and the text.
# NOTE(review): the variable names are swapped relative to the file names --
# `test_data` holds a3_train_final.tsv and `train_data` holds a3_test_final.tsv.
# Later cells fit the model on `test_data`, so the names are kept as they are here.
tsv_options = {'sep': '\t', 'names': ['Y', 'comment']}
test_data = pd.read_csv('a3_train_final.tsv', **tsv_options)
train_data = pd.read_csv('a3_test_final.tsv', **tsv_options)

In [None]:
# SOFT - assigns each comment the label chosen by most annotators (row-wise mode)
def soft_labelling(df, text_col='comment'):
    """
    Collapse the '/'-separated annotator labels in df['Y'] into one label per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'Y' (labels joined by '/') and a text column.
    text_col : str, default 'comment'
        Name of the text column carried over into the result. The default
        matches the column name used when the TSV files are loaded.

    Returns
    -------
    maj_label_df : pandas.DataFrame
        Columns 'Y' (majority label, still a string) and `text_col`, indexed like df.
    label_c : collections.Counter
        How many rows received each majority label.
    """
    # n=30 limits the split to 30 separators, i.e. at most 31 label columns.
    split_labels = df['Y'].str.split('/', n=30, expand=True)
    # mode(axis=1) yields several columns when labels tie; only the first is kept.
    majority_label = split_labels.mode(axis=1).iloc[:, 0].rename('Y')
    maj_label_df = majority_label.to_frame().join(df[text_col])
    label_c = Counter(maj_label_df['Y'])
    return maj_label_df, label_c

# `train` is never defined in this notebook, so the original call raised NameError.
# The rows of a3_train_final.tsv are loaded into `test_data`, so that frame is
# the one whose annotator labels are collapsed here.
train_soft, label_count_soft = soft_labelling(test_data)
print(label_count_soft)
train_soft.head(5)

In [None]:
# Peek at the first five rows (head() defaults to n=5).
test_data.head()

Unnamed: 0,Y,comment
0,0/-1,It is easier to fool a million people than it...
1,0/0,NATURAL IMMUNITY protected us since evolutio...
2,0/-1,NATURAL IMMUNITY protected us since evolutio...
3,1/1/1/-1,The bigest sideffect of vaccines is fewer dea...
4,1/-1,Unvaccinated people are more likely to become...


In [None]:
# Remove emojis (and every other symbol) by dropping each character that is not
# a word character, whitespace, or one of:  # @ / : % . , _ -
# The pattern is a raw string so \w and \s reach the regex engine untouched.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the comments unchanged.
SYMBOL_PATTERN = r'[^\w\s#@/:%.,_-]'
test_data['comment'] = test_data['comment'].str.replace(
    SYMBOL_PATTERN, '', flags=re.UNICODE, regex=True
)
train_data['comment'] = train_data['comment'].str.replace(
    SYMBOL_PATTERN, '', flags=re.UNICODE, regex=True
)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [None]:
# Re-inspect the first five rows (head() defaults to n=5).
test_data.head()

Unnamed: 0,Y,comment
0,0/-1,It is easier to fool a million people than it...
1,0/0,NATURAL IMMUNITY protected us since evolutio...
2,0/-1,NATURAL IMMUNITY protected us since evolutio...
3,1/1/1/-1,The bigest sideffect of vaccines is fewer dea...
4,1/-1,Unvaccinated people are more likely to become...


Majority vote: for each comment, keep the label that occurs most often among its annotations.

In [None]:
def split_label_col(df):
    """
    Reduce the '/'-separated annotations in df['Y'] to one majority label per row.

    The label string is split into one column per annotation (at most 30 splits),
    the row-wise mode is taken, and the first mode column is used as the label.
    Returns a new DataFrame with columns 'Y' (majority label, as a string) and
    'comment', indexed like df.
    """
    per_annotation = df['Y'].str.split('/', n=30, expand=True)
    row_modes = per_annotation.mode(axis=1)
    # Ties produce extra mode columns; only the first one is kept.
    winning_label = row_modes.iloc[:, 0].rename('Y')
    return winning_label.to_frame().join(df['comment'])

In [None]:
# Majority-labelled version of the frame loaded from a3_train_final.tsv.
cc = test_data.pipe(split_label_col)
cc

Unnamed: 0,Y,comment
0,-1,It is easier to fool a million people than it...
1,0,NATURAL IMMUNITY protected us since evolutio...
2,-1,NATURAL IMMUNITY protected us since evolutio...
3,1,The bigest sideffect of vaccines is fewer dea...
4,-1,Unvaccinated people are more likely to become...
...,...,...
26192,0,no vaccine
26193,-1,
26194,0,keep your I already know 3 people who have b...
26195,0,"JUST BECAUSE ITS SAFE, DOESNT MEAN IT DOESNT ..."


In [None]:
# `cc` already carries both the majority label ('Y') and the 'comment' text.
# Joining test_data['comment'] onto it again raised
# "ValueError: columns overlap but no suffix specified", and renaming column 0
# was a no-op, so the labelled frame is simply copied into `df`.
df = cc.copy()
df

Unnamed: 0,Y,comment
0,-1,It is easier to fool a million people than it...
1,0,NATURAL IMMUNITY protected us since evolutio...
2,-1,NATURAL IMMUNITY protected us since evolutio...
3,1,The bigest sideffect of vaccines is fewer dea...
4,-1,Unvaccinated people are more likely to become...
...,...,...
26192,0,no vaccine
26193,-1,
26194,0,keep your I already know 3 people who have b...
26195,0,"JUST BECAUSE ITS SAFE, DOESNT MEAN IT DOESNT ..."


In [None]:
# Hold out 20% of the labelled comments for evaluation; the fixed random_state
# makes the split repeatable across runs.
X_train, X_eval, Y_train, Y_eval = train_test_split(
    df['comment'],
    df['Y'],
    test_size=0.2,
    random_state=12345,
)

In [None]:
def train_document_classifier(X, Y):
    """
    Fit a TF-IDF + linear SVM pipeline with default settings.

    X is the collection of text documents and Y the matching labels
    (the notebook passes the 'comment' and 'Y' columns). Returns the
    fitted pipeline.
    """
    vectorizer = TfidfVectorizer()
    classifier = LinearSVC()
    model = make_pipeline(vectorizer, classifier)
    model.fit(X, Y)
    return model

In [None]:
# Baseline model: default TF-IDF features + LinearSVC, fitted on the training split.
clf_comments = train_document_classifier(X=X_train, Y=Y_train)

In [None]:
# Accuracy of the baseline on the held-out evaluation split.
eval_predictions = clf_comments.predict(X_eval)
bb_acc = accuracy_score(Y_eval, eval_predictions)
bb_acc

0.7270992366412213

### With parameters

In [None]:
# NOTE(review): the value is the literal string 'bool', which looks like a
# placeholder noting the option's type rather than a usable setting -- confirm
# the intended value(s) before passing this to an estimator or a search.
params = {'smooth_idf': 'bool'}

In [None]:
# Candidate values per hyperparameter.
# NOTE(review): max_depth / min_samples_leaf look like tree-model settings --
# confirm which estimator this grid is meant for before using it in a search.
one_to_five = list(range(1, 6))
parameter_grid = {
    'max_depth': list(one_to_five),
    'max_features': list(one_to_five),
    'random_state': list(range(6)),
    # five evenly spaced values from 0.001 to 0.03 inclusive
    'min_samples_leaf': np.linspace(0.001, 0.03, 5),
}

In [None]:
def train_document_classifier(X, Y, tfidf_params=None, svc_params=None):
    """
    Fit a TF-IDF + linear SVM pipeline, optionally with custom hyperparameters.

    Called as train_document_classifier(X, Y) this behaves exactly like the
    default-settings version defined earlier in the notebook.

    Parameters
    ----------
    X : collection of text documents
        Passed unchanged to the pipeline's fit (the notebook passes the
        'comment' column).
    Y : array-like
        One label per document.
    tfidf_params : dict, optional
        Keyword arguments forwarded to TfidfVectorizer; None keeps its defaults.
    svc_params : dict, optional
        Keyword arguments forwarded to LinearSVC; None keeps its defaults.

    Returns
    -------
    The fitted pipeline.
    """
    # `or {}` turns the None defaults into empty kwargs without sharing a
    # mutable default dict between calls.
    vectorizer = TfidfVectorizer(**(tfidf_params or {}))
    classifier = LinearSVC(**(svc_params or {}))
    pipeline = make_pipeline(vectorizer, classifier)
    pipeline.fit(X, Y)
    return pipeline