In [1]:
from sklearn.neural_network import *
from sklearn.model_selection import *
from sklearn.linear_model import LogisticRegression
from sklearn.svm import *
from helper import *
import numpy as np
import time
from bert import *
from data_processing import *
import pandas as pd
from preprocess import *
from sklearn import preprocessing as skp
from tweetToVec import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load all the data

In [2]:
NUM_TWEETS = 60000
MAX_LEN = 60

In [3]:
path_to_tweet = '../data/twitter-datasets/'
pos_path = path_to_tweet + 'train_pos_full.txt'
neg_path = path_to_tweet + 'train_neg_full.txt'
test_path = path_to_tweet + 'test_data.txt'

df_train = load_train(pos_path, neg_path, NUM_TWEETS)


## Transform the data

In [4]:
#-----BERT-----
#df_train = bert_tokenize_train(df_train, path_to_tweet + 'out_train_testing.csv', max_len=MAX_LEN)
#add_padding(df_train)
#-----GloVe----
#Un-/comment next line to de-/activate preprocessing
df_train = preprocess(df_train, path_to_tweet + 'out_pre_train_testing.csv')


kf = KFold(n_splits=4, random_state=42, shuffle=True)
cross_val = kf.split(df_train)



In [5]:
#-------GloVe------
df_list = []
for s in df_train['tweet']:
    df_list.append(tweet_to_vec(s))
df_list = skp.scale(df_list)
X = np.array(df_list)
y = df_train['label']

In [15]:
#-------BERT------
"""df_list_bert = []
for token in df_train['input_ids']:
    df_list_bert.append(list(token))
df_list_bert = skp.scale(df_list_bert)
X = np.array(df_list_bert)
y = df_train['label']
"""

"df_list_bert = []\nfor token in df_train['input_ids']:\n    df_list_bert.append(list(token))\ndf_list_bert = skp.scale(df_list_bert)\nX = np.array(df_list_bert)\ny = df_train['label']\n"

## Train Models

In [None]:
#---MLP---
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
sizes = [50,100,150]
best_acc = 0
best_params = []
for alpha in np.logspace(-3, -1, 3):
    for activation in ['tanh']:
        print('alpha = ', alpha)
        for size in sizes:
            print("size =", size)
            clf2 = MLPClassifier(alpha = alpha, solver ='lbfgs', hidden_layer_sizes = (size,size,size), max_iter = 100, random_state = 12)
            for train, test in kf.split(X):
                #crossval
                x_tr = X[train]
                x_te = X[test]
                y_tr = y.iloc[train]
                y_te = y.iloc[test]

                start = time.time()
                clf2.fit(x_tr, y_tr)
                acc = clf2.score(x_te, y_te)
                if acc > best_acc:
                    best_acc = acc
                    best_params = [alpha,activation, size]
            print(best_acc)
    
#0.6430666666666667 [0.039810717055349734, relu, 150] <- BERT
#0.6869333333333333 [0.01, relu, 150] <- GloVe, preprocessing
#0.6622666666666667 [0.001, relu, 150] <- GloVe

In [17]:
best_acc_mlp = best_acc
best_acc_mlp

0.6869333333333333

In [None]:
#----LogReg---
best_acc = 0
best_params = []
for c in np.logspace(-5,0,6):
    print('c = {:f}'.format(c))
    for ratio in (np.arange(11)/10):
    #grid-search
        
        clf3 = LogisticRegression(C=c, penalty = 'elasticnet', n_jobs=-1, solver = 'saga', l1_ratio = ratio)

        accuracies = []
        for train, test in kf.split(X):
            #crossval
            x_tr = X[train]
            x_te = X[test]
            y_tr = y.iloc[train]
            y_te = y.iloc[test]

            clf3.fit(x_tr, y_tr)

            accuracies.append(clf3.score(x_te, y_te))
        acc = sum(accuracies)/len(accuracies)
        if acc > best_acc :
            best_acc = acc 
            best_params = [c, ratio]
    print(best_acc)
    print(best_params)

c = 0.000010
0.6170333333333333
[1e-05, 0.0]
c = 0.000100
0.62715
[0.0001, 0.0]
c = 0.001000
0.6374666666666666
[0.001, 0.0]
c = 0.010000
0.6496
[0.01, 0.0]
c = 0.100000
0.6511
[0.1, 0.6]
c = 1.000000


In [None]:
best_acc_log = best_acc
best_params_log = best_params
best_acc_log, best_params
#(0.6511, [0.1, 0.6]) <- GloVe + preprocess
#(0.6352833333333334, [0.01, 0.7]) <- GloVe
#(0.6178833333333332, [0.0001, 0.0]) <- bert


In [6]:
#SVM
best_acc = 0

for c in np.logspace(-5,-1, 5):
    for penalty in ['l2']:
        for loss in ['hinge', 'squared_hinge']:
            print('C = ', c, 'pen =', penalty, 'loss =', loss)
            clf4 = LinearSVC(C = c, penalty = penalty, dual=True, loss = loss, max_iter=100000, random_state = 42)

            accuracies = []
            for train, test in kf.split(X):
                #Cross-validation
                x_tr = X[train]
                x_te = X[test]
                y_tr = y.iloc[train]
                y_te = y.iloc[test]
                x_tr_l = []
                x_te_l = []

                clf4.fit(x_tr, y_tr)

                accuracies.append(clf4.score(x_te, y_te))
                acc = sum(accuracies)/len(accuracies)
                if acc > best_acc :
                    best_acc = acc 
                    best_params = [c,penalty, loss]
            print(best_acc)

C =  1e-05 pen = l2 loss = hinge
0.6103666666666667
C =  1e-05 pen = l2 loss = squared_hinge
0.6277083333333333
C =  0.0001 pen = l2 loss = hinge
0.6277083333333333
C =  0.0001 pen = l2 loss = squared_hinge
0.63925
C =  0.001 pen = l2 loss = hinge
0.642525
C =  0.001 pen = l2 loss = squared_hinge
0.6475250000000001
C =  0.01 pen = l2 loss = hinge
0.6481
C =  0.01 pen = l2 loss = squared_hinge
0.6481
C =  0.1 pen = l2 loss = hinge
0.6491333333333333
C =  0.1 pen = l2 loss = squared_hinge
0.6491333333333333


In [None]:
best_acc_svm = best_acc
best_params_svm = best_params
best_acc_svm, best_params_svm
#(0.6533333333333333, [0.1, 'l2', 'hinge']) GloVe + preprocessing
#0.64 BERT
#(0.636, [0.1, 'l2', 'hinge']) GloVe

In [None]:
import matplotlib.pyplot as plt
plt.ylim(0.6, 0.7)
plt.bar(['svm', 'mlp', 'log_reg'],[best_acc_svm, best_acc_mlp, best_acc_log], )