In [5]:
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.semi_supervised import SelfTrainingClassifier
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression

## Load Data

In [6]:
df = pd.read_csv("processed_train.csv")

In [7]:
df

Unnamed: 0.1,Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,0,86426,"['ask', 'native', 'american', 'take']",OFF,UNT,
1,1,90194,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF,TIN,IND
2,2,16820,"['amazon', 'investigating', 'chinese', 'employ...",NOT,,
3,3,62688,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF,UNT,
4,4,43605,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT,,
...,...,...,...,...,...,...
13235,13235,95338,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF,TIN,IND
13236,13236,67210,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT,,
13237,13237,82921,"['report', 'garbage', 'dont', 'give', 'crap']",OFF,TIN,OTH
13238,13238,27429,['pussy'],OFF,UNT,


## Removing Redundant Axes

In [8]:
df = df.drop(['Unnamed: 0', 'subtask_b', 'subtask_c', 'id'], axis=1)

In [9]:
df

Unnamed: 0,tweet,subtask_a
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


## Renaming Columns

In [10]:
df = df.rename(columns={'subtask_a': 'Offensive'})

In [11]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


## Converting Offensive to Numerical Value

In [12]:
def off(cls):
    if cls =='OFF':
        return 1
    elif cls == 'NOT':
        return 0

In [13]:
df["Offensive"] = df["Offensive"].apply(off)

In [14]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",1
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",1
2,"['amazon', 'investigating', 'chinese', 'employ...",0
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",1
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",0
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",1
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",0
13237,"['report', 'garbage', 'dont', 'give', 'crap']",1
13238,['pussy'],1


## Adding Unlabelled Data

In [59]:
df_unlab = pd.read_csv("processed_unlab.csv").drop(['Unnamed: 0', 'id'], axis=1)

In [60]:
df_unlab

Unnamed: 0,tweet
0,"['q', 'wheres', 'server', 'dump', 'nike', 'dec..."
1,"['constitution', 'day', 'revered', 'conservati..."
2,"['foxnews', 'nra', 'maga', 'potus', 'trump', '..."
3,"['watching', 'boomer', 'getting', 'news', 'sti..."
4,"['pasaran', 'unity', 'demo', 'oppose', 'farrig..."
...,...
1308,"['stop', 'etchecopar', 'fuck', '🖕', '🖕', '🖕', ..."
1309,"['antifa', 'mentally', 'unstable', 'cowards', ..."
1310,"['browning', 'looked', 'like', 'dog', 'shit', ..."
1311,"['two', 'taste', 'like', 'ass', 'url']"


In [61]:
values = [-1 for i in range(len(df_unlab))]

y_unlab = pd.DataFrame({'Offensive':values})

In [62]:
y_unlab

Unnamed: 0,Offensive
0,-1
1,-1
2,-1
3,-1
4,-1
...,...
1308,-1
1309,-1
1310,-1
1311,-1


In [63]:
df_unlab

Unnamed: 0,tweet
0,"['q', 'wheres', 'server', 'dump', 'nike', 'dec..."
1,"['constitution', 'day', 'revered', 'conservati..."
2,"['foxnews', 'nra', 'maga', 'potus', 'trump', '..."
3,"['watching', 'boomer', 'getting', 'news', 'sti..."
4,"['pasaran', 'unity', 'demo', 'oppose', 'farrig..."
...,...
1308,"['stop', 'etchecopar', 'fuck', '🖕', '🖕', '🖕', ..."
1309,"['antifa', 'mentally', 'unstable', 'cowards', ..."
1310,"['browning', 'looked', 'like', 'dog', 'shit', ..."
1311,"['two', 'taste', 'like', 'ass', 'url']"


In [64]:
df['tweet']

0                    ['ask', 'native', 'american', 'take']
1        ['go', 'home', '’', 'drunk', 'maga', 'trump', ...
2        ['amazon', 'investigating', 'chinese', 'employ...
3        ['someone', 'shouldve', 'taken', 'piece', 'shi...
4        ['obama', 'wanted', 'liberal', 'amp', 'illegal...
                               ...                        
13235    ['sometimes', 'get', 'strong', 'vibe', 'people...
13236    ['benidorm', '✅', 'creamfields', '✅', 'maga', ...
13237        ['report', 'garbage', 'dont', 'give', 'crap']
13238                                            ['pussy']
13239    ['spanishrevenge', 'v', 'justice', 'human', 'r...
Name: tweet, Length: 13240, dtype: object

## Splitting into train and test

In [83]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], shuffle=0)

In [84]:
X_train = X_train.reset_index().drop(['index'], axis=1)
X_train

Unnamed: 0,tweet
0,"['please', 'stop', 'nice', 'lying', 'obstructi..."
1,"['sure', 'bout', 'antifa', 'notion', 'open', '..."
2,"['prison', 'time', 'youre', 'going', 'get', 'm..."
3,"['decent', 'anywhere', 'people', 'respect', 'i..."
4,"['best', 'part', 'read', 'exchange', 'thread',..."
...,...
9925,"['opportunist', 'max', 'evil', 'lier', 'also',..."
9926,"['lt', 'help', 'call', 'father', 'let', 'know'..."
9927,"['suicide', 'mission', 'dont', 'put', 'bench',..."
9928,"['traitor', 'think']"


In [85]:
X_train_unlab = X_train.append(df_unlab).reset_index().drop(['index'], axis=1)

In [86]:
X_train_unlab

Unnamed: 0,tweet
0,"['please', 'stop', 'nice', 'lying', 'obstructi..."
1,"['sure', 'bout', 'antifa', 'notion', 'open', '..."
2,"['prison', 'time', 'youre', 'going', 'get', 'm..."
3,"['decent', 'anywhere', 'people', 'respect', 'i..."
4,"['best', 'part', 'read', 'exchange', 'thread',..."
...,...
11238,"['stop', 'etchecopar', 'fuck', '🖕', '🖕', '🖕', ..."
11239,"['antifa', 'mentally', 'unstable', 'cowards', ..."
11240,"['browning', 'looked', 'like', 'dog', 'shit', ..."
11241,"['two', 'taste', 'like', 'ass', 'url']"


In [87]:
X_train = X_train['tweet']

In [88]:
X_train

0       ['please', 'stop', 'nice', 'lying', 'obstructi...
1       ['sure', 'bout', 'antifa', 'notion', 'open', '...
2       ['prison', 'time', 'youre', 'going', 'get', 'm...
3       ['decent', 'anywhere', 'people', 'respect', 'i...
4       ['best', 'part', 'read', 'exchange', 'thread',...
                              ...                        
9925    ['opportunist', 'max', 'evil', 'lier', 'also',...
9926    ['lt', 'help', 'call', 'father', 'let', 'know'...
9927    ['suicide', 'mission', 'dont', 'put', 'bench',...
9928                                 ['traitor', 'think']
9929    ['there', 'murder', 'gun', 'death', 'state', '...
Name: tweet, Length: 9930, dtype: object

In [89]:
X_train_unlab = X_train_unlab['tweet']
X_train_unlab

0        ['please', 'stop', 'nice', 'lying', 'obstructi...
1        ['sure', 'bout', 'antifa', 'notion', 'open', '...
2        ['prison', 'time', 'youre', 'going', 'get', 'm...
3        ['decent', 'anywhere', 'people', 'respect', 'i...
4        ['best', 'part', 'read', 'exchange', 'thread',...
                               ...                        
11238    ['stop', 'etchecopar', 'fuck', '🖕', '🖕', '🖕', ...
11239    ['antifa', 'mentally', 'unstable', 'cowards', ...
11240    ['browning', 'looked', 'like', 'dog', 'shit', ...
11241               ['two', 'taste', 'like', 'ass', 'url']
11242    ['despicable', 'dems', 'lie', 'rifles', 'dem',...
Name: tweet, Length: 11243, dtype: object

In [90]:
X_test = X_test.reset_index().drop(['index'], axis=1)
X_test

Unnamed: 0,tweet
0,"['time', 'political', 'correctness', 'lie', 'e..."
1,"['speak', 'trump', 'family', 'become', 'punchi..."
2,"['showing', 'way', 'others', 'great', 'leader'..."
3,"['far', 'kind', '😍', '😊', '🤗']"
4,"['url', 'isnt', 'peter', 'strzok', 'lisa', 'pa..."
...,...
3305,"['reason', '’', 'wife', 'as']"
3306,"['idk', 'think', 'herbut', 'eye', 'like', 'rsh..."
3307,"['dier', 'imagine', 'durham', 'coming', 'even'..."
3308,"['uh', 'signature', 'record', 'genius', '🚂', '..."


In [91]:
X_test = X_test['tweet']
X_test

0       ['time', 'political', 'correctness', 'lie', 'e...
1       ['speak', 'trump', 'family', 'become', 'punchi...
2       ['showing', 'way', 'others', 'great', 'leader'...
3                          ['far', 'kind', '😍', '😊', '🤗']
4       ['url', 'isnt', 'peter', 'strzok', 'lisa', 'pa...
                              ...                        
3305                        ['reason', '’', 'wife', 'as']
3306    ['idk', 'think', 'herbut', 'eye', 'like', 'rsh...
3307    ['dier', 'imagine', 'durham', 'coming', 'even'...
3308    ['uh', 'signature', 'record', 'genius', '🚂', '...
3309    ['withdraw', 'trump', 'whole', 'list', 'conser...
Name: tweet, Length: 3310, dtype: object

In [92]:
y_train = y_train.reset_index().drop(['index'], axis=1)
y_train

Unnamed: 0,Offensive
0,0
1,0
2,1
3,0
4,0
...,...
9925,1
9926,0
9927,0
9928,1


In [93]:
y_test = y_test.reset_index().drop(['index'], axis=1)
y_test

Unnamed: 0,Offensive
0,0
1,0
2,0
3,0
4,1
...,...
3305,1
3306,1
3307,0
3308,0


In [94]:
y_train_unlab = y_train.append(y_unlab)
y_train_unlab

Unnamed: 0,Offensive
0,0
1,0
2,1
3,0
4,0
...,...
1308,-1
1309,-1
1310,-1
1311,-1


In [95]:
y_train_unlab = y_train_unlab.reset_index().drop(['index'], axis = 1)
y_train_unlab

Unnamed: 0,Offensive
0,0
1,0
2,1
3,0
4,0
...,...
11238,-1
11239,-1
11240,-1
11241,-1


### TF-IDF

In [96]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [97]:
vect_gram = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range = (1, 3), max_df=5)

### Testing on Supervised Classifiers to get Hyperparameters

In [98]:
X_train_gram = vect_gram.fit_transform(X_train)

In [99]:
X_train_gram

<9930x154235 sparse matrix of type '<class 'numpy.float64'>'
	with 170504 stored elements in Compressed Sparse Row format>

In [100]:
vect_gram.get_feature_names()

['aa',
 'aa mr',
 'aa mr bean',
 'aa sary',
 'aa sary liberal',
 'aaa',
 'aaa aaay',
 'aaa aaay eer',
 'aaa dibales',
 'aaa dibales dong',
 'aaa exciting',
 'aaa exciting majority',
 'aaah',
 'aaah thank',
 'aaah thank sm',
 'aaahh',
 'aaahh scribe',
 'aaahh scribe run',
 'aaay',
 'aaay eer',
 'aaay eer url',
 'aaron',
 'aaron bank',
 'aaron bank amp',
 'aaron barely',
 'aaron barely year',
 'aaron hernandez',
 'aaron hernandez know',
 'aaron rodgers',
 'aaron rodgers win',
 'aarp',
 'aarp stop',
 'aarp stop biased',
 'aasertions',
 'aasertions voting',
 'aasertions voting amp',
 'ab confirm',
 'ab confirm judge',
 'ab dumb',
 'ab dumb shane',
 'ab literally',
 'ab literally offense',
 'ab lot',
 'ab lot defined',
 'ab player',
 'ab player today',
 'ab winning',
 'ab winning rugby',
 'aba',
 'aba leadership',
 'aba leadership champion',
 'aback',
 'abandon',
 'abandon brexit',
 'abandon brexit liberal',
 'abandon conservative',
 'abandon conservative want',
 'abbott',
 'abbott conserva

In [101]:
clf_gram = LogisticRegression()

In [102]:
scores_gram = dict()

In [103]:
C = list(np.arange(0, 1, 0.05))

C = [float(i) for i in C]

C = C[1:]

C

[0.05,
 0.1,
 0.15000000000000002,
 0.2,
 0.25,
 0.30000000000000004,
 0.35000000000000003,
 0.4,
 0.45,
 0.5,
 0.55,
 0.6000000000000001,
 0.65,
 0.7000000000000001,
 0.75,
 0.8,
 0.8500000000000001,
 0.9,
 0.9500000000000001]

In [104]:
for i in C:
    clf_gram = LogisticRegression(C= i, max_iter=1000)
    clf_gram.fit(X_train_gram, y_train)
    scores_gram[i] = clf_gram.score(vect_gram.transform(X_test), y_test)
    
scores_gram

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


{0.05: 0.6676737160120846,
 0.1: 0.6676737160120846,
 0.15000000000000002: 0.6676737160120846,
 0.2: 0.6676737160120846,
 0.25: 0.6676737160120846,
 0.30000000000000004: 0.6676737160120846,
 0.35000000000000003: 0.6676737160120846,
 0.4: 0.66797583081571,
 0.45: 0.66797583081571,
 0.5: 0.66797583081571,
 0.55: 0.66797583081571,
 0.6000000000000001: 0.6682779456193354,
 0.65: 0.6682779456193354,
 0.7000000000000001: 0.6685800604229607,
 0.75: 0.6688821752265861,
 0.8: 0.6688821752265861,
 0.8500000000000001: 0.6691842900302115,
 0.9: 0.6694864048338368,
 0.9500000000000001: 0.6697885196374622}

In [105]:
best_C = max(scores_gram, key=scores_gram.get)

print(best_C)

0.9500000000000001


In [106]:
clf = LogisticRegression(C=best_C)

### SSL Part

In [107]:
ssl1 = SelfTrainingClassifier(clf)

In [108]:
X_train_unlab_vect = vect_gram.fit_transform(X_train_unlab)

In [109]:
X_train_unlab_vect

<11243x173007 sparse matrix of type '<class 'numpy.float64'>'
	with 201270 stored elements in Compressed Sparse Row format>

In [110]:
ssl1.fit(X_train_unlab_vect, y_train_unlab)

  return f(*args, **kwargs)


SelfTrainingClassifier(base_estimator=LogisticRegression(C=0.9500000000000001))

In [111]:
ssl1.score(vect_gram.transform(X_test), y_test)

0.6685800604229607

### CBOW

In [36]:
#Sentence vectoriser - Average of sentence for each vector
def sent_vect(sent, model):
    sent = eval(sent)
    vec = np.zeros(32)
    num = 0
    for w in sent:
        try:
            temp_vec = model.wv.get_vector(w)
            #print(temp_vec)
            vec = np.add(vec, temp_vec)
            num += 1
        except:
            pass
        
    return vec / np.sqrt(vec.dot(vec))

In [37]:
X_train_cbow = pd.DataFrame(X_train.apply(sent_vect, model=cbow).sort_index())
X_train_cbow

Unnamed: 0,tweet
0,"[0.10371455167512132, -0.0026690812901859584, ..."
1,"[0.07306809735589292, 0.10742395934446738, 0.1..."
2,"[0.12152500412207404, 0.03987473932714874, 0.1..."
3,"[0.15234790567554005, 0.07549362942942803, 0.1..."
4,"[0.17659919052198297, 0.07970519122554563, 0.0..."
...,...
9925,"[0.1268052961219268, -0.15723430540881764, 0.1..."
9926,"[0.11029270031188039, 0.08434713848718038, 0.1..."
9927,"[0.07075947084083557, 0.020389900997694718, 0...."
9928,"[0.09825704417810942, -0.09855605385106399, 0...."


In [38]:
X_train_cbow_unlab = pd.DataFrame(X_train_unlab.apply(sent_vect, model=cbow).sort_index())
X_train_cbow_unlab

Unnamed: 0,tweet
0,"[0.10371455167512132, -0.0026690812901859584, ..."
1,"[0.07306809735589292, 0.10742395934446738, 0.1..."
2,"[0.12152500412207404, 0.03987473932714874, 0.1..."
3,"[0.15234790567554005, 0.07549362942942803, 0.1..."
4,"[0.17659919052198297, 0.07970519122554563, 0.0..."
...,...
11238,"[0.08074036922904741, 0.023623614673037703, 0...."
11239,"[0.025660048931230296, -0.019617122927809535, ..."
11240,"[0.2108769863318312, 0.09056786766907235, 0.17..."
11241,"[0.07118914638028505, 0.01962914238976545, 0.2..."


In [39]:
def convert_df(df):
    df_new = pd.DataFrame(columns=list(range(32)))
    for ind in df.index:
        #print(df.iloc[ind][0])
        df_test = pd.DataFrame([[k for k in df.iloc[ind][0]]], columns=list(range(32)))
        #print(df_test)
        df_new = df_new.append(df_test)
            
    return df_new

In [40]:
X_train_cbow = convert_df(X_train_cbow)
X_train_cbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.103715,-0.002669,0.153950,0.237088,0.150675,0.039473,-0.195056,-0.123852,-0.135551,-0.243535,...,-0.040841,0.295382,-0.165185,-0.095019,-0.135421,0.270145,-0.136805,-0.360417,-0.013511,-0.042050
0,0.073068,0.107424,0.124951,0.210843,0.136943,0.112223,-0.216901,-0.158670,-0.104263,-0.230354,...,-0.075750,0.237339,-0.120083,-0.047526,-0.119075,0.238800,-0.165567,-0.363928,0.031739,-0.097591
0,0.121525,0.039875,0.136009,0.191058,0.122848,0.136994,-0.186814,-0.126776,-0.084429,-0.229106,...,-0.112223,0.311797,-0.094857,-0.041932,-0.116575,0.279989,-0.198768,-0.341706,-0.029827,-0.073104
0,0.152348,0.075494,0.128314,0.248046,0.166745,0.046800,-0.235776,-0.084590,-0.143277,-0.183683,...,-0.076028,0.267752,-0.061788,-0.067676,-0.124432,0.360069,-0.151870,-0.352549,0.040473,-0.085711
0,0.176599,0.079705,0.089254,0.251721,0.144600,0.004873,-0.232831,-0.091239,-0.153556,-0.126065,...,-0.099872,0.273970,-0.048746,-0.073946,-0.160822,0.387989,-0.157689,-0.365438,0.027586,-0.076929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.126805,-0.157234,0.147893,0.072082,0.165811,0.132364,-0.074539,-0.142090,-0.020093,-0.245621,...,-0.077936,0.364318,-0.137421,-0.094633,-0.068587,0.311153,-0.099652,-0.256199,-0.058486,-0.065221
0,0.110293,0.084347,0.113860,0.235922,0.140378,0.093828,-0.228333,-0.119690,-0.112826,-0.212017,...,-0.085178,0.262552,-0.084069,-0.034056,-0.119824,0.280386,-0.196844,-0.355833,0.008985,-0.087785
0,0.070759,0.020390,0.284672,0.151575,0.171471,0.144294,-0.215269,-0.037508,-0.096271,-0.269813,...,-0.117576,0.196936,-0.111194,0.060687,-0.249338,0.211172,-0.127526,-0.284816,0.046580,-0.081126
0,0.098257,-0.098556,0.137465,0.132955,0.144681,0.106873,-0.107304,-0.148732,-0.064684,-0.268796,...,-0.098630,0.335183,-0.160518,-0.079548,-0.097329,0.254282,-0.119071,-0.334934,-0.048336,-0.069200


In [41]:
X_train_cbow = X_train_cbow.reset_index().drop(['index'], axis=1).fillna(0)
X_train_cbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.103715,-0.002669,0.153950,0.237088,0.150675,0.039473,-0.195056,-0.123852,-0.135551,-0.243535,...,-0.040841,0.295382,-0.165185,-0.095019,-0.135421,0.270145,-0.136805,-0.360417,-0.013511,-0.042050
1,0.073068,0.107424,0.124951,0.210843,0.136943,0.112223,-0.216901,-0.158670,-0.104263,-0.230354,...,-0.075750,0.237339,-0.120083,-0.047526,-0.119075,0.238800,-0.165567,-0.363928,0.031739,-0.097591
2,0.121525,0.039875,0.136009,0.191058,0.122848,0.136994,-0.186814,-0.126776,-0.084429,-0.229106,...,-0.112223,0.311797,-0.094857,-0.041932,-0.116575,0.279989,-0.198768,-0.341706,-0.029827,-0.073104
3,0.152348,0.075494,0.128314,0.248046,0.166745,0.046800,-0.235776,-0.084590,-0.143277,-0.183683,...,-0.076028,0.267752,-0.061788,-0.067676,-0.124432,0.360069,-0.151870,-0.352549,0.040473,-0.085711
4,0.176599,0.079705,0.089254,0.251721,0.144600,0.004873,-0.232831,-0.091239,-0.153556,-0.126065,...,-0.099872,0.273970,-0.048746,-0.073946,-0.160822,0.387989,-0.157689,-0.365438,0.027586,-0.076929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9925,0.126805,-0.157234,0.147893,0.072082,0.165811,0.132364,-0.074539,-0.142090,-0.020093,-0.245621,...,-0.077936,0.364318,-0.137421,-0.094633,-0.068587,0.311153,-0.099652,-0.256199,-0.058486,-0.065221
9926,0.110293,0.084347,0.113860,0.235922,0.140378,0.093828,-0.228333,-0.119690,-0.112826,-0.212017,...,-0.085178,0.262552,-0.084069,-0.034056,-0.119824,0.280386,-0.196844,-0.355833,0.008985,-0.087785
9927,0.070759,0.020390,0.284672,0.151575,0.171471,0.144294,-0.215269,-0.037508,-0.096271,-0.269813,...,-0.117576,0.196936,-0.111194,0.060687,-0.249338,0.211172,-0.127526,-0.284816,0.046580,-0.081126
9928,0.098257,-0.098556,0.137465,0.132955,0.144681,0.106873,-0.107304,-0.148732,-0.064684,-0.268796,...,-0.098630,0.335183,-0.160518,-0.079548,-0.097329,0.254282,-0.119071,-0.334934,-0.048336,-0.069200


In [42]:
X_train_cbow_unlab = convert_df(X_train_cbow_unlab)
X_train_cbow_unlab

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.103715,-0.002669,0.153950,0.237088,0.150675,0.039473,-0.195056,-0.123852,-0.135551,-0.243535,...,-0.040841,0.295382,-0.165185,-0.095019,-0.135421,0.270145,-0.136805,-0.360417,-0.013511,-0.042050
0,0.073068,0.107424,0.124951,0.210843,0.136943,0.112223,-0.216901,-0.158670,-0.104263,-0.230354,...,-0.075750,0.237339,-0.120083,-0.047526,-0.119075,0.238800,-0.165567,-0.363928,0.031739,-0.097591
0,0.121525,0.039875,0.136009,0.191058,0.122848,0.136994,-0.186814,-0.126776,-0.084429,-0.229106,...,-0.112223,0.311797,-0.094857,-0.041932,-0.116575,0.279989,-0.198768,-0.341706,-0.029827,-0.073104
0,0.152348,0.075494,0.128314,0.248046,0.166745,0.046800,-0.235776,-0.084590,-0.143277,-0.183683,...,-0.076028,0.267752,-0.061788,-0.067676,-0.124432,0.360069,-0.151870,-0.352549,0.040473,-0.085711
0,0.176599,0.079705,0.089254,0.251721,0.144600,0.004873,-0.232831,-0.091239,-0.153556,-0.126065,...,-0.099872,0.273970,-0.048746,-0.073946,-0.160822,0.387989,-0.157689,-0.365438,0.027586,-0.076929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.080740,0.023624,0.186916,0.227863,0.151424,0.089884,-0.221890,-0.091094,-0.130406,-0.261291,...,-0.034893,0.219204,-0.137004,-0.047590,-0.136549,0.249172,-0.145938,-0.336659,0.030415,-0.061070
0,0.025660,-0.019617,0.212872,0.166722,0.130383,0.160049,-0.213654,-0.129406,-0.068366,-0.302671,...,-0.052571,0.214512,-0.140285,0.011502,-0.146860,0.179412,-0.173123,-0.297320,-0.007406,-0.066752
0,0.210877,0.090568,0.175094,0.275844,0.038677,0.020650,-0.116821,-0.125229,-0.215237,-0.165218,...,-0.071957,0.302938,-0.141455,-0.171995,-0.177120,0.283961,-0.064634,-0.291319,-0.018566,-0.020657
0,0.071189,0.019629,0.213370,0.208463,0.160716,0.121284,-0.204073,-0.100215,-0.106609,-0.293082,...,-0.062641,0.222766,-0.168441,-0.027969,-0.157000,0.203905,-0.138687,-0.313509,0.016646,-0.060254


In [43]:
X_train_cbow_unlab = X_train_cbow_unlab.reset_index().drop(['index'], axis=1).fillna(0)
X_train_cbow_unlab

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.103715,-0.002669,0.153950,0.237088,0.150675,0.039473,-0.195056,-0.123852,-0.135551,-0.243535,...,-0.040841,0.295382,-0.165185,-0.095019,-0.135421,0.270145,-0.136805,-0.360417,-0.013511,-0.042050
1,0.073068,0.107424,0.124951,0.210843,0.136943,0.112223,-0.216901,-0.158670,-0.104263,-0.230354,...,-0.075750,0.237339,-0.120083,-0.047526,-0.119075,0.238800,-0.165567,-0.363928,0.031739,-0.097591
2,0.121525,0.039875,0.136009,0.191058,0.122848,0.136994,-0.186814,-0.126776,-0.084429,-0.229106,...,-0.112223,0.311797,-0.094857,-0.041932,-0.116575,0.279989,-0.198768,-0.341706,-0.029827,-0.073104
3,0.152348,0.075494,0.128314,0.248046,0.166745,0.046800,-0.235776,-0.084590,-0.143277,-0.183683,...,-0.076028,0.267752,-0.061788,-0.067676,-0.124432,0.360069,-0.151870,-0.352549,0.040473,-0.085711
4,0.176599,0.079705,0.089254,0.251721,0.144600,0.004873,-0.232831,-0.091239,-0.153556,-0.126065,...,-0.099872,0.273970,-0.048746,-0.073946,-0.160822,0.387989,-0.157689,-0.365438,0.027586,-0.076929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11238,0.080740,0.023624,0.186916,0.227863,0.151424,0.089884,-0.221890,-0.091094,-0.130406,-0.261291,...,-0.034893,0.219204,-0.137004,-0.047590,-0.136549,0.249172,-0.145938,-0.336659,0.030415,-0.061070
11239,0.025660,-0.019617,0.212872,0.166722,0.130383,0.160049,-0.213654,-0.129406,-0.068366,-0.302671,...,-0.052571,0.214512,-0.140285,0.011502,-0.146860,0.179412,-0.173123,-0.297320,-0.007406,-0.066752
11240,0.210877,0.090568,0.175094,0.275844,0.038677,0.020650,-0.116821,-0.125229,-0.215237,-0.165218,...,-0.071957,0.302938,-0.141455,-0.171995,-0.177120,0.283961,-0.064634,-0.291319,-0.018566,-0.020657
11241,0.071189,0.019629,0.213370,0.208463,0.160716,0.121284,-0.204073,-0.100215,-0.106609,-0.293082,...,-0.062641,0.222766,-0.168441,-0.027969,-0.157000,0.203905,-0.138687,-0.313509,0.016646,-0.060254


In [44]:
X_test_cbow = pd.DataFrame(X_test.apply(sent_vect, model=cbow).sort_index())

In [45]:
X_test_cbow = convert_df(X_test_cbow)
X_test_cbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.229052,0.089740,0.195986,0.265431,-0.006031,-0.010653,-0.091910,-0.126556,-0.249633,-0.145820,...,-0.066169,0.306685,-0.134145,-0.200603,-0.191874,0.297304,-0.026670,-0.255334,-0.036733,0.005425
0,0.069430,-0.016187,0.193671,0.233152,0.144982,0.044482,-0.200227,-0.113926,-0.131688,-0.268310,...,-0.028204,0.257530,-0.203193,-0.077521,-0.164237,0.217363,-0.109929,-0.338992,-0.015274,-0.029334
0,0.164029,0.073729,0.144673,0.222471,0.151144,0.078541,-0.209050,-0.093368,-0.121834,-0.187861,...,-0.097796,0.302909,-0.067645,-0.084195,-0.140072,0.347354,-0.148250,-0.330508,0.011487,-0.069974
0,0.074931,0.029382,0.127711,0.208167,0.120668,0.134064,-0.213340,-0.148618,-0.077489,-0.244585,...,-0.066669,0.267883,-0.112311,-0.022620,-0.113588,0.231845,-0.207471,-0.338523,-0.015166,-0.079058
0,0.030049,-0.043330,0.274949,0.144535,0.054198,0.091007,-0.146495,-0.136082,-0.129442,-0.326819,...,0.005426,0.216887,-0.231952,-0.051964,-0.182232,0.119065,-0.097366,-0.258316,-0.035582,-0.008138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.027757,0.028976,0.198936,0.190184,0.178006,0.120154,-0.207207,-0.115679,-0.092229,-0.283381,...,-0.062589,0.212767,-0.179297,-0.023615,-0.166009,0.138644,-0.155425,-0.330208,0.025486,-0.065843
0,0.142575,0.101029,0.101807,0.252169,0.131004,0.077271,-0.228520,-0.096394,-0.122143,-0.169309,...,-0.079083,0.282786,-0.079120,-0.071414,-0.115208,0.318972,-0.201874,-0.366205,0.018585,-0.067642
0,0.078959,0.019989,0.203190,0.179240,0.113979,0.122934,-0.208469,-0.128541,-0.106271,-0.281634,...,-0.051083,0.260463,-0.142652,-0.033458,-0.138051,0.238075,-0.165277,-0.309097,-0.020612,-0.055424
0,0.117153,0.073279,0.147497,0.249551,0.136645,0.073572,-0.225872,-0.101775,-0.137082,-0.216418,...,-0.062271,0.247699,-0.100108,-0.056029,-0.131267,0.286019,-0.156764,-0.355000,0.022938,-0.074697


In [46]:
X_test_cbow = X_test_cbow.reset_index().drop(['index'], axis=1).fillna(0)
X_test_cbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.229052,0.089740,0.195986,0.265431,-0.006031,-0.010653,-0.091910,-0.126556,-0.249633,-0.145820,...,-0.066169,0.306685,-0.134145,-0.200603,-0.191874,0.297304,-0.026670,-0.255334,-0.036733,0.005425
1,0.069430,-0.016187,0.193671,0.233152,0.144982,0.044482,-0.200227,-0.113926,-0.131688,-0.268310,...,-0.028204,0.257530,-0.203193,-0.077521,-0.164237,0.217363,-0.109929,-0.338992,-0.015274,-0.029334
2,0.164029,0.073729,0.144673,0.222471,0.151144,0.078541,-0.209050,-0.093368,-0.121834,-0.187861,...,-0.097796,0.302909,-0.067645,-0.084195,-0.140072,0.347354,-0.148250,-0.330508,0.011487,-0.069974
3,0.074931,0.029382,0.127711,0.208167,0.120668,0.134064,-0.213340,-0.148618,-0.077489,-0.244585,...,-0.066669,0.267883,-0.112311,-0.022620,-0.113588,0.231845,-0.207471,-0.338523,-0.015166,-0.079058
4,0.030049,-0.043330,0.274949,0.144535,0.054198,0.091007,-0.146495,-0.136082,-0.129442,-0.326819,...,0.005426,0.216887,-0.231952,-0.051964,-0.182232,0.119065,-0.097366,-0.258316,-0.035582,-0.008138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3305,0.027757,0.028976,0.198936,0.190184,0.178006,0.120154,-0.207207,-0.115679,-0.092229,-0.283381,...,-0.062589,0.212767,-0.179297,-0.023615,-0.166009,0.138644,-0.155425,-0.330208,0.025486,-0.065843
3306,0.142575,0.101029,0.101807,0.252169,0.131004,0.077271,-0.228520,-0.096394,-0.122143,-0.169309,...,-0.079083,0.282786,-0.079120,-0.071414,-0.115208,0.318972,-0.201874,-0.366205,0.018585,-0.067642
3307,0.078959,0.019989,0.203190,0.179240,0.113979,0.122934,-0.208469,-0.128541,-0.106271,-0.281634,...,-0.051083,0.260463,-0.142652,-0.033458,-0.138051,0.238075,-0.165277,-0.309097,-0.020612,-0.055424
3308,0.117153,0.073279,0.147497,0.249551,0.136645,0.073572,-0.225872,-0.101775,-0.137082,-0.216418,...,-0.062271,0.247699,-0.100108,-0.056029,-0.131267,0.286019,-0.156764,-0.355000,0.022938,-0.074697


## SkipGram

In [81]:
X_train_sg = pd.DataFrame(X_train.apply(sent_vect, model=skg).sort_index())
X_train_sg

Unnamed: 0,tweet
0,"[0.1385263343767096, 0.019514727808935924, 0.1..."
1,"[-0.08006676582537567, 0.14220038026696025, 0...."
2,"[0.12745751758561877, 0.04590809994137651, 0.1..."
3,"[0.148907211855406, 0.045357729192608165, 0.14..."
4,"[0.16997259111404198, 0.02170235144521857, 0.1..."
...,...
9925,"[0.12765205081760966, -0.08407392444930872, 0...."
9926,"[0.0927742904159179, 0.10098635879091483, 0.18..."
9927,"[0.03132195112290745, -0.008785520980705315, 0..."
9928,"[0.12725105628106392, 0.0018205805003424804, 0..."


In [83]:
X_train_sg = convert_df(X_train_sg)
X_train_sg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.138526,0.019515,0.166428,0.216446,0.138855,-0.055935,-0.304432,-0.105380,-0.253149,-0.145664,...,0.044108,0.247816,-0.014868,-0.132315,-0.070216,0.392828,-0.073529,-0.348592,-0.013779,-0.117852
0,-0.080067,0.142200,0.089483,0.227434,0.094180,-0.101437,-0.283416,-0.271291,-0.158280,-0.155238,...,0.004326,0.131149,0.010339,-0.120501,-0.036472,0.269971,-0.075283,-0.347463,0.135049,-0.180314
0,0.127458,0.045908,0.189589,0.198233,0.063745,0.024450,-0.278034,-0.142304,-0.173098,-0.198859,...,-0.021820,0.279669,-0.020304,-0.031698,-0.058815,0.325629,-0.146991,-0.255908,-0.032594,-0.118372
0,0.148907,0.045358,0.144209,0.225686,0.135249,-0.076277,-0.290739,-0.070622,-0.252579,-0.185700,...,-0.029599,0.221062,0.045011,-0.093727,-0.040805,0.419570,-0.083075,-0.267348,0.001566,-0.162518
0,0.169973,0.021702,0.155327,0.146038,0.122168,-0.014653,-0.339412,-0.174432,-0.236059,-0.159709,...,-0.032499,0.235318,-0.047131,-0.052464,-0.139022,0.347737,-0.113199,-0.292628,-0.094603,-0.110658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.127652,-0.084074,0.174262,0.150152,0.113654,0.040448,-0.155156,-0.126418,-0.140530,-0.202361,...,0.007135,0.391230,0.026749,-0.054714,-0.156879,0.304719,-0.058767,-0.119983,-0.059841,-0.133095
0,0.092774,0.100986,0.181881,0.224044,0.066471,-0.086826,-0.303038,-0.138654,-0.207029,-0.215850,...,0.057712,0.176765,0.016488,-0.034340,-0.111519,0.332314,-0.140747,-0.275698,0.007117,-0.164574
0,0.031322,-0.008786,0.191570,-0.038806,0.202409,-0.133450,-0.266983,-0.178629,-0.201017,-0.175390,...,-0.000081,-0.004056,0.007496,0.073737,-0.116133,0.245116,-0.093048,-0.435757,0.029019,-0.049457
0,0.127251,0.001821,0.158656,0.194201,0.095285,0.002959,-0.245097,-0.140138,-0.183438,-0.190055,...,0.017772,0.333084,-0.034208,-0.092449,-0.104142,0.299297,-0.093698,-0.247580,-0.028847,-0.121487


In [85]:
X_train_sg = X_train_sg.reset_index().drop(['index'], axis=1).fillna(0)
X_train_sg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.138526,0.019515,0.166428,0.216446,0.138855,-0.055935,-0.304432,-0.105380,-0.253149,-0.145664,...,0.044108,0.247816,-0.014868,-0.132315,-0.070216,0.392828,-0.073529,-0.348592,-0.013779,-0.117852
1,-0.080067,0.142200,0.089483,0.227434,0.094180,-0.101437,-0.283416,-0.271291,-0.158280,-0.155238,...,0.004326,0.131149,0.010339,-0.120501,-0.036472,0.269971,-0.075283,-0.347463,0.135049,-0.180314
2,0.127458,0.045908,0.189589,0.198233,0.063745,0.024450,-0.278034,-0.142304,-0.173098,-0.198859,...,-0.021820,0.279669,-0.020304,-0.031698,-0.058815,0.325629,-0.146991,-0.255908,-0.032594,-0.118372
3,0.148907,0.045358,0.144209,0.225686,0.135249,-0.076277,-0.290739,-0.070622,-0.252579,-0.185700,...,-0.029599,0.221062,0.045011,-0.093727,-0.040805,0.419570,-0.083075,-0.267348,0.001566,-0.162518
4,0.169973,0.021702,0.155327,0.146038,0.122168,-0.014653,-0.339412,-0.174432,-0.236059,-0.159709,...,-0.032499,0.235318,-0.047131,-0.052464,-0.139022,0.347737,-0.113199,-0.292628,-0.094603,-0.110658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9925,0.127652,-0.084074,0.174262,0.150152,0.113654,0.040448,-0.155156,-0.126418,-0.140530,-0.202361,...,0.007135,0.391230,0.026749,-0.054714,-0.156879,0.304719,-0.058767,-0.119983,-0.059841,-0.133095
9926,0.092774,0.100986,0.181881,0.224044,0.066471,-0.086826,-0.303038,-0.138654,-0.207029,-0.215850,...,0.057712,0.176765,0.016488,-0.034340,-0.111519,0.332314,-0.140747,-0.275698,0.007117,-0.164574
9927,0.031322,-0.008786,0.191570,-0.038806,0.202409,-0.133450,-0.266983,-0.178629,-0.201017,-0.175390,...,-0.000081,-0.004056,0.007496,0.073737,-0.116133,0.245116,-0.093048,-0.435757,0.029019,-0.049457
9928,0.127251,0.001821,0.158656,0.194201,0.095285,0.002959,-0.245097,-0.140138,-0.183438,-0.190055,...,0.017772,0.333084,-0.034208,-0.092449,-0.104142,0.299297,-0.093698,-0.247580,-0.028847,-0.121487


In [82]:
X_test_sg = pd.DataFrame(X_test.apply(sent_vect, model=skg).sort_index())
X_test_sg

Unnamed: 0,tweet
0,"[0.18875114073130442, 0.08504719451227362, 0.1..."
1,"[0.05188641031166471, 0.013299822209388771, 0...."
2,"[0.13536494020923442, 0.0696965417494669, 0.18..."
3,"[0.0894113589072249, 0.07385733091233998, 0.25..."
4,"[0.0072756068506350455, -0.026303194567468446,..."
...,...
3305,"[0.0020736232913729324, 0.02083250365649504, 0..."
3306,"[0.08188323487742742, 0.0770203957617591, 0.16..."
3307,"[0.09181048100092186, 0.07948521787336231, 0.2..."
3308,"[0.1345131574072599, 0.07300238704530669, 0.16..."


In [84]:
X_test_sg = convert_df(X_test_sg)
X_test_sg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.188751,0.085047,0.196057,0.239633,0.008141,-0.116937,-0.242850,-0.122347,-0.285015,-0.094142,...,0.011010,0.237834,-0.019016,-0.185786,-0.102007,0.389863,-0.046258,-0.235252,-0.057892,-0.096117
0,0.051886,0.013300,0.199116,0.219596,0.103695,-0.123744,-0.340428,-0.031157,-0.307546,-0.133739,...,0.075871,0.235019,-0.106251,-0.098781,-0.133301,0.343419,-0.004178,-0.283949,-0.063387,-0.102155
0,0.135365,0.069697,0.184692,0.214738,0.074391,-0.009202,-0.339338,-0.087252,-0.191786,-0.150359,...,0.005173,0.298982,-0.004578,-0.102171,-0.097365,0.357899,-0.110279,-0.246588,-0.044963,-0.097967
0,0.089411,0.073857,0.254223,0.236446,0.032962,0.035375,-0.299110,-0.123629,-0.179272,-0.194257,...,0.004952,0.270153,-0.033486,-0.031937,-0.097374,0.297543,-0.142485,-0.225717,-0.026829,-0.099505
0,0.007276,-0.026303,0.310812,0.187619,-0.046498,0.005140,-0.335359,-0.163222,-0.214707,-0.262072,...,0.128120,0.193110,-0.144317,-0.011589,-0.153744,0.206998,-0.053822,-0.196023,-0.069643,-0.040368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.002074,0.020833,0.233672,0.193114,0.168599,-0.062129,-0.267139,-0.012442,-0.270407,-0.171040,...,0.006042,0.169996,-0.171744,-0.082573,-0.176128,0.228615,-0.149781,-0.174326,0.015865,-0.178080
0,0.081883,0.077020,0.163880,0.256423,0.065097,-0.064530,-0.311738,-0.073156,-0.239171,-0.110516,...,0.023069,0.256452,-0.024246,-0.105840,-0.102386,0.373417,-0.178432,-0.271525,-0.022581,-0.139250
0,0.091810,0.079485,0.261488,0.198871,-0.054910,0.020760,-0.343560,-0.151694,-0.160269,-0.214154,...,0.064576,0.228308,-0.096578,-0.043398,-0.087369,0.302408,-0.134575,-0.220996,-0.000280,-0.121065
0,0.134513,0.073002,0.160120,0.251022,0.049659,-0.014722,-0.296490,-0.078738,-0.213391,-0.175543,...,0.045952,0.185524,-0.007640,-0.089034,-0.060303,0.403693,-0.102535,-0.316395,-0.016215,-0.129192


In [86]:
X_test_sg = X_test_sg.reset_index().drop(['index'], axis=1).fillna(0)
X_test_sg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.188751,0.085047,0.196057,0.239633,0.008141,-0.116937,-0.242850,-0.122347,-0.285015,-0.094142,...,0.011010,0.237834,-0.019016,-0.185786,-0.102007,0.389863,-0.046258,-0.235252,-0.057892,-0.096117
1,0.051886,0.013300,0.199116,0.219596,0.103695,-0.123744,-0.340428,-0.031157,-0.307546,-0.133739,...,0.075871,0.235019,-0.106251,-0.098781,-0.133301,0.343419,-0.004178,-0.283949,-0.063387,-0.102155
2,0.135365,0.069697,0.184692,0.214738,0.074391,-0.009202,-0.339338,-0.087252,-0.191786,-0.150359,...,0.005173,0.298982,-0.004578,-0.102171,-0.097365,0.357899,-0.110279,-0.246588,-0.044963,-0.097967
3,0.089411,0.073857,0.254223,0.236446,0.032962,0.035375,-0.299110,-0.123629,-0.179272,-0.194257,...,0.004952,0.270153,-0.033486,-0.031937,-0.097374,0.297543,-0.142485,-0.225717,-0.026829,-0.099505
4,0.007276,-0.026303,0.310812,0.187619,-0.046498,0.005140,-0.335359,-0.163222,-0.214707,-0.262072,...,0.128120,0.193110,-0.144317,-0.011589,-0.153744,0.206998,-0.053822,-0.196023,-0.069643,-0.040368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3305,0.002074,0.020833,0.233672,0.193114,0.168599,-0.062129,-0.267139,-0.012442,-0.270407,-0.171040,...,0.006042,0.169996,-0.171744,-0.082573,-0.176128,0.228615,-0.149781,-0.174326,0.015865,-0.178080
3306,0.081883,0.077020,0.163880,0.256423,0.065097,-0.064530,-0.311738,-0.073156,-0.239171,-0.110516,...,0.023069,0.256452,-0.024246,-0.105840,-0.102386,0.373417,-0.178432,-0.271525,-0.022581,-0.139250
3307,0.091810,0.079485,0.261488,0.198871,-0.054910,0.020760,-0.343560,-0.151694,-0.160269,-0.214154,...,0.064576,0.228308,-0.096578,-0.043398,-0.087369,0.302408,-0.134575,-0.220996,-0.000280,-0.121065
3308,0.134513,0.073002,0.160120,0.251022,0.049659,-0.014722,-0.296490,-0.078738,-0.213391,-0.175543,...,0.045952,0.185524,-0.007640,-0.089034,-0.060303,0.403693,-0.102535,-0.316395,-0.016215,-0.129192


In [79]:
X_train_sg_unlab = pd.DataFrame(X_train_unlab.apply(sent_vect, model=skg).sort_index())

In [80]:
X_train_sg_unlab

Unnamed: 0,tweet
0,"[0.1385263343767096, 0.019514727808935924, 0.1..."
1,"[-0.08006676582537567, 0.14220038026696025, 0...."
2,"[0.12745751758561877, 0.04590809994137651, 0.1..."
3,"[0.148907211855406, 0.045357729192608165, 0.14..."
4,"[0.16997259111404198, 0.02170235144521857, 0.1..."
...,...
11238,"[0.0877175979810819, -0.024054452757491043, 0...."
11239,"[0.08084574607521673, 0.0591042073947315, 0.25..."
11240,"[0.2187457234647719, 0.07468842552941364, 0.18..."
11241,"[0.11668345874892752, 0.042231378233500066, 0...."


In [87]:
X_train_sg_unlab = convert_df(X_train_sg_unlab)
X_train_sg_unlab

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.138526,0.019515,0.166428,0.216446,0.138855,-0.055935,-0.304432,-0.105380,-0.253149,-0.145664,...,0.044108,0.247816,-0.014868,-0.132315,-0.070216,0.392828,-0.073529,-0.348592,-0.013779,-0.117852
0,-0.080067,0.142200,0.089483,0.227434,0.094180,-0.101437,-0.283416,-0.271291,-0.158280,-0.155238,...,0.004326,0.131149,0.010339,-0.120501,-0.036472,0.269971,-0.075283,-0.347463,0.135049,-0.180314
0,0.127458,0.045908,0.189589,0.198233,0.063745,0.024450,-0.278034,-0.142304,-0.173098,-0.198859,...,-0.021820,0.279669,-0.020304,-0.031698,-0.058815,0.325629,-0.146991,-0.255908,-0.032594,-0.118372
0,0.148907,0.045358,0.144209,0.225686,0.135249,-0.076277,-0.290739,-0.070622,-0.252579,-0.185700,...,-0.029599,0.221062,0.045011,-0.093727,-0.040805,0.419570,-0.083075,-0.267348,0.001566,-0.162518
0,0.169973,0.021702,0.155327,0.146038,0.122168,-0.014653,-0.339412,-0.174432,-0.236059,-0.159709,...,-0.032499,0.235318,-0.047131,-0.052464,-0.139022,0.347737,-0.113199,-0.292628,-0.094603,-0.110658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.087718,-0.024054,0.203854,0.249087,0.105207,0.015620,-0.366716,-0.017883,-0.237246,-0.238720,...,0.125441,0.186477,-0.056052,-0.066157,-0.042412,0.340199,-0.082934,-0.191944,-0.007212,-0.009766
0,0.080846,0.059104,0.251195,0.231769,0.065487,0.024094,-0.330607,-0.111610,-0.183350,-0.205393,...,-0.017894,0.265083,-0.034696,-0.023888,-0.147687,0.311157,-0.120094,-0.232114,-0.047364,-0.107032
0,0.218746,0.074688,0.188355,0.317208,0.010325,-0.045743,-0.236095,-0.098206,-0.244948,-0.095663,...,0.024123,0.211924,-0.037041,-0.186651,-0.114775,0.412285,-0.036984,-0.267000,-0.027948,-0.129779
0,0.116683,0.042231,0.265538,0.269076,0.120013,0.002366,-0.314538,-0.080609,-0.155711,-0.204829,...,0.012902,0.092975,-0.059500,-0.065757,-0.086546,0.426074,-0.095765,-0.276681,-0.006456,-0.167826


In [88]:
X_train_sg_unlab = X_train_sg_unlab.reset_index().drop(['index'], axis=1).fillna(0)
X_train_sg_unlab

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.138526,0.019515,0.166428,0.216446,0.138855,-0.055935,-0.304432,-0.105380,-0.253149,-0.145664,...,0.044108,0.247816,-0.014868,-0.132315,-0.070216,0.392828,-0.073529,-0.348592,-0.013779,-0.117852
1,-0.080067,0.142200,0.089483,0.227434,0.094180,-0.101437,-0.283416,-0.271291,-0.158280,-0.155238,...,0.004326,0.131149,0.010339,-0.120501,-0.036472,0.269971,-0.075283,-0.347463,0.135049,-0.180314
2,0.127458,0.045908,0.189589,0.198233,0.063745,0.024450,-0.278034,-0.142304,-0.173098,-0.198859,...,-0.021820,0.279669,-0.020304,-0.031698,-0.058815,0.325629,-0.146991,-0.255908,-0.032594,-0.118372
3,0.148907,0.045358,0.144209,0.225686,0.135249,-0.076277,-0.290739,-0.070622,-0.252579,-0.185700,...,-0.029599,0.221062,0.045011,-0.093727,-0.040805,0.419570,-0.083075,-0.267348,0.001566,-0.162518
4,0.169973,0.021702,0.155327,0.146038,0.122168,-0.014653,-0.339412,-0.174432,-0.236059,-0.159709,...,-0.032499,0.235318,-0.047131,-0.052464,-0.139022,0.347737,-0.113199,-0.292628,-0.094603,-0.110658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11238,0.087718,-0.024054,0.203854,0.249087,0.105207,0.015620,-0.366716,-0.017883,-0.237246,-0.238720,...,0.125441,0.186477,-0.056052,-0.066157,-0.042412,0.340199,-0.082934,-0.191944,-0.007212,-0.009766
11239,0.080846,0.059104,0.251195,0.231769,0.065487,0.024094,-0.330607,-0.111610,-0.183350,-0.205393,...,-0.017894,0.265083,-0.034696,-0.023888,-0.147687,0.311157,-0.120094,-0.232114,-0.047364,-0.107032
11240,0.218746,0.074688,0.188355,0.317208,0.010325,-0.045743,-0.236095,-0.098206,-0.244948,-0.095663,...,0.024123,0.211924,-0.037041,-0.186651,-0.114775,0.412285,-0.036984,-0.267000,-0.027948,-0.129779
11241,0.116683,0.042231,0.265538,0.269076,0.120013,0.002366,-0.314538,-0.080609,-0.155711,-0.204829,...,0.012902,0.092975,-0.059500,-0.065757,-0.086546,0.426074,-0.095765,-0.276681,-0.006456,-0.167826


## Training SVM (Supervised only, TF-IDF)

In [47]:
clf = SVC()

In [48]:
params = {'kernel': ['linear', 'rbf', 'sigmoid'], 'C': [0.001, 0.05, 0.01, 0.1, 1, 5, 10, 100], 'gamma': [0.01, 0.1, 1, 5, 10, 100]}

In [49]:
clf.fit(X_train_tf, y_train)

  return f(*args, **kwargs)


SVC()

In [50]:
clf.score(X_test_tf, y_test)

0.66797583081571

In [51]:
gsc = GridSearchCV(clf, param_grid=params, n_jobs=-1)

In [52]:
grid_result = gsc.fit(X_train_tf, y_train)

  return f(*args, **kwargs)


In [53]:
best_params = grid_result.best_params_
best_params

{'C': 1, 'gamma': 0.01, 'kernel': 'linear'}

In [54]:
grid_result.best_score_

0.6715005035246727

In [55]:
clf = SVC(C=best_params['C'], gamma=best_params['gamma'], kernel=best_params['kernel'], probability = True)
clf

SVC(C=1, gamma=0.01, kernel='linear', probability=True)

## Train SVM (Self-Training)

In [56]:
self_training_model = SelfTrainingClassifier(clf) 

In [57]:
X_train_tf_unlab = tfidf.transform(X_train_unlab.sort_index())
print(X_train_tf_unlab)

  (1, 103722)	1.0
  (2, 101960)	0.21961777901669413
  (2, 101959)	0.21961777901669413
  (2, 91080)	0.21961777901669413
  (2, 91078)	0.19845815360331623
  (2, 83676)	0.21961777901669413
  (2, 83675)	0.21961777901669413
  (2, 65724)	0.21961777901669413
  (2, 65723)	0.21961777901669413
  (2, 55820)	0.21961777901669413
  (2, 55819)	0.21961777901669413
  (2, 54901)	0.21961777901669413
  (2, 54900)	0.21961777901669413
  (2, 50963)	0.21961777901669413
  (2, 50962)	0.21961777901669413
  (2, 50961)	0.21961777901669413
  (2, 35384)	0.21961777901669413
  (2, 35383)	0.21961777901669413
  (2, 35382)	0.21025449560967407
  (2, 22432)	0.21961777901669413
  (2, 11342)	0.21961777901669413
  (2, 11341)	0.21961777901669413
  (3, 129226)	0.18324023010631316
  (3, 129225)	0.18324023010631316
  (3, 109123)	0.18324023010631316
  :	:
  (11232, 38289)	0.42422637470878855
  (11232, 37967)	0.457576408013781
  (11232, 34618)	0.457576408013781
  (11232, 12423)	0.457576408013781
  (11233, 45696)	0.5482588539099604
 

In [58]:
self_training_model.fit(X_train_tf_unlab, y_train_unlab)

  return f(*args, **kwargs)


SelfTrainingClassifier(base_estimator=SVC(C=1, gamma=0.01, kernel='linear',
                                          probability=True))

In [59]:
self_training_model.score(X_test_tf, y_test)

0.670392749244713

## Train SVM (CBOW W2V)

In [60]:
clf_w = SVC()

In [61]:
clf_w.fit(X_train_cbow, y_train)

  return f(*args, **kwargs)


SVC()

In [62]:
clf_w.score(X_test_cbow, y_test)

0.6676737160120846

In [65]:
gsc_w2v = GridSearchCV(clf_w, param_grid=params, n_jobs=-1)

In [67]:
grid_res_w2v = gsc_w2v.fit(X_train_cbow, y_train)

  return f(*args, **kwargs)


In [68]:
best_params_w2v = grid_res_w2v.best_params_
best_params_w2v

{'C': 100, 'gamma': 5, 'kernel': 'rbf'}

In [70]:
grid_res_w2v.best_score_

0.6848942598187311

In [72]:
clf_w = SVC(C=best_params_w2v['C'], gamma=best_params_w2v['gamma'], kernel=best_params_w2v['kernel'], probability = True)
clf_w

SVC(C=100, gamma=5, probability=True)

## W2V Self-Training

In [73]:
ssl_w2v = SelfTrainingClassifier(clf_w)

In [74]:
ssl_w2v.fit(X_train_cbow_unlab, y_train_unlab)

  return f(*args, **kwargs)


SelfTrainingClassifier(base_estimator=SVC(C=100, gamma=5, probability=True))

In [75]:
ssl_w2v.score(X_test_cbow, y_test)

0.6779456193353475

## Train SVM (SKG W2V)

In [89]:
clf_skg = SVC()

In [90]:
clf_skg.fit(X_train_sg, y_train)

  return f(*args, **kwargs)


SVC()

In [91]:
clf_skg.score(X_test_sg, y_test)

0.6731117824773414

In [92]:
gsc_w2v_sg = GridSearchCV(clf_w, param_grid=params, n_jobs=-1)

In [95]:
grid_res_w2v_sg = gsc_w2v_sg.fit(X_train_cbow, y_train)

KeyboardInterrupt: 

In [None]:
best_params_w2v_sg = grid_res_w2v_sg.best_params_
best_params_w2v_sg

In [None]:
grid_res_w2v_sg.best_score_

In [None]:
clf_skf = SVC(C=best_params_w2v_sg['C'], gamma=best_params_w2v_sg['gamma'], kernel=best_params_w2v_sg['kernel'], probability = True)
clf_skg

 ## Logistic Regression

### TF-IDF

In [39]:
clf = LogisticRegression()

In [40]:
C = list(np.arange(0, 1, 0.05))

C = [float(i) for i in C]

C = C[1:]

C

[0.05,
 0.1,
 0.15000000000000002,
 0.2,
 0.25,
 0.30000000000000004,
 0.35000000000000003,
 0.4,
 0.45,
 0.5,
 0.55,
 0.6000000000000001,
 0.65,
 0.7000000000000001,
 0.75,
 0.8,
 0.8500000000000001,
 0.9,
 0.9500000000000001]

In [41]:
scores = dict()

In [44]:
for i in C:
    clf = LogisticRegression(C= i, max_iter=1000)
    clf.fit(X_train_tf, y_train)
    scores[i] = clf.score(X_test_tf, y_test)
    
scores

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


{0.05: 0.6676737160120846,
 0.1: 0.6676737160120846,
 0.15000000000000002: 0.6676737160120846,
 0.2: 0.6676737160120846,
 0.25: 0.6676737160120846,
 0.30000000000000004: 0.6676737160120846,
 0.35000000000000003: 0.6676737160120846,
 0.4: 0.6676737160120846,
 0.45: 0.6676737160120846,
 0.5: 0.6676737160120846,
 0.55: 0.6676737160120846,
 0.6000000000000001: 0.66797583081571,
 0.65: 0.66797583081571,
 0.7000000000000001: 0.6685800604229607,
 0.75: 0.6685800604229607,
 0.8: 0.6685800604229607,
 0.8500000000000001: 0.6685800604229607,
 0.9: 0.6691842900302115,
 0.9500000000000001: 0.6691842900302115}

In [45]:
best_C = max(scores, key=scores.get)

print(best_C)

0.9


In [46]:
clf = LogisticRegression(C=best_C)

In [47]:
clf.fit(X_train_tf, y_train)

  return f(*args, **kwargs)


LogisticRegression(C=0.9)

In [48]:
clf.score(X_test_tf, y_test)

0.6691842900302115