In [1]:
from classifiers import CoTrainingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import pickle

## Load Models

### TF-IDF

In [2]:
with open ("tfidf.pk", 'rb') as fin:
    tfidf = pickle.load(fin)

In [3]:
tfidf

TfidfVectorizer(max_df=5, ngram_range=(1, 3), stop_words='english')

### Word2Vec CBOW

In [4]:
cbow = Word2Vec.load("cbow.model")

In [5]:
cbow

<gensim.models.word2vec.Word2Vec at 0x225be394640>

### Word2Vec SkipGram

In [6]:
skg = Word2Vec.load("skg.model")

In [7]:
skg

<gensim.models.word2vec.Word2Vec at 0x225bfddf940>

## Importing Data

In [8]:
df = pd.read_csv("processed_train.csv")

In [9]:
df

Unnamed: 0.1,Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,0,86426,"['ask', 'native', 'american', 'take']",OFF,UNT,
1,1,90194,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF,TIN,IND
2,2,16820,"['amazon', 'investigating', 'chinese', 'employ...",NOT,,
3,3,62688,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF,UNT,
4,4,43605,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT,,
...,...,...,...,...,...,...
13235,13235,95338,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF,TIN,IND
13236,13236,67210,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT,,
13237,13237,82921,"['report', 'garbage', 'dont', 'give', 'crap']",OFF,TIN,OTH
13238,13238,27429,['pussy'],OFF,UNT,


### Removing Redundant Axes

In [10]:
df = df.drop(['Unnamed: 0', 'subtask_b', 'subtask_c', 'id'], axis=1)

In [11]:
df

Unnamed: 0,tweet,subtask_a
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


### Renaming Columns

In [12]:
df = df.rename(columns={'subtask_a': 'Offensive'})

In [13]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


### Converting Offensive to Numerical Value

In [14]:
def off(cls):
    if cls =='OFF':
        return 1
    return 0

In [15]:
df["Offensive"] = df["Offensive"].apply(off)

In [16]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",1
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",1
2,"['amazon', 'investigating', 'chinese', 'employ...",0
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",1
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",0
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",1
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",0
13237,"['report', 'garbage', 'dont', 'give', 'crap']",1
13238,['pussy'],1


In [17]:
df['tweet']

0                    ['ask', 'native', 'american', 'take']
1        ['go', 'home', '’', 'drunk', 'maga', 'trump', ...
2        ['amazon', 'investigating', 'chinese', 'employ...
3        ['someone', 'shouldve', 'taken', 'piece', 'shi...
4        ['obama', 'wanted', 'liberal', 'amp', 'illegal...
                               ...                        
13235    ['sometimes', 'get', 'strong', 'vibe', 'people...
13236    ['benidorm', '✅', 'creamfields', '✅', 'maga', ...
13237        ['report', 'garbage', 'dont', 'give', 'crap']
13238                                            ['pussy']
13239    ['spanishrevenge', 'v', 'justice', 'human', 'r...
Name: tweet, Length: 13240, dtype: object

## Fitting Word Models on Data

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], random_state=0)

In [19]:
X_train = X_train.sort_index().reset_index().drop(['index'], axis=1)

In [20]:
X_train = X_train['tweet']

In [21]:
X_train

0                   ['ask', 'native', 'american', 'take']
1       ['go', 'home', '’', 'drunk', 'maga', 'trump', ...
2       ['obama', 'wanted', 'liberal', 'amp', 'illegal...
3                                   ['liberal', 'kookoo']
4       ['literally', 'talking', 'lol', 'mass', 'shoot...
                              ...                        
9925                       ['brightest', 'light', 'tree']
9926    ['sometimes', 'get', 'strong', 'vibe', 'people...
9927    ['benidorm', '✅', 'creamfields', '✅', 'maga', ...
9928        ['report', 'garbage', 'dont', 'give', 'crap']
9929    ['spanishrevenge', 'v', 'justice', 'human', 'r...
Name: tweet, Length: 9930, dtype: object

In [22]:
X_test = X_test.sort_index().reset_index().drop(['index'], axis=1)

In [23]:
X_test = X_test['tweet']

In [24]:
X_test

0       ['amazon', 'investigating', 'chinese', 'employ...
1       ['someone', 'shouldve', 'taken', 'piece', 'shi...
2                           ['oh', 'no', 'tough', 'shit']
3       ['canada', '’', 'need', 'another', 'cuck', 'al...
4       ['da', 'fuck', 'going', 'people', 'there', 'me...
                              ...                        
3305    ['conservative', 'accepted', 'antisemitism', '...
3306    ['much', 'lonely', 'much', 'miss', 'azeez', 'd...
3307    ['😂', '😂', '😂', 'say', 'mad', 'say', 'im', 'ti...
3308    ['retweet', 'complete', 'amp', 'followed', 'pa...
3309                                            ['pussy']
Name: tweet, Length: 3310, dtype: object

In [25]:
y_train = y_train.reset_index().drop(['index'], axis=1)
y_train

Unnamed: 0,Offensive
0,0
1,0
2,0
3,0
4,0
...,...
9925,0
9926,1
9927,0
9928,0


In [26]:
y_train

Unnamed: 0,Offensive
0,0
1,0
2,0
3,0
4,0
...,...
9925,0
9926,1
9927,0
9928,0


In [27]:
rng = np.random.RandomState(42)
random_unlabeled_points = rng.rand(y_train.shape[0]) < 0.3

In [28]:
random_unlabeled_points

array([False, False, False, ..., False, False, False])

In [29]:
y_train_unlab = y_train.copy()

In [30]:
y_train_unlab[random_unlabeled_points] = -1

In [31]:
y_train_unlab[random_unlabeled_points]

Unnamed: 0,Offensive
4,-1
5,-1
6,-1
10,-1
13,-1
...,...
9916,-1
9917,-1
9918,-1
9920,-1


In [32]:
y_train_unlab

Unnamed: 0,Offensive
0,0
1,0
2,0
3,0
4,-1
...,...
9925,-1
9926,1
9927,0
9928,0


### TF-IDF

In [33]:
X_train_tf = pd.DataFrame(tfidf.transform(X_train.sort_index()).todense())
print(X_train_tf)

      0       1       2       3       4       5       6       7       8       \
0        0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1        0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2        0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3        0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4        0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
...      ...     ...     ...     ...     ...     ...     ...     ...     ...   
9925     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
9926     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
9927     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
9928     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
9929     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

      9       ...  153232  153233  1532

In [34]:
X_test_tf = pd.DataFrame(tfidf.transform(X_test.sort_index()).todense())

In [35]:
X_test_tf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153232,153233,153234,153235,153236,153237,153238,153239,153240,153241
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Sentence Vectoriser

Finding the average vector for a given document

In [36]:
#Sentence vectoriser - Average of sentence for each vector
def sent_vect(sent, model):
    sent = eval(sent)
    vec = np.zeros(32)
    num = 0
    for w in sent:
        try:
            temp_vec = model.wv.get_vector(w)
            #print(temp_vec)
            vec = np.add(vec, temp_vec)
            num += 1
        except:
            pass
        
    return vec / np.sqrt(vec.dot(vec))

In [37]:
def convert_df(df):
    df_new = pd.DataFrame(columns=list(range(32)))
    for ind in df.index:
        #print(df.iloc[ind][0])
        df_test = pd.DataFrame([[k for k in df.iloc[ind][0]]], columns=list(range(32)))
        #print(df_test)
        df_new = df_new.append(df_test)
            
    return df_new

### CBOW

In [38]:
X_train_cbow = pd.DataFrame(X_train.apply(sent_vect, model=cbow).sort_index())

In [39]:
X_train_cbow

Unnamed: 0,tweet
0,"[0.11678230874736859, 0.04701858112529256, 0.1..."
1,"[0.023887964134799274, -0.04221032997250825, 0..."
2,"[0.023089552048990755, -0.036891745280832305, ..."
3,"[0.08365669444110241, 0.024382291167563585, 0...."
4,"[0.1692174577116111, -0.028866439854166506, 0...."
...,...
9925,"[0.05910713569141994, 0.03033006832724931, 0.2..."
9926,"[0.1772960903524207, 0.06816052902725968, 0.19..."
9927,"[-0.07846772515660165, -0.09172202196583056, 0..."
9928,"[0.1720133490472096, 0.0603833914465248, 0.166..."


In [40]:
X_train_cbow = convert_df(X_train_cbow)
X_train_cbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.116782,0.047019,0.186344,0.170407,0.134305,0.163919,-0.212694,-0.093542,-0.087308,-0.240148,...,-0.093488,0.294705,-0.073186,-0.020457,-0.125274,0.287744,-0.201510,-0.307307,-0.028852,-0.065701
0,0.023888,-0.042210,0.296161,0.128866,0.025281,0.080379,-0.139509,-0.127690,-0.137655,-0.331031,...,0.013264,0.207271,-0.242064,-0.051973,-0.192782,0.107341,-0.092331,-0.240488,-0.042023,0.010290
0,0.023090,-0.036892,0.235128,0.128961,0.112560,0.167412,-0.181086,-0.130447,-0.075992,-0.326548,...,-0.043798,0.240976,-0.160064,0.002223,-0.138658,0.157358,-0.168503,-0.275676,-0.046759,-0.050284
0,0.083657,0.024382,0.166210,0.168800,0.160186,0.173134,-0.204819,-0.132779,-0.070912,-0.243716,...,-0.092422,0.288180,-0.085039,-0.012848,-0.119140,0.248484,-0.185667,-0.290868,-0.038875,-0.081476
0,0.169217,-0.028866,0.163223,0.203960,0.109067,0.061999,-0.110513,-0.128897,-0.143468,-0.223752,...,-0.093445,0.341903,-0.150574,-0.133993,-0.140724,0.306059,-0.090172,-0.319784,-0.038777,-0.043028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.059107,0.030330,0.202580,0.188432,0.150651,0.176885,-0.187442,-0.102763,-0.073841,-0.278436,...,-0.058418,0.271727,-0.141781,-0.024655,-0.132669,0.208250,-0.164911,-0.299155,-0.017788,-0.069534
0,0.177296,0.068161,0.194752,0.240950,0.035832,0.077722,-0.152960,-0.114388,-0.186071,-0.184646,...,-0.073314,0.319235,-0.095249,-0.127521,-0.158090,0.310651,-0.115591,-0.275920,-0.053974,-0.016020
0,-0.078468,-0.091722,0.315203,0.050650,0.062632,0.169709,-0.155772,-0.098118,-0.070093,-0.375598,...,0.019364,0.116853,-0.232294,0.032055,-0.174815,0.035444,-0.101746,-0.179466,-0.038296,0.004396
0,0.172013,0.060383,0.166888,0.227401,0.167757,0.066254,-0.235538,-0.044743,-0.144300,-0.196327,...,-0.076482,0.280510,-0.047609,-0.054018,-0.131983,0.375978,-0.165397,-0.310490,0.012297,-0.069596


In [41]:
X_train_cbow = X_train_cbow.reset_index().drop(['index'], axis=1).fillna(0)
X_train_cbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.116782,0.047019,0.186344,0.170407,0.134305,0.163919,-0.212694,-0.093542,-0.087308,-0.240148,...,-0.093488,0.294705,-0.073186,-0.020457,-0.125274,0.287744,-0.201510,-0.307307,-0.028852,-0.065701
1,0.023888,-0.042210,0.296161,0.128866,0.025281,0.080379,-0.139509,-0.127690,-0.137655,-0.331031,...,0.013264,0.207271,-0.242064,-0.051973,-0.192782,0.107341,-0.092331,-0.240488,-0.042023,0.010290
2,0.023090,-0.036892,0.235128,0.128961,0.112560,0.167412,-0.181086,-0.130447,-0.075992,-0.326548,...,-0.043798,0.240976,-0.160064,0.002223,-0.138658,0.157358,-0.168503,-0.275676,-0.046759,-0.050284
3,0.083657,0.024382,0.166210,0.168800,0.160186,0.173134,-0.204819,-0.132779,-0.070912,-0.243716,...,-0.092422,0.288180,-0.085039,-0.012848,-0.119140,0.248484,-0.185667,-0.290868,-0.038875,-0.081476
4,0.169217,-0.028866,0.163223,0.203960,0.109067,0.061999,-0.110513,-0.128897,-0.143468,-0.223752,...,-0.093445,0.341903,-0.150574,-0.133993,-0.140724,0.306059,-0.090172,-0.319784,-0.038777,-0.043028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9925,0.059107,0.030330,0.202580,0.188432,0.150651,0.176885,-0.187442,-0.102763,-0.073841,-0.278436,...,-0.058418,0.271727,-0.141781,-0.024655,-0.132669,0.208250,-0.164911,-0.299155,-0.017788,-0.069534
9926,0.177296,0.068161,0.194752,0.240950,0.035832,0.077722,-0.152960,-0.114388,-0.186071,-0.184646,...,-0.073314,0.319235,-0.095249,-0.127521,-0.158090,0.310651,-0.115591,-0.275920,-0.053974,-0.016020
9927,-0.078468,-0.091722,0.315203,0.050650,0.062632,0.169709,-0.155772,-0.098118,-0.070093,-0.375598,...,0.019364,0.116853,-0.232294,0.032055,-0.174815,0.035444,-0.101746,-0.179466,-0.038296,0.004396
9928,0.172013,0.060383,0.166888,0.227401,0.167757,0.066254,-0.235538,-0.044743,-0.144300,-0.196327,...,-0.076482,0.280510,-0.047609,-0.054018,-0.131983,0.375978,-0.165397,-0.310490,0.012297,-0.069596


In [42]:
X_test_cbow = pd.DataFrame(X_test.apply(sent_vect, model=cbow).sort_index())

In [43]:
X_test_cbow

Unnamed: 0,tweet
0,"[-0.05144699518391731, -0.07000303002084465, 0..."
1,"[0.10839988236478272, -0.02007087490801532, 0...."
2,"[0.14215383139067395, 0.09612889083374788, 0.0..."
3,"[0.14834794895835923, 0.041729775171655985, 0...."
4,"[0.10737067888646858, 0.06788053575314568, 0.0..."
...,...
3305,"[0.07261746754903796, 0.023324642599656067, 0...."
3306,"[0.11278004817370701, 0.06896479115835308, 0.1..."
3307,"[0.11422348815947328, -0.03982285937728821, 0...."
3308,"[-0.04389888644976187, -0.1105844745764699, 0...."


In [44]:
X_test_cbow = convert_df(X_test_cbow)
X_test_cbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,-0.051447,-0.070003,0.286287,0.097101,0.087439,0.166264,-0.174004,-0.109848,-0.071969,-0.364775,...,-0.007415,0.143692,-0.206592,0.029059,-0.158642,0.070883,-0.117583,-0.226103,-0.030672,-0.018225
0,0.108400,-0.020071,0.162912,0.323339,0.206559,0.033843,-0.218606,-0.020973,-0.151966,-0.220563,...,-0.043556,0.122276,-0.066789,-0.047993,-0.117554,0.331776,-0.084798,-0.346584,0.103901,-0.066880
0,0.142154,0.096129,0.079983,0.270342,0.153216,0.012640,-0.227744,-0.138585,-0.140370,-0.185753,...,-0.054763,0.263931,-0.102013,-0.073038,-0.116792,0.311962,-0.156150,-0.375516,0.058700,-0.091293
0,0.148348,0.041730,0.199145,0.197877,0.074580,0.101095,-0.155317,-0.128384,-0.150495,-0.231633,...,-0.085451,0.317811,-0.121901,-0.084993,-0.157564,0.278284,-0.139614,-0.300792,-0.050274,-0.042197
0,0.107371,0.067881,0.092931,0.247173,0.109462,0.085713,-0.240435,-0.136515,-0.108982,-0.186817,...,-0.062731,0.275336,-0.071241,-0.046408,-0.107568,0.305355,-0.210156,-0.353594,-0.008699,-0.071576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.072617,0.023325,0.160082,0.201115,0.130387,0.112638,-0.201119,-0.136511,-0.104858,-0.259094,...,-0.066367,0.266340,-0.125081,-0.036310,-0.131784,0.235072,-0.175173,-0.338598,-0.021551,-0.070379
0,0.112780,0.068965,0.113599,0.226441,0.153955,0.090157,-0.234167,-0.126425,-0.103316,-0.197622,...,-0.086780,0.271169,-0.076589,-0.039374,-0.125073,0.300092,-0.174704,-0.365590,0.028093,-0.091467
0,0.114223,-0.039823,0.175243,0.338029,0.244294,0.022442,-0.219556,0.031045,-0.154848,-0.201571,...,-0.061341,0.070049,-0.022039,-0.036127,-0.119665,0.372985,-0.046273,-0.325260,0.137812,-0.064244
0,-0.043899,-0.110584,0.284467,0.066558,0.084608,0.070571,-0.152744,-0.103706,-0.083297,-0.352902,...,0.054939,0.170552,-0.262289,-0.000557,-0.177256,0.038478,-0.110279,-0.251436,-0.016056,0.002755


In [45]:
X_test_cbow = X_test_cbow.reset_index().drop(['index'], axis=1).fillna(0)
X_test_cbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,-0.051447,-0.070003,0.286287,0.097101,0.087439,0.166264,-0.174004,-0.109848,-0.071969,-0.364775,...,-0.007415,0.143692,-0.206592,0.029059,-0.158642,0.070883,-0.117583,-0.226103,-0.030672,-0.018225
1,0.108400,-0.020071,0.162912,0.323339,0.206559,0.033843,-0.218606,-0.020973,-0.151966,-0.220563,...,-0.043556,0.122276,-0.066789,-0.047993,-0.117554,0.331776,-0.084798,-0.346584,0.103901,-0.066880
2,0.142154,0.096129,0.079983,0.270342,0.153216,0.012640,-0.227744,-0.138585,-0.140370,-0.185753,...,-0.054763,0.263931,-0.102013,-0.073038,-0.116792,0.311962,-0.156150,-0.375516,0.058700,-0.091293
3,0.148348,0.041730,0.199145,0.197877,0.074580,0.101095,-0.155317,-0.128384,-0.150495,-0.231633,...,-0.085451,0.317811,-0.121901,-0.084993,-0.157564,0.278284,-0.139614,-0.300792,-0.050274,-0.042197
4,0.107371,0.067881,0.092931,0.247173,0.109462,0.085713,-0.240435,-0.136515,-0.108982,-0.186817,...,-0.062731,0.275336,-0.071241,-0.046408,-0.107568,0.305355,-0.210156,-0.353594,-0.008699,-0.071576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3305,0.072617,0.023325,0.160082,0.201115,0.130387,0.112638,-0.201119,-0.136511,-0.104858,-0.259094,...,-0.066367,0.266340,-0.125081,-0.036310,-0.131784,0.235072,-0.175173,-0.338598,-0.021551,-0.070379
3306,0.112780,0.068965,0.113599,0.226441,0.153955,0.090157,-0.234167,-0.126425,-0.103316,-0.197622,...,-0.086780,0.271169,-0.076589,-0.039374,-0.125073,0.300092,-0.174704,-0.365590,0.028093,-0.091467
3307,0.114223,-0.039823,0.175243,0.338029,0.244294,0.022442,-0.219556,0.031045,-0.154848,-0.201571,...,-0.061341,0.070049,-0.022039,-0.036127,-0.119665,0.372985,-0.046273,-0.325260,0.137812,-0.064244
3308,-0.043899,-0.110584,0.284467,0.066558,0.084608,0.070571,-0.152744,-0.103706,-0.083297,-0.352902,...,0.054939,0.170552,-0.262289,-0.000557,-0.177256,0.038478,-0.110279,-0.251436,-0.016056,0.002755


### SkipGram

In [46]:
X_train_skg = X_train.apply(sent_vect, model=skg).sort_index()

In [47]:
X_train_skg

0       [0.13890833028577879, 0.0602093866169931, 0.25...
1       [-0.031044015373555757, -0.06391622386404926, ...
2       [0.009941511388440752, 0.006697370287529366, 0...
3       [0.05088327548790825, 0.12449198214735306, 0.2...
4       [0.20541999370674277, -0.011201584641294629, 0...
                              ...                        
9925    [0.0820706311814619, 0.05471222033757831, 0.22...
9926    [0.10886295323000372, 0.06637772834474372, 0.1...
9927    [-0.04250946890486125, -0.005367833791252949, ...
9928    [0.18586203624193126, 0.07148610685130281, 0.2...
9929    [0.02059650501444647, -0.010558274514424206, 0...
Name: tweet, Length: 9930, dtype: object

In [48]:
X_test_skg = X_test.apply(sent_vect, model=skg).sort_index()

In [49]:
X_test_skg

0       [-0.020249622204438412, -0.01119598407878388, ...
1       [0.1714072956388892, -0.03793757609649255, 0.1...
2       [0.1370963143540046, 0.09248017762336874, 0.13...
3       [0.10742154726732796, 0.04232172938983719, 0.2...
4       [0.08625167256161481, 0.07987306727597052, 0.1...
                              ...                        
3305    [0.08095425708329977, 0.10200269965214329, 0.1...
3306    [0.08540180817933385, 0.029416859234483287, 0....
3307    [0.1665732372459549, -0.09866697941966471, 0.1...
3308    [0.014044533632225758, -0.12783930366160623, 0...
3309    [0.10757330260668012, 0.09901826002879813, 0.1...
Name: tweet, Length: 3310, dtype: object

## Co - Training

### SVM vs SVM

In [50]:
svm2 = CoTrainingClassifier(SVC())

In [51]:
svm2.fit(X_train_tf, X_train_cbow, y_train_unlab)

  return f(*args, **kwargs)


ValueError: Found input variables with inconsistent numbers of samples: [9930, 6898]