In [1]:
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.semi_supervised import LabelPropagation
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
import pandas as pd
import pickle

### Load models

In [2]:
with open ("tfidf.pk", 'rb') as fin:
    tfidf = pickle.load(fin)

In [3]:
tfidf

TfidfVectorizer(max_df=5, ngram_range=(1, 3), stop_words='english')

In [4]:
cbow = Word2Vec.load("cbow.model")

In [5]:
cbow

<gensim.models.word2vec.Word2Vec at 0x106b12130>

## Import data

In [6]:
df = pd.read_csv("processed_train.csv")

In [7]:
df

Unnamed: 0.1,Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,0,86426,"['ask', 'native', 'american', 'take']",OFF,UNT,
1,1,90194,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF,TIN,IND
2,2,16820,"['amazon', 'investigating', 'chinese', 'employ...",NOT,,
3,3,62688,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF,UNT,
4,4,43605,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT,,
...,...,...,...,...,...,...
13235,13235,95338,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF,TIN,IND
13236,13236,67210,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT,,
13237,13237,82921,"['report', 'garbage', 'dont', 'give', 'crap']",OFF,TIN,OTH
13238,13238,27429,['pussy'],OFF,UNT,


## Removing Redundant Axes

In [8]:
df = df.drop(['Unnamed: 0', 'subtask_b', 'subtask_c', 'id'], axis=1)

In [9]:
df

Unnamed: 0,tweet,subtask_a
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


## Renaming Columns

In [10]:
df = df.rename(columns={'subtask_a': 'Offensive'})

In [11]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


## Converting Offensive to Numerical Value

In [12]:
def off(cls):
    if cls =='OFF':
        return 1
    elif cls == 'NOT':
        return 0

In [13]:
df["Offensive"] = df["Offensive"].apply(off)

In [14]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",1
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",1
2,"['amazon', 'investigating', 'chinese', 'employ...",0
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",1
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",0
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",1
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",0
13237,"['report', 'garbage', 'dont', 'give', 'crap']",1
13238,['pussy'],1


In [15]:
df['tweet']

0                    ['ask', 'native', 'american', 'take']
1        ['go', 'home', '’', 'drunk', 'maga', 'trump', ...
2        ['amazon', 'investigating', 'chinese', 'employ...
3        ['someone', 'shouldve', 'taken', 'piece', 'shi...
4        ['obama', 'wanted', 'liberal', 'amp', 'illegal...
                               ...                        
13235    ['sometimes', 'get', 'strong', 'vibe', 'people...
13236    ['benidorm', '✅', 'creamfields', '✅', 'maga', ...
13237        ['report', 'garbage', 'dont', 'give', 'crap']
13238                                            ['pussy']
13239    ['spanishrevenge', 'v', 'justice', 'human', 'r...
Name: tweet, Length: 13240, dtype: object

## Splitting into train and test

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], random_state=0)

In [17]:
X_train = X_train.reset_index().drop(['index'], axis=1)
X_train

Unnamed: 0,tweet
0,"['friend', 'father', 'safe', 'talk', '”']"
1,"['president', 'im']"
2,"['depending', 'government', 'politician', 'fur..."
3,"['im', 'sure', 'really', 'low', 'carb', 'diet'..."
4,"['welcome', 'why', 'james', 'cryin']"
...,...
9925,"['nra', 'supported', 'gun', 'control', 'reagan..."
9926,"['quite', 'good', 'faking', 'must', 'done', 'n..."
9927,['thank']
9928,"['abundantly', 'clear', 'lack', 'common', 'sen..."


In [18]:
X_train = X_train['tweet']

In [19]:
X_train

0               ['friend', 'father', 'safe', 'talk', '”']
1                                     ['president', 'im']
2       ['depending', 'government', 'politician', 'fur...
3       ['im', 'sure', 'really', 'low', 'carb', 'diet'...
4                    ['welcome', 'why', 'james', 'cryin']
                              ...                        
9925    ['nra', 'supported', 'gun', 'control', 'reagan...
9926    ['quite', 'good', 'faking', 'must', 'done', 'n...
9927                                            ['thank']
9928    ['abundantly', 'clear', 'lack', 'common', 'sen...
9929                 ['there', 'brexit', '👇', '🏻', 'url']
Name: tweet, Length: 9930, dtype: object

In [20]:
X_test = X_test.reset_index().drop(['index'], axis=1)
X_test

Unnamed: 0,tweet
0,"['’', 'still', 'listening', '—', 'hear', 'mumb..."
1,"['well', 'born', '“', '”', 'dude', 'url']"
2,"['surprised', 'know', 'liberal', 'sacrifice', ..."
3,"['volient', 'action', 'done', 'defend', 'one',..."
4,"['🛑', 'truthfeed', 'news', '🛑', '👉', 'fbi', 'd..."
...,...
3305,"['bird', 'as', 'url']"
3306,"['divert', 'defamewe', 'voted', 'get', 'thing'..."
3307,"['alumnus', 'standing', 'christine', 'blasey',..."
3308,"['wow', 'didnt', 'look', 'permanently', 'suspe..."


In [21]:
X_test = X_test['tweet']
X_test

0       ['’', 'still', 'listening', '—', 'hear', 'mumb...
1               ['well', 'born', '“', '”', 'dude', 'url']
2       ['surprised', 'know', 'liberal', 'sacrifice', ...
3       ['volient', 'action', 'done', 'defend', 'one',...
4       ['🛑', 'truthfeed', 'news', '🛑', '👉', 'fbi', 'd...
                              ...                        
3305                                ['bird', 'as', 'url']
3306    ['divert', 'defamewe', 'voted', 'get', 'thing'...
3307    ['alumnus', 'standing', 'christine', 'blasey',...
3308    ['wow', 'didnt', 'look', 'permanently', 'suspe...
3309                      ['thank', 'daughter', 'future']
Name: tweet, Length: 3310, dtype: object

In [22]:
y_train = y_train.reset_index().drop(['index'], axis=1)
y_train

Unnamed: 0,Offensive
0,0
1,0
2,0
3,0
4,0
...,...
9925,0
9926,1
9927,0
9928,0


In [23]:
y_test = y_test.reset_index().drop(['index'], axis=1)
y_test

Unnamed: 0,Offensive
0,0
1,0
2,0
3,0
4,1
...,...
3305,1
3306,0
3307,1
3308,1


In [24]:
rng = np.random.RandomState(42)
random_unlabeled_points = rng.rand(y_train.shape[0]) < 0.3

In [25]:
random_unlabeled_points

array([False, False, False, ..., False, False, False])

In [26]:
y_train_unlab = y_train.copy()

In [27]:
y_train_unlab[random_unlabeled_points] = -1

In [28]:
y_train_unlab[random_unlabeled_points]

Unnamed: 0,Offensive
4,-1
5,-1
6,-1
10,-1
13,-1
...,...
9916,-1
9917,-1
9918,-1
9920,-1


In [29]:
y_train

Unnamed: 0,Offensive
0,0
1,0
2,0
3,0
4,0
...,...
9925,0
9926,1
9927,0
9928,0


In [30]:
y_train_unlab

Unnamed: 0,Offensive
0,0
1,0
2,0
3,0
4,-1
...,...
9925,-1
9926,1
9927,0
9928,0


### TF-IDF

In [31]:
X_train_tf = tfidf.transform(X_train.sort_index())
print(X_train_tf)

  (1, 103722)	1.0
  (2, 101960)	0.21961777901669413
  (2, 101959)	0.21961777901669413
  (2, 91080)	0.21961777901669413
  (2, 91078)	0.19845815360331623
  (2, 83676)	0.21961777901669413
  (2, 83675)	0.21961777901669413
  (2, 65724)	0.21961777901669413
  (2, 65723)	0.21961777901669413
  (2, 55820)	0.21961777901669413
  (2, 55819)	0.21961777901669413
  (2, 54901)	0.21961777901669413
  (2, 54900)	0.21961777901669413
  (2, 50963)	0.21961777901669413
  (2, 50962)	0.21961777901669413
  (2, 50961)	0.21961777901669413
  (2, 35384)	0.21961777901669413
  (2, 35383)	0.21961777901669413
  (2, 35382)	0.21025449560967407
  (2, 22432)	0.21961777901669413
  (2, 11342)	0.21961777901669413
  (2, 11341)	0.21961777901669413
  (3, 129226)	0.18324023010631316
  (3, 129225)	0.18324023010631316
  (3, 109123)	0.18324023010631316
  :	:
  (9925, 93097)	0.22597647355007702
  (9925, 56718)	0.22597647355007702
  (9925, 54955)	0.22597647355007702
  (9925, 54954)	0.21634209069345273
  (9925, 28521)	0.22597647355007702

In [32]:
X_test_tf = tfidf.transform(X_test.sort_index())

### CBOW

In [33]:
#Sentence vectoriser - Average of sentence for each vector
def sent_vect(sent, model):
    sent = eval(sent)
    vec = np.zeros(32)
    num = 0
    for w in sent:
        try:
            temp_vec = model.wv.get_vector(w)
            #print(temp_vec)
            vec = np.add(vec, temp_vec)
            num += 1
        except:
            pass
        
    return vec / np.sqrt(vec.dot(vec))

In [34]:
X_train_cbow = pd.DataFrame(X_train.apply(sent_vect, model=cbow).sort_index())
X_train_cbow

Unnamed: 0,tweet
0,"[0.10371455167512132, -0.0026690812901859584, ..."
1,"[0.07306809735589292, 0.10742395934446738, 0.1..."
2,"[0.12152500412207402, 0.03987473932714873, 0.1..."
3,"[0.15234790567554005, 0.07549362942942803, 0.1..."
4,"[0.17659919052198297, 0.07970519122554563, 0.0..."
...,...
9925,"[0.1268052961219268, -0.15723430540881764, 0.1..."
9926,"[0.11029270031188039, 0.08434713848718038, 0.1..."
9927,"[0.07075947084083557, 0.020389900997694718, 0...."
9928,"[0.09825704417810942, -0.09855605385106399, 0...."


In [35]:
def convert_df(df):
    df_new = pd.DataFrame(columns=list(range(32)))
    for ind in df.index:
        #print(df.iloc[ind][0])
        df_test = pd.DataFrame([[k for k in df.iloc[ind][0]]], columns=list(range(32)))
        #print(df_test)
        df_new = df_new.append(df_test)
            
    return df_new

In [36]:
X_train_cbow = convert_df(X_train_cbow)
X_train_cbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.103715,-0.002669,0.153950,0.237088,0.150675,0.039473,-0.195056,-0.123852,-0.135551,-0.243535,...,-0.040841,0.295382,-0.165185,-0.095019,-0.135421,0.270145,-0.136805,-0.360417,-0.013511,-0.042050
0,0.073068,0.107424,0.124951,0.210843,0.136943,0.112223,-0.216901,-0.158670,-0.104263,-0.230354,...,-0.075750,0.237339,-0.120083,-0.047526,-0.119075,0.238800,-0.165567,-0.363928,0.031739,-0.097591
0,0.121525,0.039875,0.136009,0.191058,0.122848,0.136994,-0.186814,-0.126776,-0.084429,-0.229106,...,-0.112223,0.311797,-0.094857,-0.041932,-0.116575,0.279989,-0.198768,-0.341706,-0.029827,-0.073104
0,0.152348,0.075494,0.128314,0.248046,0.166745,0.046800,-0.235776,-0.084590,-0.143277,-0.183683,...,-0.076028,0.267752,-0.061788,-0.067676,-0.124432,0.360069,-0.151870,-0.352549,0.040473,-0.085711
0,0.176599,0.079705,0.089254,0.251721,0.144600,0.004873,-0.232831,-0.091239,-0.153556,-0.126065,...,-0.099872,0.273970,-0.048746,-0.073946,-0.160822,0.387989,-0.157689,-0.365438,0.027586,-0.076929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.126805,-0.157234,0.147893,0.072082,0.165811,0.132364,-0.074539,-0.142090,-0.020093,-0.245621,...,-0.077936,0.364318,-0.137421,-0.094633,-0.068587,0.311153,-0.099652,-0.256199,-0.058486,-0.065221
0,0.110293,0.084347,0.113860,0.235922,0.140378,0.093828,-0.228333,-0.119690,-0.112826,-0.212017,...,-0.085178,0.262552,-0.084069,-0.034056,-0.119824,0.280386,-0.196844,-0.355833,0.008985,-0.087785
0,0.070759,0.020390,0.284672,0.151575,0.171471,0.144294,-0.215269,-0.037508,-0.096271,-0.269813,...,-0.117576,0.196936,-0.111194,0.060687,-0.249338,0.211172,-0.127526,-0.284816,0.046580,-0.081126
0,0.098257,-0.098556,0.137465,0.132955,0.144681,0.106873,-0.107304,-0.148732,-0.064684,-0.268796,...,-0.098630,0.335183,-0.160518,-0.079548,-0.097329,0.254282,-0.119071,-0.334934,-0.048336,-0.069200


In [37]:
X_train_cbow = X_train_cbow.reset_index().drop(['index'], axis=1).fillna(0)
X_train_cbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.103715,-0.002669,0.153950,0.237088,0.150675,0.039473,-0.195056,-0.123852,-0.135551,-0.243535,...,-0.040841,0.295382,-0.165185,-0.095019,-0.135421,0.270145,-0.136805,-0.360417,-0.013511,-0.042050
1,0.073068,0.107424,0.124951,0.210843,0.136943,0.112223,-0.216901,-0.158670,-0.104263,-0.230354,...,-0.075750,0.237339,-0.120083,-0.047526,-0.119075,0.238800,-0.165567,-0.363928,0.031739,-0.097591
2,0.121525,0.039875,0.136009,0.191058,0.122848,0.136994,-0.186814,-0.126776,-0.084429,-0.229106,...,-0.112223,0.311797,-0.094857,-0.041932,-0.116575,0.279989,-0.198768,-0.341706,-0.029827,-0.073104
3,0.152348,0.075494,0.128314,0.248046,0.166745,0.046800,-0.235776,-0.084590,-0.143277,-0.183683,...,-0.076028,0.267752,-0.061788,-0.067676,-0.124432,0.360069,-0.151870,-0.352549,0.040473,-0.085711
4,0.176599,0.079705,0.089254,0.251721,0.144600,0.004873,-0.232831,-0.091239,-0.153556,-0.126065,...,-0.099872,0.273970,-0.048746,-0.073946,-0.160822,0.387989,-0.157689,-0.365438,0.027586,-0.076929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9925,0.126805,-0.157234,0.147893,0.072082,0.165811,0.132364,-0.074539,-0.142090,-0.020093,-0.245621,...,-0.077936,0.364318,-0.137421,-0.094633,-0.068587,0.311153,-0.099652,-0.256199,-0.058486,-0.065221
9926,0.110293,0.084347,0.113860,0.235922,0.140378,0.093828,-0.228333,-0.119690,-0.112826,-0.212017,...,-0.085178,0.262552,-0.084069,-0.034056,-0.119824,0.280386,-0.196844,-0.355833,0.008985,-0.087785
9927,0.070759,0.020390,0.284672,0.151575,0.171471,0.144294,-0.215269,-0.037508,-0.096271,-0.269813,...,-0.117576,0.196936,-0.111194,0.060687,-0.249338,0.211172,-0.127526,-0.284816,0.046580,-0.081126
9928,0.098257,-0.098556,0.137465,0.132955,0.144681,0.106873,-0.107304,-0.148732,-0.064684,-0.268796,...,-0.098630,0.335183,-0.160518,-0.079548,-0.097329,0.254282,-0.119071,-0.334934,-0.048336,-0.069200


In [38]:
X_test_cbow = pd.DataFrame(X_test.apply(sent_vect, model=cbow).sort_index())

In [39]:
X_test_cbow = convert_df(X_test_cbow)
X_test_cbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.229052,0.089740,0.195986,0.265431,-0.006031,-0.010653,-0.091910,-0.126556,-0.249633,-0.145820,...,-0.066169,0.306685,-0.134145,-0.200603,-0.191874,0.297304,-0.026670,-0.255334,-0.036733,0.005425
0,0.069430,-0.016187,0.193671,0.233152,0.144982,0.044482,-0.200227,-0.113926,-0.131688,-0.268310,...,-0.028204,0.257530,-0.203193,-0.077521,-0.164237,0.217363,-0.109929,-0.338992,-0.015274,-0.029334
0,0.164029,0.073729,0.144673,0.222471,0.151144,0.078541,-0.209050,-0.093368,-0.121834,-0.187861,...,-0.097796,0.302909,-0.067645,-0.084195,-0.140072,0.347354,-0.148250,-0.330508,0.011487,-0.069974
0,0.074931,0.029382,0.127711,0.208167,0.120668,0.134064,-0.213340,-0.148618,-0.077489,-0.244585,...,-0.066669,0.267883,-0.112311,-0.022620,-0.113588,0.231845,-0.207471,-0.338523,-0.015166,-0.079058
0,0.030049,-0.043330,0.274949,0.144535,0.054198,0.091007,-0.146495,-0.136082,-0.129442,-0.326819,...,0.005426,0.216887,-0.231952,-0.051964,-0.182232,0.119065,-0.097366,-0.258316,-0.035582,-0.008138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.027757,0.028976,0.198936,0.190184,0.178006,0.120154,-0.207207,-0.115679,-0.092229,-0.283381,...,-0.062589,0.212767,-0.179297,-0.023615,-0.166009,0.138644,-0.155425,-0.330208,0.025486,-0.065843
0,0.142575,0.101029,0.101807,0.252169,0.131004,0.077271,-0.228520,-0.096394,-0.122143,-0.169309,...,-0.079083,0.282786,-0.079120,-0.071414,-0.115208,0.318972,-0.201874,-0.366205,0.018585,-0.067642
0,0.078959,0.019989,0.203190,0.179240,0.113979,0.122934,-0.208469,-0.128541,-0.106271,-0.281634,...,-0.051083,0.260463,-0.142652,-0.033458,-0.138051,0.238075,-0.165277,-0.309097,-0.020612,-0.055424
0,0.117153,0.073279,0.147497,0.249551,0.136645,0.073572,-0.225872,-0.101775,-0.137082,-0.216418,...,-0.062271,0.247699,-0.100108,-0.056029,-0.131267,0.286019,-0.156764,-0.355000,0.022938,-0.074697


In [40]:
X_test_cbow = X_test_cbow.reset_index().drop(['index'], axis=1).fillna(0)
X_test_cbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.229052,0.089740,0.195986,0.265431,-0.006031,-0.010653,-0.091910,-0.126556,-0.249633,-0.145820,...,-0.066169,0.306685,-0.134145,-0.200603,-0.191874,0.297304,-0.026670,-0.255334,-0.036733,0.005425
1,0.069430,-0.016187,0.193671,0.233152,0.144982,0.044482,-0.200227,-0.113926,-0.131688,-0.268310,...,-0.028204,0.257530,-0.203193,-0.077521,-0.164237,0.217363,-0.109929,-0.338992,-0.015274,-0.029334
2,0.164029,0.073729,0.144673,0.222471,0.151144,0.078541,-0.209050,-0.093368,-0.121834,-0.187861,...,-0.097796,0.302909,-0.067645,-0.084195,-0.140072,0.347354,-0.148250,-0.330508,0.011487,-0.069974
3,0.074931,0.029382,0.127711,0.208167,0.120668,0.134064,-0.213340,-0.148618,-0.077489,-0.244585,...,-0.066669,0.267883,-0.112311,-0.022620,-0.113588,0.231845,-0.207471,-0.338523,-0.015166,-0.079058
4,0.030049,-0.043330,0.274949,0.144535,0.054198,0.091007,-0.146495,-0.136082,-0.129442,-0.326819,...,0.005426,0.216887,-0.231952,-0.051964,-0.182232,0.119065,-0.097366,-0.258316,-0.035582,-0.008138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3305,0.027757,0.028976,0.198936,0.190184,0.178006,0.120154,-0.207207,-0.115679,-0.092229,-0.283381,...,-0.062589,0.212767,-0.179297,-0.023615,-0.166009,0.138644,-0.155425,-0.330208,0.025486,-0.065843
3306,0.142575,0.101029,0.101807,0.252169,0.131004,0.077271,-0.228520,-0.096394,-0.122143,-0.169309,...,-0.079083,0.282786,-0.079120,-0.071414,-0.115208,0.318972,-0.201874,-0.366205,0.018585,-0.067642
3307,0.078959,0.019989,0.203190,0.179240,0.113979,0.122934,-0.208469,-0.128541,-0.106271,-0.281634,...,-0.051083,0.260463,-0.142652,-0.033458,-0.138051,0.238075,-0.165277,-0.309097,-0.020612,-0.055424
3308,0.117153,0.073279,0.147497,0.249551,0.136645,0.073572,-0.225872,-0.101775,-0.137082,-0.216418,...,-0.062271,0.247699,-0.100108,-0.056029,-0.131267,0.286019,-0.156764,-0.355000,0.022938,-0.074697


## Training SVM (Supervised only, TF-IDF)

In [41]:
clf = SVC()

In [42]:
params = {'kernel': ['linear', 'rbf', 'sigmoid'], 'C': [0.001, 0.05, 0.01, 0.1, 1, 5, 10, 100], 'gamma': [0.01, 0.1, 1, 5, 10, 100]}

In [43]:
clf.fit(X_train_tf, y_train)

  return f(*args, **kwargs)


SVC()

In [44]:
clf.score(X_test_tf, y_test)

0.66797583081571

In [64]:
gsc1 = GridSearchCV(clf, param_grid=params, n_jobs=-1)

In [65]:
grid_result1 = gsc.fit(X_train_tf, y_train)

  return f(*args, **kwargs)


In [66]:
best_params1 = grid_result.best_params_
best_params1

{'C': 1, 'gamma': 0.01, 'kernel': 'linear'}

In [67]:
grid_result1.best_score_

0.6715005035246727

In [68]:
clf = SVC(C=best_params['C'], gamma=best_params['gamma'], kernel=best_params['kernel'], probability = True)
clf

SVC(C=1, gamma=0.01, kernel='linear', probability=True)

## Train SVM (Self-Training)

In [50]:
label_prop_model = LabelPropagation()

In [53]:
label_prop_model.fit(X_train_tf.todense(), y_train_unlab)

  return f(*args, **kwargs)


LabelPropagation()

In [54]:
label_prop_model.score(X_test_tf, y_test)

0.6685800604229607

## Train SVM (CBOW W2V)

In [55]:
clf_w = SVC()

In [56]:
clf_w.fit(X_train_cbow, y_train)

  return f(*args, **kwargs)


SVC()

In [57]:
clf_w.score(X_test_cbow, y_test)

0.6676737160120846

In [60]:
gsc = GridSearchCV(clf_w, param_grid=params, n_jobs=-1)

In [61]:
grid_result = gsc.fit(X_train_tf, y_train)

  return f(*args, **kwargs)


In [62]:
best_params2 = grid_result.best_params_
best_params2

{'C': 1, 'gamma': 0.01, 'kernel': 'linear'}

In [63]:
grid_result.best_score_

0.6715005035246727

In [69]:
clf_w = SVC(C=best_params['C'], gamma=best_params['gamma'], kernel=best_params['kernel'], probability = True)
clf_w

SVC(C=1, gamma=0.01, kernel='linear', probability=True)

## Train SVM (Self-Training, W2V)

In [70]:
label_prop_model2 = LabelPropagation()

In [71]:
label_prop_model2.fit(X_train_tf.todense(), y_train_unlab)

  return f(*args, **kwargs)


LabelPropagation()

In [72]:
label_prop_model2.score(X_test_tf, y_test)

0.6685800604229607