In [1]:
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.semi_supervised import SelfTrainingClassifier
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
import pandas as pd
import pickle

### Load models

In [2]:
with open ("tfidf.pk", 'rb') as fin:
    tfidf = pickle.load(fin)

In [3]:
tfidf

TfidfVectorizer(max_df=5, ngram_range=(1, 3), stop_words='english')

In [4]:
cbow = Word2Vec.load("cbow.model")

In [5]:
cbow

<gensim.models.word2vec.Word2Vec at 0x2aff81e0670>

## Import data

In [6]:
df = pd.read_csv("processed_train.csv")

In [7]:
df

Unnamed: 0.1,Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,0,86426,"['ask', 'native', 'american', 'take']",OFF,UNT,
1,1,90194,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF,TIN,IND
2,2,16820,"['amazon', 'investigating', 'chinese', 'employ...",NOT,,
3,3,62688,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF,UNT,
4,4,43605,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT,,
...,...,...,...,...,...,...
13235,13235,95338,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF,TIN,IND
13236,13236,67210,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT,,
13237,13237,82921,"['report', 'garbage', 'dont', 'give', 'crap']",OFF,TIN,OTH
13238,13238,27429,['pussy'],OFF,UNT,


## Removing Redundant Axes

In [8]:
df = df.drop(['Unnamed: 0', 'subtask_b', 'subtask_c', 'id'], axis=1)

In [9]:
df

Unnamed: 0,tweet,subtask_a
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


## Renaming Columns

In [10]:
df = df.rename(columns={'subtask_a': 'Offensive'})

In [11]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


## Converting Offensive to Numerical Value

In [12]:
def off(cls):
    if cls =='OFF':
        return 1
    elif cls == 'NOT':
        return 0

In [13]:
df["Offensive"] = df["Offensive"].apply(off)

In [14]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",1
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",1
2,"['amazon', 'investigating', 'chinese', 'employ...",0
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",1
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",0
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",1
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",0
13237,"['report', 'garbage', 'dont', 'give', 'crap']",1
13238,['pussy'],1


In [15]:
df['tweet']

0                    ['ask', 'native', 'american', 'take']
1        ['go', 'home', '’', 'drunk', 'maga', 'trump', ...
2        ['amazon', 'investigating', 'chinese', 'employ...
3        ['someone', 'shouldve', 'taken', 'piece', 'shi...
4        ['obama', 'wanted', 'liberal', 'amp', 'illegal...
                               ...                        
13235    ['sometimes', 'get', 'strong', 'vibe', 'people...
13236    ['benidorm', '✅', 'creamfields', '✅', 'maga', ...
13237        ['report', 'garbage', 'dont', 'give', 'crap']
13238                                            ['pussy']
13239    ['spanishrevenge', 'v', 'justice', 'human', 'r...
Name: tweet, Length: 13240, dtype: object

## Splitting into train and test

In [39]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], random_state=0)

In [40]:
X_train = X_train.reset_index().drop(['index'], axis=1)
X_train

Unnamed: 0,tweet
0,"['friend', 'father', 'safe', 'talk', '”']"
1,"['president', 'im']"
2,"['depending', 'government', 'politician', 'fur..."
3,"['im', 'sure', 'really', 'low', 'carb', 'diet'..."
4,"['welcome', 'why', 'james', 'cryin']"
...,...
9925,"['nra', 'supported', 'gun', 'control', 'reagan..."
9926,"['quite', 'good', 'faking', 'must', 'done', 'n..."
9927,['thank']
9928,"['abundantly', 'clear', 'lack', 'common', 'sen..."


In [41]:
X_train = X_train['tweet']

In [42]:
X_train

0               ['friend', 'father', 'safe', 'talk', '”']
1                                     ['president', 'im']
2       ['depending', 'government', 'politician', 'fur...
3       ['im', 'sure', 'really', 'low', 'carb', 'diet'...
4                    ['welcome', 'why', 'james', 'cryin']
                              ...                        
9925    ['nra', 'supported', 'gun', 'control', 'reagan...
9926    ['quite', 'good', 'faking', 'must', 'done', 'n...
9927                                            ['thank']
9928    ['abundantly', 'clear', 'lack', 'common', 'sen...
9929                 ['there', 'brexit', '👇', '🏻', 'url']
Name: tweet, Length: 9930, dtype: object

In [43]:
X_test = X_test.reset_index().drop(['index'], axis=1)
X_test

Unnamed: 0,tweet
0,"['’', 'still', 'listening', '—', 'hear', 'mumb..."
1,"['well', 'born', '“', '”', 'dude', 'url']"
2,"['surprised', 'know', 'liberal', 'sacrifice', ..."
3,"['volient', 'action', 'done', 'defend', 'one',..."
4,"['🛑', 'truthfeed', 'news', '🛑', '👉', 'fbi', 'd..."
...,...
3305,"['bird', 'as', 'url']"
3306,"['divert', 'defamewe', 'voted', 'get', 'thing'..."
3307,"['alumnus', 'standing', 'christine', 'blasey',..."
3308,"['wow', 'didnt', 'look', 'permanently', 'suspe..."


In [44]:
X_test = X_test['tweet']
X_test

0       ['’', 'still', 'listening', '—', 'hear', 'mumb...
1               ['well', 'born', '“', '”', 'dude', 'url']
2       ['surprised', 'know', 'liberal', 'sacrifice', ...
3       ['volient', 'action', 'done', 'defend', 'one',...
4       ['🛑', 'truthfeed', 'news', '🛑', '👉', 'fbi', 'd...
                              ...                        
3305                                ['bird', 'as', 'url']
3306    ['divert', 'defamewe', 'voted', 'get', 'thing'...
3307    ['alumnus', 'standing', 'christine', 'blasey',...
3308    ['wow', 'didnt', 'look', 'permanently', 'suspe...
3309                      ['thank', 'daughter', 'future']
Name: tweet, Length: 3310, dtype: object

In [45]:
y_train = y_train.reset_index().drop(['index'], axis=1)
y_train

Unnamed: 0,Offensive
0,0
1,0
2,0
3,0
4,0
...,...
9925,0
9926,1
9927,0
9928,0


In [46]:
y_test = y_test.reset_index().drop(['index'], axis=1)
y_test

Unnamed: 0,Offensive
0,0
1,0
2,0
3,0
4,1
...,...
3305,1
3306,0
3307,1
3308,1


In [47]:
rng = np.random.RandomState(42)
random_unlabeled_points = rng.rand(y_train.shape[0]) < 0.3

In [48]:
random_unlabeled_points

array([False, False, False, ..., False, False, False])

In [49]:
y_train_unlab = y_train.copy()

In [50]:
y_train_unlab[random_unlabeled_points] = -1

In [51]:
y_train_unlab[random_unlabeled_points]

Unnamed: 0,Offensive
4,-1
5,-1
6,-1
10,-1
13,-1
...,...
9916,-1
9917,-1
9918,-1
9920,-1


In [52]:
y_train

Unnamed: 0,Offensive
0,0
1,0
2,0
3,0
4,0
...,...
9925,0
9926,1
9927,0
9928,0


In [53]:
y_train_unlab

Unnamed: 0,Offensive
0,0
1,0
2,0
3,0
4,-1
...,...
9925,-1
9926,1
9927,0
9928,0


### TF-IDF

In [59]:
X_train_tf = tfidf.transform(X_train.sort_index())
print(X_train_tf)

  (0, 114895)	0.4472135954999579
  (0, 49843)	0.4472135954999579
  (0, 49842)	0.4472135954999579
  (0, 45902)	0.4472135954999579
  (0, 45901)	0.4472135954999579
  (1, 103513)	1.0
  (2, 90973)	0.6915408747548445
  (2, 35511)	0.7223373301604346
  (3, 129230)	0.18313486349026764
  (3, 129229)	0.18313486349026764
  (3, 108920)	0.18313486349026764
  (3, 108919)	0.18313486349026764
  (3, 89593)	0.18313486349026764
  (3, 89592)	0.18313486349026764
  (3, 82056)	0.18313486349026764
  (3, 82055)	0.18313486349026764
  (3, 82054)	0.3662697269805353
  (3, 64529)	0.18313486349026764
  (3, 62402)	0.18313486349026764
  (3, 62401)	0.18313486349026764
  (3, 38508)	0.18313486349026764
  (3, 38507)	0.18313486349026764
  (3, 36656)	0.18313486349026764
  (3, 36655)	0.18313486349026764
  (3, 36654)	0.16549027602137428
  :	:
  (9928, 109206)	0.17465468520308794
  (9928, 109205)	0.17465468520308794
  (9928, 108537)	0.17465468520308794
  (9928, 108534)	0.15447883340939958
  (9928, 102122)	0.17465468520308794
  

In [60]:
X_test_tf = tfidf.transform(X_test.sort_index())

### CBOW

In [30]:
#Sentence vectoriser - Average of sentence for each vector
def sent_vect(sent, model):
    sent = eval(sent)
    vec = np.zeros(32)
    num = 0
    for w in sent:
        try:
            temp_vec = model.wv.get_vector(w)
            #print(temp_vec)
            vec = np.add(vec, temp_vec)
            num += 1
        except:
            pass
        
    return vec / np.sqrt(vec.dot(vec))

In [54]:
X_train_cbow = X_train.apply(sent_vect, model=cbow).sort_index()
print(X_train_cbow)

0       [0.10371455167512132, -0.0026690812901859584, ...
1       [0.07306809735589292, 0.10742395934446738, 0.1...
2       [0.12152500412207404, 0.03987473932714874, 0.1...
3       [0.15234790567554005, 0.07549362942942803, 0.1...
4       [0.17659919052198297, 0.07970519122554563, 0.0...
                              ...                        
9925    [0.1268052961219268, -0.15723430540881764, 0.1...
9926    [0.11029270031188039, 0.08434713848718038, 0.1...
9927    [0.07075947084083557, 0.020389900997694718, 0....
9928    [0.09825704417810942, -0.09855605385106399, 0....
9929    [0.0029189210899187015, -0.04699727707605283, ...
Name: tweet, Length: 9930, dtype: object


In [55]:
X_test_cbow = X_test.apply(sent_vect, model=cbow).sort_index()

## Training SVM (Supervised only, TF-IDF)

In [56]:
clf = SVC()

In [66]:
params = {'kernel': ['linear', 'rbf', 'sigmoid'], 'C': [0.001, 0.05, 0.01, 0.1, 1, 5, 10, 100], 'gamma': [0.01, 0.1, 1, 5, 10, 100]}

In [67]:
clf.fit(X_train_tf, y_train)

  return f(*args, **kwargs)


SVC()

In [68]:
clf.score(X_test_tf, y_test)

0.6700906344410876

In [69]:
gsc = GridSearchCV(clf, param_grid=params, n_jobs=-1)

In [70]:
grid_result = gsc.fit(X_train_tf, y_train)

  return f(*args, **kwargs)


In [71]:
best_params = grid_result.best_params_
best_params

{'C': 5, 'gamma': 0.1, 'kernel': 'rbf'}

In [72]:
grid_result.best_score_

0.6719033232628399

In [73]:
clf = SVC(C=best_params['C'], gamma=best_params['gamma'], kernel=best_params['kernel'], probability = True)
clf

SVC(C=5, gamma=0.1, probability=True)

## Train SVM (Self-Training)

In [75]:
self_training_model = SelfTrainingClassifier(clf)

In [76]:
self_training_model.fit(X_train_tf, y_train_unlab)

  return f(*args, **kwargs)


SelfTrainingClassifier(base_estimator=SVC(C=5, gamma=0.1, probability=True))

In [77]:
self_training_model.score(X_test_tf, y_test)

0.6688821752265861