In [1]:
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.semi_supervised import SelfTrainingClassifier

from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import pickle

### Load models

In [2]:
with open ("tfidf.pk", 'rb') as fin:
    tfidf = pickle.load(fin)

In [3]:
tfidf

TfidfVectorizer(max_df=5, ngram_range=(1, 3), stop_words='english')

In [4]:
cbow = Word2Vec.load("cbow.model")

In [5]:
cbow

<gensim.models.word2vec.Word2Vec at 0x2553f28b0a0>

## Import data

In [6]:
df = pd.read_csv("processed_train.csv")

In [7]:
df

Unnamed: 0.1,Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,0,86426,"['ask', 'native', 'american', 'take']",OFF,UNT,
1,1,90194,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF,TIN,IND
2,2,16820,"['amazon', 'investigating', 'chinese', 'employ...",NOT,,
3,3,62688,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF,UNT,
4,4,43605,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT,,
...,...,...,...,...,...,...
13235,13235,95338,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF,TIN,IND
13236,13236,67210,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT,,
13237,13237,82921,"['report', 'garbage', 'dont', 'give', 'crap']",OFF,TIN,OTH
13238,13238,27429,['pussy'],OFF,UNT,


## Removing Redundant Axes

In [8]:
df = df.drop(['Unnamed: 0', 'subtask_b', 'subtask_c', 'id'], axis=1)

In [9]:
df

Unnamed: 0,tweet,subtask_a
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


## Renaming Columns

In [10]:
df = df.rename(columns={'subtask_a': 'Offensive'})

In [11]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


## Converting Offensive to Numerical Value

In [12]:
def off(cls):
    if cls =='OFF':
        return 1
    elif cls == 'NOT':
        return 0

In [13]:
df["Offensive"] = df["Offensive"].apply(off)

In [14]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",1
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",1
2,"['amazon', 'investigating', 'chinese', 'employ...",0
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",1
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",0
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",1
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",0
13237,"['report', 'garbage', 'dont', 'give', 'crap']",1
13238,['pussy'],1


In [15]:
df['tweet']

0                    ['ask', 'native', 'american', 'take']
1        ['go', 'home', '’', 'drunk', 'maga', 'trump', ...
2        ['amazon', 'investigating', 'chinese', 'employ...
3        ['someone', 'shouldve', 'taken', 'piece', 'shi...
4        ['obama', 'wanted', 'liberal', 'amp', 'illegal...
                               ...                        
13235    ['sometimes', 'get', 'strong', 'vibe', 'people...
13236    ['benidorm', '✅', 'creamfields', '✅', 'maga', ...
13237        ['report', 'garbage', 'dont', 'give', 'crap']
13238                                            ['pussy']
13239    ['spanishrevenge', 'v', 'justice', 'human', 'r...
Name: tweet, Length: 13240, dtype: object

## Splitting into train and test

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], random_state=0)

In [17]:
X_train

3759             ['friend', 'father', 'safe', 'talk', '”']
1623                                   ['president', 'im']
8650     ['depending', 'government', 'politician', 'fur...
7897     ['im', 'sure', 'really', 'low', 'carb', 'diet'...
365                   ['welcome', 'why', 'james', 'cryin']
                               ...                        
11365    ['nra', 'supported', 'gun', 'control', 'reagan...
8409     ['quite', 'good', 'faking', 'must', 'done', 'n...
7883                                             ['thank']
10443    ['abundantly', 'clear', 'lack', 'common', 'sen...
10972                 ['there', 'brexit', '👇', '🏻', 'url']
Name: tweet, Length: 9930, dtype: object

In [18]:
X_test

9240     ['’', 'still', 'listening', '—', 'hear', 'mumb...
13059            ['well', 'born', '“', '”', 'dude', 'url']
11361    ['surprised', 'know', 'liberal', 'sacrifice', ...
3809     ['volient', 'action', 'done', 'defend', 'one',...
3565     ['🛑', 'truthfeed', 'news', '🛑', '👉', 'fbi', 'd...
                               ...                        
1126                                 ['bird', 'as', 'url']
7276     ['divert', 'defamewe', 'voted', 'get', 'thing'...
1426     ['alumnus', 'standing', 'christine', 'blasey',...
12266    ['wow', 'didnt', 'look', 'permanently', 'suspe...
10325                      ['thank', 'daughter', 'future']
Name: tweet, Length: 3310, dtype: object

In [19]:
y_train

3759     0
1623     0
8650     0
7897     0
365      0
        ..
11365    0
8409     1
7883     0
10443    0
10972    0
Name: Offensive, Length: 9930, dtype: int64

In [20]:
y_test

9240     0
13059    0
11361    0
3809     0
3565     1
        ..
1126     1
7276     0
1426     1
12266    1
10325    0
Name: Offensive, Length: 3310, dtype: int64

In [21]:
rng = np.random.RandomState(42)
random_unlabeled_points = rng.rand(y_train.shape[0]) < 0.3

In [22]:
random_unlabeled_points

array([False, False, False, ..., False, False, False])

In [23]:
y_train[random_unlabeled_points] = -1

In [24]:
y_train[random_unlabeled_points]

365     -1
2751    -1
7722    -1
8023    -1
7274    -1
        ..
10558   -1
6368    -1
7833    -1
9870    -1
11365   -1
Name: Offensive, Length: 3032, dtype: int64

In [25]:
y_train

3759     0
1623     0
8650     0
7897     0
365     -1
        ..
11365   -1
8409     1
7883     0
10443    0
10972    0
Name: Offensive, Length: 9930, dtype: int64

### TF-IDF

In [26]:
X_train_tf = tfidf.transform(X_train.sort_index())
print(X_train_tf)

  (0, 90335)	0.5384430828382877
  (0, 8544)	0.5958519306604624
  (0, 8543)	0.5958519306604624
  (1, 61615)	0.5
  (1, 61614)	0.5
  (1, 39779)	0.5
  (1, 39778)	0.5
  (2, 145820)	0.31916827891130445
  (2, 145819)	0.31916827891130445
  (2, 109653)	0.2884172110522567
  (2, 93482)	0.31916827891130445
  (2, 93481)	0.31916827891130445
  (2, 75129)	0.31916827891130445
  (2, 64184)	0.31916827891130445
  (2, 64183)	0.31916827891130445
  (2, 4902)	0.31916827891130445
  (2, 4901)	0.31916827891130445
  (3, 75838)	0.7071067811865476
  (3, 71905)	0.7071067811865476
  (4, 141686)	0.44398266100968137
  (4, 131741)	0.41907298524281483
  (4, 120388)	0.4637545829882441
  (4, 78480)	0.44398266100968137
  (4, 67452)	0.4637545829882441
  (5, 63427)	0.7071067811865476
  :	:
  (9929, 123888)	0.1765599555837432
  (9929, 123887)	0.1765599555837432
  (9929, 123883)	0.16903241883392575
  (9929, 123882)	0.16903241883392575
  (9929, 123881)	0.3273830930277344
  (9929, 119405)	0.1765599555837432
  (9929, 119403)	0.169

In [27]:
X_test_tf = tfidf.transform(X_test.sort_index())

### CBOW

In [28]:
#Sentence vectoriser - Average of sentence for each vector
def sent_vect(sent, model):
    sent = eval(sent)
    vec = np.zeros(32)
    num = 0
    for w in sent:
        try:
            temp_vec = model.wv.get_vector(w)
            #print(temp_vec)
            vec = np.add(vec, temp_vec)
            num += 1
        except:
            pass
        
    return vec / np.sqrt(vec.dot(vec))

In [29]:
X_train_cbow = X_train.apply(sent_vect, model=cbow).sort_index()
print(X_train_cbow)

0        [0.11678230874736859, 0.04701858112529256, 0.1...
1        [0.023887964134799274, -0.04221032997250825, 0...
4        [0.023089552048990755, -0.036891745280832305, ...
5        [0.08365669444110241, 0.024382291167563585, 0....
7        [0.1692174577116111, -0.028866439854166506, 0....
                               ...                        
13232    [0.05910713569141994, 0.03033006832724931, 0.2...
13235    [0.1772960903524207, 0.06816052902725968, 0.19...
13236    [-0.07846772515660165, -0.09172202196583056, 0...
13237    [0.1720133490472096, 0.0603833914465248, 0.166...
13239    [0.029200375665037092, -0.027191909181750278, ...
Name: tweet, Length: 9930, dtype: object


In [30]:
X_test_cbow = X_test.apply(sent_vect, model=cbow).sort_index()

## Model fitting

In [38]:
svc = SVC(probability=True, gamma="auto")
self_training_model = SelfTrainingClassifier(svc, criterion='threshold')

In [40]:
self_training_model.fit(X_train_tf, y_train)

SelfTrainingClassifier(base_estimator=SVC(gamma='auto', probability=True))

In [41]:
self_training_model.score(X_test_tf, y_test)

0.6676737160120846