In [9]:
import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import pickle

## Load Models

### TF-IDF

In [11]:
with open ("tfidf.pk", 'rb') as fin:
    tfidf = pickle.load(fin)

In [12]:
tfidf

TfidfVectorizer(max_df=5, ngram_range=(1, 3), stop_words='english')

### Word2Vec CBOW

In [47]:
with open ("w2v_cbow.pk", 'rb') as fin:
    cbow = pickle.load(fin)

In [48]:
cbow

array([-0.16476315, -0.14831376,  0.17662199, -0.29303157, -0.17556453,
        0.17328742,  0.143924  , -0.17163116,  0.1071699 , -0.28073603,
       -0.33374226,  0.25144255,  0.12185895, -0.00150752,  0.12539929,
       -0.01245129,  0.14597881,  0.12934837, -0.1204912 ,  0.01213035,
        0.10884514,  0.04246545,  0.03035891,  0.15162921, -0.23360568,
        0.07388248,  0.00781983, -0.40507412, -0.07809478,  0.07285368,
       -0.26993006,  0.05880246], dtype=float32)

### Word2Vec SkipGram

In [16]:
with open ("w2v_sg.pk", "rb") as fin:
    skg = pickle.load(fin)

In [17]:
skg

array([ 0.10540247,  0.00649637, -0.04158486, -0.0870658 ,  0.04231866,
       -0.02794513, -0.05232182,  0.02245027, -0.07803168,  0.07128473,
       -0.08157565,  0.00467052,  0.05193534,  0.00040218,  0.01184358,
        0.068593  ,  0.142206  ,  0.06369126, -0.04586295,  0.03145951,
       -0.02454194,  0.03166866,  0.08151402, -0.11906867,  0.03597011,
       -0.06184277,  0.00555149, -0.07806904, -0.0264233 ,  0.09112785,
        0.03857002, -0.0733909 ], dtype=float32)

## Importing Data

In [96]:
df = pd.read_csv("processed_train.csv")

In [97]:
df

Unnamed: 0.1,Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,0,86426,"['ask', 'native', 'american', 'take']",OFF,UNT,
1,1,90194,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF,TIN,IND
2,2,16820,"['amazon', 'investigating', 'chinese', 'employ...",NOT,,
3,3,62688,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF,UNT,
4,4,43605,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT,,
...,...,...,...,...,...,...
13235,13235,95338,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF,TIN,IND
13236,13236,67210,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT,,
13237,13237,82921,"['report', 'garbage', 'dont', 'give', 'crap']",OFF,TIN,OTH
13238,13238,27429,['pussy'],OFF,UNT,


### Removing Redundant Axes

In [98]:
df = df.drop(['Unnamed: 0', 'subtask_b', 'subtask_c', 'id'], axis=1)

In [99]:
df

Unnamed: 0,tweet,subtask_a
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


### Renaming Columns

In [100]:
df = df.rename(columns={'subtask_a': 'Offensive'})

In [101]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


### Converting Offensive to Numerical Value

In [102]:
def off(cls):
    if cls =='OFF':
        return 1
    return 0

In [103]:
df["Offensive"] = df["Offensive"].apply(off)

In [104]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",1
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",1
2,"['amazon', 'investigating', 'chinese', 'employ...",0
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",1
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",0
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",1
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",0
13237,"['report', 'garbage', 'dont', 'give', 'crap']",1
13238,['pussy'],1


In [105]:
df['tweet']

0                    ['ask', 'native', 'american', 'take']
1        ['go', 'home', '’', 'drunk', 'maga', 'trump', ...
2        ['amazon', 'investigating', 'chinese', 'employ...
3        ['someone', 'shouldve', 'taken', 'piece', 'shi...
4        ['obama', 'wanted', 'liberal', 'amp', 'illegal...
                               ...                        
13235    ['sometimes', 'get', 'strong', 'vibe', 'people...
13236    ['benidorm', '✅', 'creamfields', '✅', 'maga', ...
13237        ['report', 'garbage', 'dont', 'give', 'crap']
13238                                            ['pussy']
13239    ['spanishrevenge', 'v', 'justice', 'human', 'r...
Name: tweet, Length: 13240, dtype: object

## Fitting Word Models on Data

In [106]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], random_state=0)

### TF-IDF

In [107]:
X_train_tf = tfidf.transform(X_train)

In [108]:
X_test_tf = tfidf.transform(X_test)

### Sentence Vectoriser

Finding the average vector for a given document

In [109]:
def sent_vect(sent, model):
    vec = np.zeros(400)
    num = 0
    for w in sent:
        try:
            vec = np.add(vec, model[w])
            num += 1
        except:
            pass
        
        
    print(num)
    return vec / np.sqrt(vec.dot(vec))

### CBOW

In [112]:
X_train_cbow = pd.eval(X_train).apply(sent_vect, model=cbow)

ValueError: unknown type object

In [56]:
X_train_cbow

3759     [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
1623     [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
8650     [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
7897     [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
365      [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
                               ...                        
11365    [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
8409     [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
7883     [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
10443    [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
10972    [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
Name: tweet, Length: 9930, dtype: object

In [44]:
X_train

3759             ['friend', 'father', 'safe', 'talk', '”']
1623                                   ['president', 'im']
8650     ['depending', 'government', 'politician', 'fur...
7897     ['im', 'sure', 'really', 'low', 'carb', 'diet'...
365                   ['welcome', 'why', 'james', 'cryin']
                               ...                        
11365    ['nra', 'supported', 'gun', 'control', 'reagan...
8409     ['quite', 'good', 'faking', 'must', 'done', 'n...
7883                                             ['thank']
10443    ['abundantly', 'clear', 'lack', 'common', 'sen...
10972                 ['there', 'brexit', '👇', '🏻', 'url']
Name: tweet, Length: 9930, dtype: object

In [72]:
type(df["tweet"][0])

list

In [95]:
sent_vect(df["tweet"][0], cbow)

0


  return vec / np.sqrt(vec.dot(vec))


array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na