# Bag Of Words Model

Uses one-hot encoding to count number of occurences of a word in a given document. Disadvantage is that most words are not repeated in every document leading to a sparse matrix and also does not take into account the structure of a sentence, i.e., the order of words or semantics.

"Not bad, working good" and "Not good, working bad" would be treated as the same

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [91]:
df = pd.read_csv("processed_train.csv")

In [92]:
df

Unnamed: 0.1,Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,0,86426,"['ask', 'native', 'american', 'take']",OFF,UNT,
1,1,90194,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF,TIN,IND
2,2,16820,"['amazon', 'investigating', 'chinese', 'employ...",NOT,,
3,3,62688,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF,UNT,
4,4,43605,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT,,
...,...,...,...,...,...,...
13235,13235,95338,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF,TIN,IND
13236,13236,67210,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT,,
13237,13237,82921,"['report', 'garbage', 'dont', 'give', 'crap']",OFF,TIN,OTH
13238,13238,27429,['pussy'],OFF,UNT,


### Removing redundant axis

In [93]:
df = df.drop("Unnamed: 0", axis=1)

In [94]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,"['ask', 'native', 'american', 'take']",OFF,UNT,
1,90194,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF,TIN,IND
2,16820,"['amazon', 'investigating', 'chinese', 'employ...",NOT,,
3,62688,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF,UNT,
4,43605,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT,,
...,...,...,...,...,...
13235,95338,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF,TIN,IND
13236,67210,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT,,
13237,82921,"['report', 'garbage', 'dont', 'give', 'crap']",OFF,TIN,OTH
13238,27429,['pussy'],OFF,UNT,


### Removing Unnecessary Labels

In [95]:
df = df.drop(["subtask_b", "subtask_c", "id"], axis=1)

In [96]:
df

Unnamed: 0,tweet,subtask_a
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


### Renaming Column

In [97]:
df = df.rename(columns={"subtask_a": "Offensive"})

In [98]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


### Replacing Class with Numeric Value

In [99]:
def repl(off):
    if off == 'OFF':
        return 1
    return 0

In [100]:
df['Offensive'] = df['Offensive'].apply(repl)

In [101]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",1
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",1
2,"['amazon', 'investigating', 'chinese', 'employ...",0
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",1
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",0
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",1
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",0
13237,"['report', 'garbage', 'dont', 'give', 'crap']",1
13238,['pussy'],1


In [102]:
df['tweet']

0                    ['ask', 'native', 'american', 'take']
1        ['go', 'home', '’', 'drunk', 'maga', 'trump', ...
2        ['amazon', 'investigating', 'chinese', 'employ...
3        ['someone', 'shouldve', 'taken', 'piece', 'shi...
4        ['obama', 'wanted', 'liberal', 'amp', 'illegal...
                               ...                        
13235    ['sometimes', 'get', 'strong', 'vibe', 'people...
13236    ['benidorm', '✅', 'creamfields', '✅', 'maga', ...
13237        ['report', 'garbage', 'dont', 'give', 'crap']
13238                                            ['pussy']
13239    ['spanishrevenge', 'v', 'justice', 'human', 'r...
Name: tweet, Length: 13240, dtype: object

### Train-Test Split

In [103]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], shuffle=0)

In [104]:
X_train

10485    ['chicago', 'record', 'number', 'homicide', 'g...
227      ['aha', 'yes', 'see', 'individual', 'sense', '...
446      ['yeah', 'think', 'saying', 'addiction', 'life...
4830     ['antikavanaugh', 'group', 'already', 'spinnin...
6141     ['hope', 'theyre', 'brushed', 'large', 'negati...
                               ...                        
8265               ['like', 'boob', 'mass', 'belly', 'xd']
4815     ['civilian', 'also', 'ill', 'admit', 'hard', '...
12445    ['good', 'thats', 'fine', 'gun', 'control', 'd...
11772                       ['oh', 'shit', 'stay', 'safe']
5486     ['twitter', 'social', 'medium', 'seems', 'make...
Name: tweet, Length: 9930, dtype: object

In [105]:
y_train

10485    1
227      0
446      0
4830     0
6141     1
        ..
8265     1
4815     0
12445    1
11772    0
5486     0
Name: Offensive, Length: 9930, dtype: int64

## BoW Model

In [106]:
vect = CountVectorizer(lowercase=False)

In [107]:
vect = vect.fit(X_train)

In [108]:
len(vect.get_feature_names())

14572

In [109]:
X_train_vect = vect.transform(X_train)

X_train_vect

<9930x14572 sparse matrix of type '<class 'numpy.int64'>'
	with 99161 stored elements in Compressed Sparse Row format>

In [110]:
from sklearn.linear_model import LogisticRegression

In [147]:
C = list(np.arange(0, 1, 0.05))

C = [float(i) for i in C]

C = C[1:]

C

[0.05,
 0.1,
 0.15000000000000002,
 0.2,
 0.25,
 0.30000000000000004,
 0.35000000000000003,
 0.4,
 0.45,
 0.5,
 0.55,
 0.6000000000000001,
 0.65,
 0.7000000000000001,
 0.75,
 0.8,
 0.8500000000000001,
 0.9,
 0.9500000000000001]

In [149]:
scores = dict()

In [150]:
for i in C:
    clf = LogisticRegression(C= i, max_iter=1000)
    clf.fit(X_train_vect, y_train)
    scores[i] = clf.score(vect.transform(X_test), y_test)
    
scores

{0.05: 0.7477341389728097,
 0.1: 0.7595166163141994,
 0.15000000000000002: 0.7601208459214501,
 0.2: 0.7640483383685801,
 0.25: 0.766465256797583,
 0.30000000000000004: 0.7673716012084593,
 0.35000000000000003: 0.7655589123867069,
 0.4: 0.7661631419939577,
 0.45: 0.7637462235649547,
 0.5: 0.7631419939577039,
 0.55: 0.7637462235649547,
 0.6000000000000001: 0.7646525679758308,
 0.65: 0.7652567975830815,
 0.7000000000000001: 0.7655589123867069,
 0.75: 0.7673716012084593,
 0.8: 0.7661631419939577,
 0.8500000000000001: 0.766465256797583,
 0.9: 0.766465256797583,
 0.9500000000000001: 0.7658610271903323}

In [151]:
best_C = max(scores, key=scores.get)

print(best_C)

0.30000000000000004


In [161]:
clf = LogisticRegression(C= 0.3, max_iter=1000)

In [162]:
clf.fit(X_train_vect, y_train)

LogisticRegression(C=0.3, max_iter=1000)

In [163]:
clf.score(vect.transform(X_test), y_test)

0.7673716012084593

In [164]:
features = np.array(vect.get_feature_names())

In [165]:
coefs = clf.coef_[0].argsort()

In [166]:
print("Smallest Coefs \n{}".format(features[coefs[:10]]))
print("Largest Coefs \n{}".format(features[coefs[-11:-1]]))

Smallest Coefs 
['thank' 'justice' 'lead' 'best' 'brexit' 'welcome' 'nike' 'action'
 'accuser' 'company']
Largest Coefs 
['nigga' 'suck' 'disgusting' 'liar' 'as' 'idiot' 'fucking' 'stupid'
 'bitch' 'shit']
