# Bag Of Words Model

Uses one-hot encoding to count number of occurences of a word in a given document. Disadvantage is that most words are not repeated in every document leading to a sparse matrix and also does not take into account the structure of a sentence, i.e., the order of words or semantics.

"Not bad, working good" and "Not good, working bad" would be treated as the same

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("processed_train.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,0,86426,"['ask', 'native', 'american', 'take']",OFF,UNT,
1,1,90194,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF,TIN,IND
2,2,16820,"['amazon', 'investigating', 'chinese', 'employ...",NOT,,
3,3,62688,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF,UNT,
4,4,43605,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT,,
...,...,...,...,...,...,...
13235,13235,95338,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF,TIN,IND
13236,13236,67210,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT,,
13237,13237,82921,"['report', 'garbage', 'dont', 'give', 'crap']",OFF,TIN,OTH
13238,13238,27429,['pussy'],OFF,UNT,


### Removing redundant axis

In [4]:
df = df.drop("Unnamed: 0", axis=1)

In [5]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,"['ask', 'native', 'american', 'take']",OFF,UNT,
1,90194,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF,TIN,IND
2,16820,"['amazon', 'investigating', 'chinese', 'employ...",NOT,,
3,62688,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF,UNT,
4,43605,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT,,
...,...,...,...,...,...
13235,95338,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF,TIN,IND
13236,67210,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT,,
13237,82921,"['report', 'garbage', 'dont', 'give', 'crap']",OFF,TIN,OTH
13238,27429,['pussy'],OFF,UNT,


### Removing Unnecessary Labels

In [6]:
df = df.drop(["subtask_b", "subtask_c", "id"], axis=1)

In [7]:
df

Unnamed: 0,tweet,subtask_a
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


### Renaming Column

In [8]:
df = df.rename(columns={"subtask_a": "Offensive"})

In [9]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


### Replacing Class with Numeric Value

In [10]:
def repl(off):
    if off == 'OFF':
        return 1
    return 0

In [11]:
df['Offensive'] = df['Offensive'].apply(repl)

In [12]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",1
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",1
2,"['amazon', 'investigating', 'chinese', 'employ...",0
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",1
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",0
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",1
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",0
13237,"['report', 'garbage', 'dont', 'give', 'crap']",1
13238,['pussy'],1


In [13]:
df['tweet']

0                    ['ask', 'native', 'american', 'take']
1        ['go', 'home', '’', 'drunk', 'maga', 'trump', ...
2        ['amazon', 'investigating', 'chinese', 'employ...
3        ['someone', 'shouldve', 'taken', 'piece', 'shi...
4        ['obama', 'wanted', 'liberal', 'amp', 'illegal...
                               ...                        
13235    ['sometimes', 'get', 'strong', 'vibe', 'people...
13236    ['benidorm', '✅', 'creamfields', '✅', 'maga', ...
13237        ['report', 'garbage', 'dont', 'give', 'crap']
13238                                            ['pussy']
13239    ['spanishrevenge', 'v', 'justice', 'human', 'r...
Name: tweet, Length: 13240, dtype: object

### Train-Test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], shuffle=0)

In [15]:
X_train

754                        ['awesome', 'thankful', 'know']
6255                                  ['god', 'help', 'u']
3203                                     ['ball', 'court']
8671     ['suppose', 'antitrumps', 'accepted', 'loss', ...
5988            ['another', 'billionaire', 'buffoon', '😎']
                               ...                        
10127             ['prayer', '❤', '️', '🙏', '🏻', 'missed']
1272            ['suggest', 'using', 'handcuff', 'remove']
4167     ['awww', 'kaise', 'bache', 'ki', 'tarah', 'hol...
242      ['dont', 'forget', 'telling', 'primarily', 'bl...
1144                             ['look', 'cranky', 'url']
Name: tweet, Length: 9930, dtype: object

In [16]:
y_train

754      0
6255     0
3203     1
8671     0
5988     0
        ..
10127    0
1272     1
4167     0
242      0
1144     0
Name: Offensive, Length: 9930, dtype: int64

## BoW Model

In [17]:
vect = CountVectorizer(lowercase=False)

In [18]:
vect = vect.fit(X_train)

In [19]:
len(vect.get_feature_names())

14699

In [20]:
X_train_vect = vect.transform(X_train)

X_train_vect

<9930x14699 sparse matrix of type '<class 'numpy.int64'>'
	with 99138 stored elements in Compressed Sparse Row format>

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
C = list(np.arange(0, 1, 0.05))

C = [float(i) for i in C]

C = C[1:]

C

[0.05,
 0.1,
 0.15000000000000002,
 0.2,
 0.25,
 0.30000000000000004,
 0.35000000000000003,
 0.4,
 0.45,
 0.5,
 0.55,
 0.6000000000000001,
 0.65,
 0.7000000000000001,
 0.75,
 0.8,
 0.8500000000000001,
 0.9,
 0.9500000000000001]

In [23]:
scores = dict()

In [24]:
for i in C:
    clf = LogisticRegression(C= i, max_iter=1000)
    clf.fit(X_train_vect, y_train)
    scores[i] = clf.score(vect.transform(X_test), y_test)
    
scores

{0.05: 0.7447129909365559,
 0.1: 0.7570996978851964,
 0.15000000000000002: 0.7586102719033233,
 0.2: 0.7601208459214501,
 0.25: 0.7622356495468278,
 0.30000000000000004: 0.7628398791540786,
 0.35000000000000003: 0.7628398791540786,
 0.4: 0.7625377643504532,
 0.45: 0.7616314199395771,
 0.5: 0.7610271903323262,
 0.55: 0.759214501510574,
 0.6000000000000001: 0.7604229607250755,
 0.65: 0.7586102719033233,
 0.7000000000000001: 0.7586102719033233,
 0.75: 0.7580060422960725,
 0.8: 0.7574018126888218,
 0.8500000000000001: 0.7580060422960725,
 0.9: 0.7589123867069486,
 0.9500000000000001: 0.7586102719033233}

In [25]:
best_C = max(scores, key=scores.get)

print(best_C)

0.30000000000000004


In [26]:
clf = LogisticRegression(C= 0.3, max_iter=1000)

In [27]:
clf.fit(X_train_vect, y_train)

LogisticRegression(C=0.3, max_iter=1000)

In [28]:
clf.score(vect.transform(X_test), y_test)

0.7628398791540786

In [29]:
features = np.array(vect.get_feature_names())

In [30]:
coefs = clf.coef_[0].argsort()

In [31]:
print("Smallest Coefs \n{}".format(features[coefs[:10]]))
print("Largest Coefs \n{}".format(features[coefs[-11:-1]]))

Smallest Coefs 
['best' 'brexit' 'thank' 'nike' 'lead' 'awesome' 'accuser' 'thanks'
 'excuse' 'action']
Largest Coefs 
['nigga' 'liar' 'fucked' 'suck' 'as' 'fucking' 'idiot' 'stupid' 'shit'
 'bitch']


## BoW with Bigrams and Trigrams

In [32]:
vect_gram = CountVectorizer(lowercase=False, ngram_range=(1, 3), min_df=5)

In [33]:
vect_gram = vect_gram.fit(X_train)

In [34]:
len(vect_gram.get_feature_names())

3862

In [35]:
vect_gram.get_feature_names()

['aaron',
 'ab',
 'abiding',
 'abiding citizen',
 'ability',
 'able',
 'abortion',
 'absolute',
 'absolutely',
 'abt',
 'abuse',
 'abuse power',
 'abused',
 'abuser',
 'accept',
 'acceptable',
 'accepting',
 'access',
 'accident',
 'accomplished',
 'according',
 'account',
 'accountable',
 'accusation',
 'accuse',
 'accused',
 'accuser',
 'accuses',
 'accusing',
 'achieve',
 'achievement',
 'acknowledge',
 'across',
 'act',
 'act like',
 'acting',
 'acting like',
 'action',
 'active',
 'actively',
 'activist',
 'activity',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'add',
 'added',
 'address',
 'administration',
 'admire',
 'admit',
 'admitted',
 'adopt',
 'adorable',
 'adult',
 'advantage',
 'advice',
 'advocate',
 'advocating',
 'af',
 'affair',
 'affect',
 'afford',
 'affordable',
 'afraid',
 'africa',
 'african',
 'ag',
 'age',
 'agency',
 'agenda',
 'agent',
 'aggressive',
 'ago',
 'agree',
 'agreed',
 'ah',
 'ahead',
 'ahhh',
 'aim',
 'aint',
 'air',
 'aka',
 

In [36]:
X_train_vect = vect_gram.transform(X_train)

X_train_vect

<9930x3862 sparse matrix of type '<class 'numpy.int64'>'
	with 90736 stored elements in Compressed Sparse Row format>

In [37]:
scores_gram = dict()

In [38]:
for i in C:
    clf = LogisticRegression(C= i, max_iter=1000)
    clf.fit(X_train_vect, y_train)
    scores_gram[i] = clf.score(vect_gram.transform(X_test), y_test)
    
scores_gram

{0.05: 0.7447129909365559,
 0.1: 0.7564954682779457,
 0.15000000000000002: 0.7574018126888218,
 0.2: 0.7628398791540786,
 0.25: 0.7625377643504532,
 0.30000000000000004: 0.7634441087613293,
 0.35000000000000003: 0.7616314199395771,
 0.4: 0.760725075528701,
 0.45: 0.7604229607250755,
 0.5: 0.7583081570996979,
 0.55: 0.7580060422960725,
 0.6000000000000001: 0.7574018126888218,
 0.65: 0.7564954682779457,
 0.7000000000000001: 0.7558912386706949,
 0.75: 0.7546827794561933,
 0.8: 0.7525679758308157,
 0.8500000000000001: 0.7501510574018126,
 0.9: 0.7507552870090635,
 0.9500000000000001: 0.7501510574018126}

In [39]:
best_C_gram = max(scores_gram, key=scores.get)

print(best_C_gram)

0.30000000000000004


In [40]:
clf = LogisticRegression(C= 0.3, max_iter=1000)

In [41]:
clf.fit(X_train_vect, y_train)

LogisticRegression(C=0.3, max_iter=1000)

In [42]:
clf.score(vect_gram.transform(X_test), y_test)

0.7634441087613293

In [43]:
features = np.array(vect_gram.get_feature_names())

In [44]:
coefs = clf.coef_[0].argsort()

In [45]:
print("Smallest Coefs \n{}".format(features[coefs[:10]]))
print("Largest Coefs \n{}".format(features[coefs[-11:-1]]))

Smallest Coefs 
['brexit' 'best' 'thank' 'nike' 'excuse' 'action' 'lead' 'awesome'
 'accuser' 'thanks']
Largest Coefs 
['liar' 'nigga' 'fucked' 'suck' 'as' 'fucking' 'idiot' 'stupid' 'bitch'
 'shit']


## SVM


In [46]:
from sklearn.svm import SVC

In [53]:
params = {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.05, 0.01, 0.1, 1, 5], 'gamma': [0.01, 0.1, 1, 5, 10,], 'degree': [1, 2, 3]}

In [54]:
clf = SVC()

In [55]:
clf.fit(X_train_vect, y_train)

SVC()

In [56]:
clf.score(vect_gram.transform(X_test), y_test)

0.7522658610271903

In [57]:
gsc = GridSearchCV(clf, param_grid=params, n_jobs=-1)

In [58]:
grid_result = gsc.fit(X_train_vect, y_train)

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.


In [None]:
best_params = grid_result.best_params_
best_params

In [None]:
grid_result.best_score_

In [None]:
clf = SVC(C=best_params['C'], gamma=best_params['gamma'], kernel=best_params['kernel'], probability = True, degree=best_params['degree'])
clf