# Tf-Idf Model



Calculates the term frequency or the number of occurences of a word in a document, and the idf of a word which is the Log(Number of Documents/Number of documents the word appears in. On multiplying the two, one gets Tf-Idf. Tf-Idf is similar to how relevant a word is to a documents class as it calculates the number of times in occurs in a particular document and the number of documents it occurs in as well.

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
import numpy as np
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.svm import SVC
from sklearn.semi_supervised import SelfTrainingClassifier

## Importing Data

In [2]:
df = pd.read_csv("processed_train.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,0,86426,"['ask', 'native', 'american', 'take']",OFF,UNT,
1,1,90194,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF,TIN,IND
2,2,16820,"['amazon', 'investigating', 'chinese', 'employ...",NOT,,
3,3,62688,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF,UNT,
4,4,43605,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT,,
...,...,...,...,...,...,...
13235,13235,95338,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF,TIN,IND
13236,13236,67210,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT,,
13237,13237,82921,"['report', 'garbage', 'dont', 'give', 'crap']",OFF,TIN,OTH
13238,13238,27429,['pussy'],OFF,UNT,


## Removing Redundant Axes

In [4]:
df = df.drop(['Unnamed: 0', 'subtask_b', 'subtask_c', 'id'], axis=1)

In [5]:
df

Unnamed: 0,tweet,subtask_a
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


## Renaming Columns

In [6]:
df = df.rename(columns={'subtask_a': 'Offensive'})

In [7]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


## Converting Offensive to Numerical Value

In [8]:
def off(cls):
    if cls =='OFF':
        return 1
    return 0

In [9]:
df['Offensive'] = df['Offensive'].apply(off)

In [10]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",1
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",1
2,"['amazon', 'investigating', 'chinese', 'employ...",0
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",1
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",0
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",1
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",0
13237,"['report', 'garbage', 'dont', 'give', 'crap']",1
13238,['pussy'],1


In [11]:
df['tweet'][0]

"['ask', 'native', 'american', 'take']"

## Train Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], shuffle=0)

In [13]:
X_train

2918               ['antifa', 'brown', 'shirt', 'special']
10643    ['’', 'embarrassment', 'republican', 'party', ...
322      ['stratum', 'data', 'ai', 'data', 'hairball', ...
2800     ['wassamatta', 'starting', 'wet', 'pant', 'wor...
9825                                    ['well', 'dumped']
                               ...                        
7313                             ['lmaooo', 'wise', 'ily']
8012     ['know', 'look', 'outside', 'box', 'like', 'ru...
3865     ['sir', 'governor', 'house', 'lahore', 'used',...
10554    ['going', 'deflect', 'away', 'nearly', 'dead',...
11747    ['historic', 'moniker', 'tory', 'stupid', 'par...
Name: tweet, Length: 9930, dtype: object

In [14]:
y_train

2918     0
10643    1
322      1
2800     1
9825     0
        ..
7313     0
8012     1
3865     0
10554    1
11747    0
Name: Offensive, Length: 9930, dtype: int64

## Tf-Idf Model

In [15]:
vect = TfidfVectorizer(analyzer='word', stop_words='english')

In [16]:
X_train_vect = vect.fit_transform(X_train)

In [17]:
X_train_vect

<9930x14397 sparse matrix of type '<class 'numpy.float64'>'
	with 89196 stored elements in Compressed Sparse Row format>

In [18]:
vect.get_feature_names()

['aa',
 'aaa',
 'aaah',
 'aaay',
 'aalayah',
 'aaron',
 'aarp',
 'aasertions',
 'ab',
 'aba',
 'ababzhah',
 'aback',
 'abandon',
 'abbott',
 'abc',
 'abetterway',
 'abetting',
 'abhorrent',
 'abhorres',
 'abi',
 'abide',
 'abiding',
 'abigot',
 'ability',
 'abject',
 'able',
 'ableg',
 'abnormal',
 'aboard',
 'aboilish',
 'abolish',
 'abolition',
 'abominable',
 'abomination',
 'abort',
 'aborting',
 'abortion',
 'abortionbecause',
 'abortionnot',
 'abortionwar',
 'abortive',
 'abound',
 'abramoff',
 'abroad',
 'absence',
 'absentee',
 'abso',
 'absofuckinglutely',
 'absolute',
 'absolutecriminals',
 'absolutely',
 'absolutist',
 'absurd',
 'abt',
 'abundance',
 'abundantly',
 'abuse',
 'abused',
 'abuser',
 'abusing',
 'abusive',
 'abysmal',
 'abyss',
 'ac',
 'aca',
 'academia',
 'academic',
 'academy',
 'accent',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'accepts',
 'access',
 'accessory',
 'accident',
 'accidental',
 'accidentally',
 'accomplish',
 'accomp

## Classifying Model

In [19]:
clf = LogisticRegression()

In [20]:
C = list(np.arange(0, 1, 0.05))

C = [float(i) for i in C]

C = C[1:]

C

[0.05,
 0.1,
 0.15000000000000002,
 0.2,
 0.25,
 0.30000000000000004,
 0.35000000000000003,
 0.4,
 0.45,
 0.5,
 0.55,
 0.6000000000000001,
 0.65,
 0.7000000000000001,
 0.75,
 0.8,
 0.8500000000000001,
 0.9,
 0.9500000000000001]

In [21]:
scores = dict()

In [22]:
for i in C:
    clf = LogisticRegression(C= i, max_iter=1000)
    clf.fit(X_train_vect, y_train)
    scores[i] = clf.score(vect.transform(X_test), y_test)
    
scores

{0.05: 0.6782477341389728,
 0.1: 0.6939577039274925,
 0.15000000000000002: 0.7084592145015106,
 0.2: 0.7169184290030212,
 0.25: 0.7229607250755287,
 0.30000000000000004: 0.7308157099697885,
 0.35000000000000003: 0.7359516616314199,
 0.4: 0.7410876132930514,
 0.45: 0.7453172205438067,
 0.5: 0.7474320241691843,
 0.55: 0.7510574018126889,
 0.6000000000000001: 0.7525679758308157,
 0.65: 0.7540785498489426,
 0.7000000000000001: 0.7555891238670694,
 0.75: 0.7598187311178247,
 0.8: 0.7595166163141994,
 0.8500000000000001: 0.7595166163141994,
 0.9: 0.760725075528701,
 0.9500000000000001: 0.7616314199395771}

In [23]:
best_C = max(scores, key=scores.get)

print(best_C)

0.9500000000000001


In [24]:
clf = LogisticRegression(C=best_C)

In [25]:
clf.fit(X_train_vect, y_train)

LogisticRegression(C=0.9500000000000001)

In [26]:
clf.score(vect.transform(X_test), y_test)

0.7616314199395771

In [27]:
features = np.array(vect.get_feature_names())

In [28]:
coefs = clf.coef_[0].argsort()

In [29]:
print("Smallest Coefs \n{}".format(features[coefs[:10]]))
print("Largest Coefs \n{}".format(features[coefs[-11:-1]]))

Smallest Coefs 
['antifa' 'thank' 'beautiful' 'new' 'best' 'love' 'thanks' 'url' 'brexit'
 'welcome']
Largest Coefs 
['sick' 'nigga' 'disgusting' 'liar' 'suck' 'fucking' 'idiot' 'stupid'
 'bitch' 'fuck']


## Tf-Idf with Bigrams and Trigrams

Adding max df of 5 to ignore very popular words (words that appear in over 50% of documents)

In [30]:
vect_gram = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range = (1, 3), max_df=5)

In [31]:
X_train_gram = vect_gram.fit_transform(X_train)

In [32]:
X_train_gram

<9930x153859 sparse matrix of type '<class 'numpy.float64'>'
	with 170431 stored elements in Compressed Sparse Row format>

In [33]:
vect_gram.get_feature_names()

['aa',
 'aa mr',
 'aa mr bean',
 'aa sary',
 'aa sary liberal',
 'aaa',
 'aaa aaay',
 'aaa aaay eer',
 'aaa exciting',
 'aaa exciting majority',
 'aaah',
 'aaah thank',
 'aaah thank sm',
 'aaay',
 'aaay eer',
 'aaay eer url',
 'aalayah',
 'aalayah express',
 'aalayah express proud',
 'aaron',
 'aaron hernandez',
 'aaron hernandez know',
 'aaron rodgers',
 'aaron rodgers win',
 'aarp',
 'aarp stop',
 'aarp stop biased',
 'aasertions',
 'aasertions voting',
 'aasertions voting amp',
 'ab',
 'ab confirm',
 'ab confirm judge',
 'ab dumb',
 'ab dumb shane',
 'ab lot',
 'ab lot defined',
 'ab player',
 'ab player today',
 'ab winning',
 'ab winning rugby',
 'aba',
 'aba leadership',
 'aba leadership champion',
 'ababzhah',
 'ababzhah talking',
 'ababzhah talking sex',
 'aback',
 'abandon',
 'abandon brexit',
 'abandon brexit liberal',
 'abandon conservative',
 'abandon conservative want',
 'abbott',
 'abbott meeting',
 'abbott meeting solving',
 'abc',
 'abc amp',
 'abc amp damn',
 'abc let'

In [34]:
clf_gram = LogisticRegression()

In [35]:
scores_gram = dict()

In [36]:
for i in C:
    clf_gram = LogisticRegression(C= i, max_iter=1000)
    clf_gram.fit(X_train_vect, y_train)
    scores_gram[i] = clf_gram.score(vect.transform(X_test), y_test)
    
scores_gram

{0.05: 0.6782477341389728,
 0.1: 0.6939577039274925,
 0.15000000000000002: 0.7084592145015106,
 0.2: 0.7169184290030212,
 0.25: 0.7229607250755287,
 0.30000000000000004: 0.7308157099697885,
 0.35000000000000003: 0.7359516616314199,
 0.4: 0.7410876132930514,
 0.45: 0.7453172205438067,
 0.5: 0.7474320241691843,
 0.55: 0.7510574018126889,
 0.6000000000000001: 0.7525679758308157,
 0.65: 0.7540785498489426,
 0.7000000000000001: 0.7555891238670694,
 0.75: 0.7598187311178247,
 0.8: 0.7595166163141994,
 0.8500000000000001: 0.7595166163141994,
 0.9: 0.760725075528701,
 0.9500000000000001: 0.7616314199395771}

In [37]:
best_C_gram = max(scores, key=scores.get)

print(best_C_gram)

0.9500000000000001


In [38]:
clf_gram = LogisticRegression(C=best_C)

In [39]:
clf_gram.fit(X_train_vect, y_train)

LogisticRegression(C=0.9500000000000001)

In [40]:
clf_gram.score(vect.transform(X_test), y_test)

0.7616314199395771

In [41]:
features_gram = np.array(vect_gram.get_feature_names())

In [42]:
coefs_gram = clf_gram.coef_[0].argsort()

In [43]:
print("Smallest Coefs \n{}".format(features[coefs_gram[:10]]))
print("Largest Coefs \n{}".format(features[coefs_gram[-11:-1]]))

Smallest Coefs 
['antifa' 'thank' 'beautiful' 'new' 'best' 'love' 'thanks' 'url' 'brexit'
 'welcome']
Largest Coefs 
['sick' 'nigga' 'disgusting' 'liar' 'suck' 'fucking' 'idiot' 'stupid'
 'bitch' 'fuck']
