# Tf-Idf Model



Calculates the term frequency or the number of occurences of a word in a document, and the idf of a word which is the Log(Number of Documents/Number of documents the word appears in. On multiplying the two, one gets Tf-Idf. Tf-Idf is similar to how relevant a word is to a documents class as it calculates the number of times in occurs in a particular document and the number of documents it occurs in as well.

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
import numpy as np
from sklearn.linear_model import LogisticRegression

## Importing Data

In [7]:
df = pd.read_csv("processed_train.csv")

In [8]:
df

Unnamed: 0.1,Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,0,86426,"['ask', 'native', 'american', 'take']",OFF,UNT,
1,1,90194,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF,TIN,IND
2,2,16820,"['amazon', 'investigating', 'chinese', 'employ...",NOT,,
3,3,62688,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF,UNT,
4,4,43605,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT,,
...,...,...,...,...,...,...
13235,13235,95338,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF,TIN,IND
13236,13236,67210,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT,,
13237,13237,82921,"['report', 'garbage', 'dont', 'give', 'crap']",OFF,TIN,OTH
13238,13238,27429,['pussy'],OFF,UNT,


## Removing Redundant Axes

In [10]:
df = df.drop(['Unnamed: 0', 'subtask_b', 'subtask_c', 'id'], axis=1)

In [11]:
df

Unnamed: 0,tweet,subtask_a
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


## Renaming Columns

In [12]:
df = df.rename(columns={'subtask_a': 'Offensive'})

In [13]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",OFF
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",OFF
2,"['amazon', 'investigating', 'chinese', 'employ...",NOT
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",OFF
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",NOT
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",OFF
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",NOT
13237,"['report', 'garbage', 'dont', 'give', 'crap']",OFF
13238,['pussy'],OFF


## Converting Offensive to Numerical Value

In [14]:
def off(cls):
    if cls =='OFF':
        return 1
    return 0

In [15]:
df['Offensive'] = df['Offensive'].apply(off)

In [16]:
df

Unnamed: 0,tweet,Offensive
0,"['ask', 'native', 'american', 'take']",1
1,"['go', 'home', '’', 'drunk', 'maga', 'trump', ...",1
2,"['amazon', 'investigating', 'chinese', 'employ...",0
3,"['someone', 'shouldve', 'taken', 'piece', 'shi...",1
4,"['obama', 'wanted', 'liberal', 'amp', 'illegal...",0
...,...,...
13235,"['sometimes', 'get', 'strong', 'vibe', 'people...",1
13236,"['benidorm', '✅', 'creamfields', '✅', 'maga', ...",0
13237,"['report', 'garbage', 'dont', 'give', 'crap']",1
13238,['pussy'],1


In [17]:
df['tweet']

0                    ['ask', 'native', 'american', 'take']
1        ['go', 'home', '’', 'drunk', 'maga', 'trump', ...
2        ['amazon', 'investigating', 'chinese', 'employ...
3        ['someone', 'shouldve', 'taken', 'piece', 'shi...
4        ['obama', 'wanted', 'liberal', 'amp', 'illegal...
                               ...                        
13235    ['sometimes', 'get', 'strong', 'vibe', 'people...
13236    ['benidorm', '✅', 'creamfields', '✅', 'maga', ...
13237        ['report', 'garbage', 'dont', 'give', 'crap']
13238                                            ['pussy']
13239    ['spanishrevenge', 'v', 'justice', 'human', 'r...
Name: tweet, Length: 13240, dtype: object

## Train Test Split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], shuffle=0)

In [20]:
X_train

3532     ['bro', 'twinsie', 'resemblance', 'brother', '...
440      ['believe', 'correct', 'chico', 'wise', 'chihu...
11656    ['agree', 'amp', 'please', 'go', 'work', 'that...
3905     ['doubt', 'greatest', 'female', 'athlete', 'so...
6843     ['sure', 'lot', 'folk', 'arent', 'actually', '...
                               ...                        
7947     ['outrage', 'something', 'credible', 'instead'...
8099     ['name', 'one', 'democratic', 'leader', 'endor...
2387                              ['lmfaoo', '😭', 'bitch']
1675     ['’', 'remember', 'clothes', 'clothes', 'house...
10394    ['thank', 'america', 'respected', 'maga', 'kag...
Name: tweet, Length: 9930, dtype: object

In [21]:
y_train

3532     0
440      0
11656    0
3905     1
6843     0
        ..
7947     1
8099     0
2387     1
1675     1
10394    0
Name: Offensive, Length: 9930, dtype: int64

## Tf-Idf Model

In [25]:
vect = TfidfVectorizer(analyzer='word', stop_words='english')

In [26]:
X_train_vect = vect.fit_transform(X_train)

In [27]:
X_train_vect

<9930x14445 sparse matrix of type '<class 'numpy.float64'>'
	with 89415 stored elements in Compressed Sparse Row format>

In [28]:
vect.get_feature_names()

['aa',
 'aaa',
 'aaah',
 'aaahh',
 'aalayah',
 'aand',
 'aaron',
 'aasertions',
 'ab',
 'ababzhah',
 'aback',
 'abandon',
 'abbott',
 'abc',
 'abcnews',
 'abducted',
 'abetterway',
 'abhorrent',
 'abhorres',
 'abide',
 'abiding',
 'ability',
 'abject',
 'able',
 'ableg',
 'abnormal',
 'aboard',
 'abolish',
 'abolition',
 'abominable',
 'abomination',
 'aborting',
 'abortion',
 'abortionbecause',
 'abortionnot',
 'abortive',
 'abound',
 'abroad',
 'absentee',
 'abso',
 'absofuckinglutely',
 'absolute',
 'absolutecriminals',
 'absolutely',
 'absolutist',
 'absurd',
 'abt',
 'abundance',
 'abundantly',
 'abuse',
 'abused',
 'abuser',
 'abusing',
 'abusive',
 'abusuve',
 'abysmal',
 'ac',
 'aca',
 'academic',
 'academy',
 'acc',
 'accent',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'accepts',
 'access',
 'accessory',
 'accident',
 'accidental',
 'accidentally',
 'accommodate',
 'accommodating',
 'accomplish',
 'accomplished',
 'accomplishment',
 'accord',
 'accord

## Classifying Model

In [29]:
clf = LogisticRegression()

In [30]:
C = list(np.arange(0, 1, 0.05))

C = [float(i) for i in C]

C = C[1:]

C

[0.05,
 0.1,
 0.15000000000000002,
 0.2,
 0.25,
 0.30000000000000004,
 0.35000000000000003,
 0.4,
 0.45,
 0.5,
 0.55,
 0.6000000000000001,
 0.65,
 0.7000000000000001,
 0.75,
 0.8,
 0.8500000000000001,
 0.9,
 0.9500000000000001]

In [31]:
scores = dict()

In [32]:
for i in C:
    clf = LogisticRegression(C= i, max_iter=1000)
    clf.fit(X_train_vect, y_train)
    scores[i] = clf.score(vect.transform(X_test), y_test)
    
scores

{0.05: 0.6758308157099698,
 0.1: 0.6948640483383686,
 0.15000000000000002: 0.7078549848942598,
 0.2: 0.7175226586102719,
 0.25: 0.7259818731117825,
 0.30000000000000004: 0.7317220543806646,
 0.35000000000000003: 0.7383685800604229,
 0.4: 0.7447129909365559,
 0.45: 0.7474320241691843,
 0.5: 0.7501510574018126,
 0.55: 0.7549848942598187,
 0.6000000000000001: 0.7577039274924471,
 0.65: 0.7598187311178247,
 0.7000000000000001: 0.7613293051359517,
 0.75: 0.7619335347432025,
 0.8: 0.7634441087613293,
 0.8500000000000001: 0.7646525679758308,
 0.9: 0.7652567975830815,
 0.9500000000000001: 0.7658610271903323}

In [33]:
best_C = max(scores, key=scores.get)

print(best_C)

0.9500000000000001


In [34]:
clf = LogisticRegression(C=best_C)

In [35]:
clf.fit(X_train_vect, y_train)

LogisticRegression(C=0.9500000000000001)

In [36]:
clf.score(vect.transform(X_test), y_test)

0.7658610271903323

In [37]:
features = np.array(vect.get_feature_names())

In [38]:
coefs = clf.coef_[0].argsort()

In [39]:
print("Smallest Coefs \n{}".format(features[coefs[:10]]))
print("Largest Coefs \n{}".format(features[coefs[-11:-1]]))

Smallest Coefs 
['antifa' 'thank' 'best' 'url' 'beautiful' 'conservative' 'new' 'thanks'
 'funny' 'love']
Largest Coefs 
['racist' 'sick' 'disgusting' 'liar' 'suck' 'idiot' 'bitch' 'fucking'
 'stupid' 'fuck']
