# TF-IDF FROM SCRATCH

#### Steps:
* 1-) Word to index mapping
* 2-) Count Vectorizer (TF)
* 3-) IDF
* 4-) TF-IDF

In [49]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize

In [50]:
df = pd.read_csv(r'data\bbc_text_cls.csv')
print(df.shape)
df.head(1)

(2225, 2)


Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business


# 1-) Word to index mapping

#### pseudo-code
* Create a giant string by combining every data
* normalize them
* tokenize them
* use list(set(list())) and get unique elements map.

In [51]:
# 1-Normalization
#lower case
df['text'] = df['text'].str.lower()
#next line symbol
df['text'] = df['text'].str.replace(r'\n', ' ', regex=True)
#punctation
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
#numbers if exists
df['text'] = df['text'].str.replace(r'\d', '', regex=True)

In [52]:
#2-create a giant string

all_data_combined = ''

for text in df['text']:
    all_data_combined += text

print(len(all_data_combined))
print(all_data_combined[10:50])

4852665
oost time warner profit  quarterly profi


In [53]:
# 3-Tokenization
all_data_combined_list = all_data_combined.split()
all_tokens = list(set(all_data_combined_list)) #unique tokens

print(len(all_tokens))
all_tokens[:10]

33517


['nflstyle',
 'removing',
 'insurgency',
 'wayasian',
 'charitys',
 'froze',
 'refuses',
 'tatton',
 'picasa',
 'sienna']

In [54]:
# 4-token into dictionary

word_index_map = {}

for index, token in enumerate(all_tokens):
    word_index_map[token] = index

print(list(word_index_map.items())[0:3])

[('nflstyle', 0), ('removing', 1), ('insurgency', 2)]


# 2-) Count Vectorizer
* create data_count X len(all_token) sized zero matrix  
* +1 if the word exists for each layer.

In [55]:
len(df)

2225

In [56]:
df['text'] = df['text'].apply(lambda x : x.split())

In [57]:
tf = np.zeros((len(df['text']),len(all_tokens))) #zero matrix with list for easy use.

tf.shape

(2225, 33517)

In [58]:
i = 0
for text in df['text']:
    for token in text:
        if token in word_index_map:
            #word_index_map[token] #get index number
            tf[i][word_index_map[token]] += 1
    i += 1

In [65]:
for i in range(len(tf)):
    for val in tf[i]:
        if val > 100:
            print(val)

204.0
169.0
245.0
143.0
124.0
129.0
190.0
106.0
145.0
105.0
113.0
133.0
112.0


# 3-) IDF

 $IDF(t) = \log\left(\frac{N}{N(t)}\right)$ 

In [69]:
N = len(df)
N_t = (tf > 0).sum(axis=0)
N_t

array([ 1,  8,  1, ..., 15,  1,  1], shape=(33517,))

In [72]:
import math

idf = []

for n_t in N_t:
    if n_t != 0:
        idf.append(math.log(N/n_t))
    else:
        idf.append(0)

# 4-) TF-IDF

$$TF\text{-}IDF = TF \times IDF$$

In [None]:
tf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(2225, 33517))

In [79]:
print(len(idf))
idf[:10]

33517


[7.707512194600341,
 5.6280706529205045,
 7.707512194600341,
 0,
 6.608899905932231,
 7.014365014040395,
 5.5102876172641215,
 7.707512194600341,
 7.707512194600341,
 7.707512194600341]

In [80]:
tf[0]

array([0., 0., 0., ..., 0., 0., 0.], shape=(33517,))

In [81]:
tf[0][0]

np.float64(0.0)

In [82]:
tf[0][0] / idf[0]

np.float64(0.0)

In [86]:
len(tf_idf)

2225

In [103]:
a = np.array([1,2,3])
b = np.array([1,2,3])

a*b

array([1, 4, 9])

In [101]:
tf_idf = tf * idf

In [102]:
tf_idf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(2225, 33517))

In [106]:
for val in tf_idf[0]:
    if val > 10:
        print(val)

15.42768851141641
53.952585362202385
10.285125674277607
10.479663437471826
11.327059685628045
16.383476978785133
24.997309967490654
15.415024389200681
12.6424356669609
17.589287262869576
15.415024389200681
12.19614856433248
