In [1]:
import nltk
import re
import heapq
import numpy as np
import pandas as pd

In [2]:
from content import paragraph

### Get the 100 most frequent words

In [3]:
# Split the text into sentences, then normalize each one: lowercase it,
# replace every non-word character with a space, and collapse runs of
# whitespace into a single space.
dataset = [
    re.sub(r'\s+', ' ', re.sub(r'\W', ' ', sentence.lower()))
    for sentence in nltk.sent_tokenize(paragraph)
]

In [4]:
# Count how many times each token occurs across all cleaned sentences.
word2count = {}
for sentence in dataset:
    for token in nltk.word_tokenize(sentence):
        word2count[token] = word2count.get(token, 0) + 1

In [5]:
# Vocabulary: the 100 tokens with the highest counts, most frequent first.
freq_words = heapq.nlargest(100, word2count.keys(), key=lambda token: word2count[token])

### Let's build the TF-IDF model

1. TF (Term Frequency) = Occurrences of a word in a document / Total words in that document (here, each sentence is treated as one document)

In [6]:
# Tokenize every sentence once up front: the token lists do not depend on the
# vocabulary word, so tokenizing inside the word loop repeats the same work
# len(freq_words) times.
tokenized_sentences = [nltk.word_tokenize(data) for data in dataset]

# tf_matrix maps each vocabulary word to a list holding one TF value per
# sentence, in the same order as `dataset`.
tf_matrix = {}
for word in freq_words:
    sen_tf = []
    for words in tokenized_sentences:
        # A sentence that tokenizes to an empty list (e.g. one that contained
        # only punctuation before cleaning) would otherwise raise
        # ZeroDivisionError; the word cannot occur in it, so its TF is 0.0.
        tf_word = words.count(word) / len(words) if words else 0.0
        sen_tf.append(tf_word)
    tf_matrix[word] = sen_tf

In [8]:
# Inspect the per-sentence TF values stored for the token 'the'.
tf_matrix['the']

[0.0,
 0.2,
 0.0,
 0.1,
 0.2,
 0.0,
 0.0,
 0.0,
 0.043478260869565216,
 0.0,
 0.1,
 0.06666666666666667,
 0.05263157894736842,
 0.0,
 0.05,
 0.10638297872340426,
 0.045454545454545456,
 0.0,
 0.0,
 0.0,
 0.0]

2. IDF (Inverse Document Frequency) = log(Documents / (1 + Documents containing the word)), where the 1 in the denominator is a smoothing term

In [9]:
# Tokenize each sentence once and keep only its distinct tokens. The set makes
# each membership test cheap and avoids re-tokenizing every sentence for every
# vocabulary word.
sentence_tokens = [set(nltk.word_tokenize(data)) for data in dataset]

# word_idfs maps each vocabulary word to its smoothed IDF score.
word_idfs = {}
for word in freq_words:
    # Number of sentences that contain the word at least once.
    sen_count = sum(word in tokens for tokens in sentence_tokens)
    # The 1 added to the denominator is a smoothing term. NOTE: with this
    # variant a word that appears in every sentence gets a slightly negative IDF.
    word_idfs[word] = np.log(len(dataset)/(1 + sen_count))

In [10]:
# Display the IDF score computed for every vocabulary word.
word_idfs

{'the': 0.6466271649250525,
 'to': 0.5596157879354227,
 'you': 0.7419373447293773,
 'of': 1.0986122886681098,
 'for': 1.0986122886681098,
 'this': 0.7419373447293773,
 'thank': 0.8472978603872037,
 'and': 0.965080896043587,
 'i': 1.0986122886681098,
 'my': 1.4350845252893227,
 'all': 1.4350845252893227,
 'in': 1.6582280766035324,
 'be': 1.4350845252893227,
 'who': 1.9459101490553132,
 'world': 1.6582280766035324,
 'very': 1.6582280766035324,
 'have': 1.6582280766035324,
 'by': 1.6582280766035324,
 'we': 1.6582280766035324,
 'our': 1.6582280766035324,
 'is': 1.9459101490553132,
 'not': 1.6582280766035324,
 'people': 1.9459101490553132,
 'out': 1.9459101490553132,
 'so': 1.9459101490553132,
 'much': 1.9459101490553132,
 'year': 1.9459101490553132,
 'revenant': 1.9459101490553132,
 'was': 1.9459101490553132,
 'off': 1.9459101490553132,
 'tom': 1.9459101490553132,
 'your': 2.3513752571634776,
 'screen': 2.3513752571634776,
 'a': 1.9459101490553132,
 'entire': 1.9459101490553132,
 'would': 

3. TF * IDF calculation

In [11]:
# One row per vocabulary word (in tf_matrix insertion order); each row holds
# that word's TF value for every sentence multiplied by the word's IDF.
tfidf_matrix = [
    [tf_value * word_idfs[term] for tf_value in tf_values]
    for term, tf_values in tf_matrix.items()
]

In [12]:
# Convert the nested list (one inner list per vocabulary word) to a NumPy array.
X = np.asarray(tfidf_matrix)

In [13]:
# Sanity check: expect (number of vocabulary words, number of sentences).
X.shape

(100, 21)

In [14]:
# Transpose so that each row is a sentence and each column a vocabulary word.
df = pd.DataFrame(data=X.T, columns=freq_words)

In [15]:
# Preview the TF-IDF scores for the first few sentences.
df.head()

Unnamed: 0,the,to,you,of,for,this,thank,and,i,my,...,parents,none,possible,without,friends,love,dearly,know,are,lastly
0,0.0,0.0,0.123656,0.0,0.0,0.0,0.141216,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.129325,0.111923,0.148387,0.0,0.0,0.0,0.16946,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.06218,0.164875,0.122068,0.0,0.082437,0.094144,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.064663,0.055962,0.0,0.0,0.0,0.074194,0.0,0.0,0.109861,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.129325,0.0,0.0,0.146482,0.0,0.0,0.0,0.064339,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
