In [1]:
# bag of words - represents each document as a vector of per-word counts over the corpus vocabulary
import pandas as pd
import numpy as np 
import nltk 

from nltk import word_tokenize

In [2]:
# Tokenizer data needed by nltk's word_tokenize (used further down).
# Older NLTK releases load the pickled 'punkt' models, while newer releases
# (3.9 and later) look up 'punkt_tab' instead. Fetch both so the notebook
# survives Restart & Run All on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the corpus; the preview below shows a 'text' and a 'labels' column.
df = pd.read_csv('bbc_text_cls.csv')

df.head()

[nltk_data] Downloading package punkt to /Users/faa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [4]:
# Encode every document as a list of integer word ids.
# Ids are 0-based and handed out in order of first appearance in the corpus.
idx = 0             # next unused id; equals the vocabulary size after the loop
word2idx = {}       # token -> id
tokenized_docs = [] # one list of ids per document, in df row order
for doc in df['text']:
    # lowercase before tokenizing so "The" and "the" share one id
    words = word_tokenize(doc.lower())
    doc_as_int = []
    for word in words:
        word_id = word2idx.get(word)
        if word_id is None:
            # first time this token is seen: register it under the next id
            word_id = word2idx[word] = idx
            idx += 1
        doc_as_int.append(word_id)

    tokenized_docs.append(doc_as_int)

In [5]:
# id -> token lookup (inverse of word2idx)
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

# N = number of documents (rows of df), V = number of distinct tokens
N, V = len(df['text']), len(word2idx)

# term-frequency matrix: tf[d, w] = how many times word id w occurs in document d
tf = np.zeros((N, V))

for doc_idx, word_ids in enumerate(tokenized_docs):
    doc_counts = tf[doc_idx]  # view of this document's row, so writes land in tf
    for word_id in word_ids:
        doc_counts[word_id] += 1


In [6]:
# Document frequency: for each word (column of tf), the number of documents
# in which it occurs at least once.
document_freq = (tf > 0).sum(axis=0)

# Inverse document frequency, one value per word -> shape (V,).
# Every vocabulary word was taken from some document, so document_freq >= 1
# and the division is safe.
idf = np.log(N / document_freq)

# tf-idf: idf broadcasts across the rows of tf, giving an (N, V) matrix
tf_idf = np.multiply(tf, idf)

In [10]:
# Fix the seed so the same document is picked on every run
np.random.seed(123)

# Pick one document at random and list its 5 highest-scoring tf-idf terms
i = np.random.choice(N)
row = df.iloc[i]

# Only the text before the first newline is shown (the headline in this dataset)
headline, *_ = row['text'].split('\n', 1)

print('Label: ', row['labels'])
print('Text: ', headline)
print('Top 5 terms: ')

scores = tf_idf[i]
# Sorting the negated scores ascending yields word ids in descending score order
indices = np.argsort(-scores)

top_ids = indices[:5]
for term_id, term_score in zip(top_ids, scores[top_ids]):
    print(idx2word[term_id], term_score)

# A high tf-idf score means the term occurs often in this document
# while appearing in few documents across the corpus.

Label:  politics
Text:  Blair prepares to name poll date
Top 5 terms: 
parliament 15.821087061651685
election 14.077320173345495
easter 13.217799811864461
dissolve 13.217799811864461
blair 12.572776718550651
