In [2]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

## Load the documents

In [3]:
# Toy corpus: each string is one raw, untokenized document.
# The first string ends with a trailing space.
documents = [
    "apple apples apple apple a banana an banana ",
    "apple car a car"
]

### Tokenize


In [4]:
# Whitespace tokenization: str.split() with no arguments splits on runs of
# whitespace and ignores leading/trailing blanks, so the trailing space in the
# first document does not produce an empty token.
tokens = list(map(str.split, documents))
print(tokens)

[['apple', 'apples', 'apple', 'apple', 'a', 'banana', 'an', 'banana'], ['apple', 'car', 'a', 'car']]


In [5]:
# Build a single Porter stemmer and reuse it for every token
stemmer = PorterStemmer()

# Stem every token, keeping one inner list per document so the
# document boundaries from `tokens` are preserved
stemmed = [list(map(stemmer.stem, doc)) for doc in tokens]
print(stemmed)

[['appl', 'appl', 'appl', 'appl', 'a', 'banana', 'an', 'banana'], ['appl', 'car', 'a', 'car']]


In [6]:
# Remove stop-words, then stem.
#
# The filter is applied to the raw `tokens` rather than to `stemmed`: NLTK's
# stop-word list contains unstemmed words, and the Porter stemmer rewrites
# several of them (e.g. "this" -> "thi", "was" -> "wa"), so filtering after
# stemming would let those stop-words slip into the bag of words.

# Get the English stop-words as a set for fast membership tests.
# stopwords.words() raises LookupError when the corpus has not been downloaded
# yet (e.g. on a fresh environment); fetch it once and retry in that case.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Bag of words per document: stop-words dropped (case-insensitively),
# remaining tokens stemmed.
bows = [
    [stemmer.stem(token) for token in doc if token.lower() not in stop_words]
    for doc in tokens
]
bows

[['appl', 'appl', 'appl', 'appl', 'banana', 'banana'], ['appl', 'car', 'car']]

## Count how often each word occurs in each document, i.e. Term Frequency (TF)

In [9]:
# Create the collection vocabulary: the set of distinct terms across all
# documents. Unpacking `bows` into union() works for any number of documents
# (including none) instead of being hard-coded to exactly two.
vocabulary = set().union(*bows)
print(vocabulary)

{'car', 'banana', 'appl'}


In [10]:
# One dictionary per document, with every vocabulary term starting at zero.
# Note: despite the "TF" prefix, these dictionaries hold raw term counts.
TFa = {term: 0 for term in vocabulary}
TFb = {term: 0 for term in vocabulary}

print(f'TFa:{TFa}')

TFa:{'car': 0, 'banana': 0, 'appl': 0}


In [12]:
# Fill in the raw term counts: pair each counts dictionary with the bag of
# words of its document, then count every vocabulary term in that bag.
for counts, bag in ((TFa, bows[0]), (TFb, bows[1])):
    for term in vocabulary:
        counts[term] = bag.count(term)

print(f'countsA: {TFa}')
print(f'countsB: {TFb}')

countsA: {'car': 0, 'banana': 2, 'appl': 4}
countsB: {'car': 2, 'banana': 0, 'appl': 1}


In [13]:
# One row per document, one column per vocabulary term (raw counts)
count_rows = [TFa, TFb]
counts_df = pd.DataFrame(count_rows)
counts_df

Unnamed: 0,car,banana,appl
0,0,2,4
1,2,0,1


In [15]:
# Calculate Term Frequencies

# Row sums: total number of counted terms in each document
total_terms_per_doc = counts_df.sum(axis="columns")

# Divide every row by its own total, turning raw counts into relative
# frequencies (each document's row then sums to 1)
TF_df = counts_df.div(total_terms_per_doc, axis="index")
TF_df

Unnamed: 0,car,banana,appl
0,0.0,0.333333,0.666667
1,0.666667,0.0,0.333333


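### Calculate IDF :
$$
IDF(t) = \log \left( \frac{1 + N}{1 + df(t)} \right)
$$

where $N$ is the number of documents and $df(t)$ is the number of documents that contain term $t$.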

In [16]:
# Number of documents = number of rows in the TF table
N = len(TF_df)

# Document frequency: for each term (column), the number of documents
# in which its TF is non-zero
df = TF_df.gt(0).sum(axis="index")

# Smoothed Inverse Document Frequency: log((1 + N) / (1 + df)).
# A term that occurs in every document gets an IDF of 0.
IDF = np.log((N + 1) / (df + 1))
IDF

car       0.405465
banana    0.405465
appl      0.000000
dtype: float64

### Calculate TF-IDF (Term Frequency-Inverse Document Frequency)

$$
TFIDF(t, d) = TF(t, d) \times IDF(t)
$$

In [17]:
# Weight every TF column by the IDF of its term; the IDF Series is matched
# to the DataFrame columns by term label
TF_IDF_df = TF_df.mul(IDF, axis="columns")

# Show the resulting TF-IDF table
TF_IDF_df

Unnamed: 0,car,banana,appl
0,0.0,0.135155,0.0
1,0.27031,0.0,0.0


In [18]:
# Show the un-weighted TF table again (presumably for comparison with the TF-IDF table)
TF_df

Unnamed: 0,car,banana,appl
0,0.0,0.333333,0.666667
1,0.666667,0.0,0.333333


# Represent as TF-IDF with sklearn

In [2]:
# Same two raw documents as in the manual walkthrough, redefined here
# (presumably so this section can run on its own).
documents = [
    "apple apples apple apple a banana an banana ",
    "apple car a car"
]

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use sklearn's built-in English stop-word list; no stemming is applied here,
# so 'apple' and 'apples' end up as separate features.
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights from the corpus, then project the same
# documents into TF-IDF space (the two-step equivalent of fit_transform).
vectorizer.fit(documents)
TF_IDF_vectorized = vectorizer.transform(documents)

# The result is a sparse matrix; densify it for display
TF_IDF_vectorized.toarray()

array([[0.69049203, 0.32348748, 0.64697497, 0.        ],
       [0.33517574, 0.        , 0.        , 0.94215562]])

In [6]:
# Mapping from each learned term to its column index in the TF-IDF matrix
vectorizer.vocabulary_

{'apple': 0, 'apples': 1, 'banana': 2, 'car': 3}