In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords')
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cybersaksham/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cybersaksham/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
stop_words = set(stopwords.words('english'))
documents = {}
for i in range(1, 6):
    file_name = f"document{i}.txt"
    with open(f"data/{file_name}", 'r') as file:
        documents[file_name] = file.read()

In [4]:
def tokenize(sentence):
    words = sentence.split()
    tokens = [''.join(c for c in word if c.isalnum()) for word in words]
    return [token for token in tokens if token]

In [5]:
def remove_stopwords(document):
    words = tokenize(document)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return filtered_words

In [6]:
# StopWords
processed_documents = {}
for doc in documents.keys():
    result = remove_stopwords(documents[doc])
    print(f"Preprocessed words from {doc}: ", end="")
    print(result)
    processed_documents[doc] = result

Preprocessed words from document1.txt: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
Preprocessed words from document2.txt: ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit', 'Sed', 'eget', 'suscipit', 'quam', 'Duis', 'cursus', 'volutpat', 'erat', 'feugiat', 'est', 'pretium', 'id', 'Ut', 'eget', 'eleifend', 'velit']
Preprocessed words from document3.txt: ['Python', 'popular', 'programming', 'language', 'used', 'various', 'purposes', 'web', 'development', 'data', 'analysis', 'machine', 'learning']
Preprocessed words from document4.txt: ['computer', 'science', 'algorithm', 'stepbystep', 'procedure', 'solve', 'problem', 'implemented', 'programming', 'languages', 'like', 'Python', 'Java', 'C']
Preprocessed words from document5.txt: ['internet', 'global', 'network', 'connects', 'millions', 'computers', 'devices', 'across', 'world', 'allows', 'users', 'access', 'information', 'communicate', 'share', 'data']


In [7]:
words_set = set()
for doc in processed_documents.keys():
    words_set = words_set.union(set(processed_documents[doc]))
words_set

{'C',
 'Duis',
 'Java',
 'Lorem',
 'Python',
 'Sed',
 'Ut',
 'access',
 'across',
 'adipiscing',
 'algorithm',
 'allows',
 'amet',
 'analysis',
 'brown',
 'communicate',
 'computer',
 'computers',
 'connects',
 'consectetur',
 'cursus',
 'data',
 'development',
 'devices',
 'dog',
 'dolor',
 'eget',
 'eleifend',
 'elit',
 'erat',
 'est',
 'feugiat',
 'fox',
 'global',
 'id',
 'implemented',
 'information',
 'internet',
 'ipsum',
 'jumps',
 'language',
 'languages',
 'lazy',
 'learning',
 'like',
 'machine',
 'millions',
 'network',
 'popular',
 'pretium',
 'problem',
 'procedure',
 'programming',
 'purposes',
 'quam',
 'quick',
 'science',
 'share',
 'sit',
 'solve',
 'stepbystep',
 'suscipit',
 'used',
 'users',
 'various',
 'velit',
 'volutpat',
 'web',
 'world'}

In [8]:
# Term Frequency Matrix
def term_frequency_matrix(docs):
    tf = pd.DataFrame(
        np.zeros((len(docs), len(words_set))),
        columns=list(words_set), index=docs
    )
    for doc in docs.keys():
        for word in docs[doc]:
            tf[word][doc] = tf[word][doc] + 1 / len(docs[doc])
    return tf

term_frequency_matrix(processed_documents)

Unnamed: 0,connects,data,languages,Duis,learning,stepbystep,access,quam,across,brown,...,information,allows,consectetur,solve,devices,implemented,elit,sit,problem,cursus
document1.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
document2.txt,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.041667,0.0,0.0,...,0.0,0.0,0.041667,0.0,0.0,0.0,0.041667,0.041667,0.0,0.041667
document3.txt,0.0,0.076923,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
document4.txt,0.0,0.0,0.071429,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.071429,0.0,0.071429,0.0,0.0,0.071429,0.0
document5.txt,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0625,0.0,0.0625,0.0,...,0.0625,0.0625,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0


In [9]:
# Inverse Document Frequency Matrix
def idf_matrix(docs):
    idf = pd.DataFrame(
        np.zeros((len(words_set), 1)),
        columns=["IDF"], index=list(words_set)
    )
    for word in words_set:
        k = 0
        for doc in docs.keys():
            if word in docs[doc]:
                k += 1
        idf["IDF"][word] = np.log10(len(docs.keys()) / k)
    return idf

idf_matrix(processed_documents)

Unnamed: 0,IDF
connects,0.69897
data,0.39794
languages,0.69897
Duis,0.69897
learning,0.69897
...,...
implemented,0.69897
elit,0.69897
sit,0.69897
problem,0.69897


In [10]:
# TF IDF Matrix
def tf_idf_matrix(docs):
    tfidf = term_frequency_matrix(docs)
    idf = idf_matrix(docs)
    for w in words_set:
        for doc in docs.keys():
            tfidf[w][doc] *= idf["IDF"][w]
    return tfidf

tf_idf_matrix(processed_documents)

Unnamed: 0,connects,data,languages,Duis,learning,stepbystep,access,quam,across,brown,...,information,allows,consectetur,solve,devices,implemented,elit,sit,problem,cursus
document1.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116495,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
document2.txt,0.0,0.0,0.0,0.029124,0.0,0.0,0.0,0.029124,0.0,0.0,...,0.0,0.0,0.029124,0.0,0.0,0.0,0.029124,0.029124,0.0,0.029124
document3.txt,0.0,0.030611,0.0,0.0,0.053767,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
document4.txt,0.0,0.0,0.049926,0.0,0.0,0.049926,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.049926,0.0,0.049926,0.0,0.0,0.049926,0.0
document5.txt,0.043686,0.024871,0.0,0.0,0.0,0.0,0.043686,0.0,0.043686,0.0,...,0.043686,0.043686,0.0,0.0,0.043686,0.0,0.0,0.0,0.0,0.0
