# Step 1: Prepare Corpus for TF-IDF

In this step, we will:

1. Collect documents.
2. Convert text to lowercase.
3. Remove punctuation.
4. Tokenize the text into words.


In [1]:
# Import necessary libraries
import string
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus (replace with your dataset later)
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = text.split()
    return tokens

# Preprocess the corpus
preprocessed_corpus = [preprocess_text(doc) for doc in corpus]

# Display the preprocessed corpus
print("Preprocessed Corpus:", preprocessed_corpus)

Preprocessed Corpus: [['this', 'is', 'the', 'first', 'document'], ['this', 'document', 'is', 'the', 'second', 'document'], ['and', 'this', 'is', 'the', 'third', 'one'], ['is', 'this', 'the', 'first', 'document']]


# Step 2: Compute Term Frequency (TF)

In this step, we will calculate the term frequency for each word in each document. Term frequency measures how often a word appears in a document, normalized by the total number of words in that document.


In [2]:
# Compute Term Frequency (TF)
def compute_tf(preprocessed_corpus):
    tf_list = []
    for doc in preprocessed_corpus:
        tf_dict = {}
        total_terms = len(doc)
        for term in doc:
            tf_dict[term] = tf_dict.get(term, 0) + 1
        # Normalize by total terms in the document
        for term in tf_dict:
            tf_dict[term] /= total_terms
        tf_list.append(tf_dict)
    return tf_list

# Calculate TF for the preprocessed corpus
tf_scores = compute_tf(preprocessed_corpus)

# Display the TF scores
for i, tf in enumerate(tf_scores):
    print(f"Document {i+1} TF:", tf)

Document 1 TF: {'this': 0.2, 'is': 0.2, 'the': 0.2, 'first': 0.2, 'document': 0.2}
Document 2 TF: {'this': 0.16666666666666666, 'document': 0.3333333333333333, 'is': 0.16666666666666666, 'the': 0.16666666666666666, 'second': 0.16666666666666666}
Document 3 TF: {'and': 0.16666666666666666, 'this': 0.16666666666666666, 'is': 0.16666666666666666, 'the': 0.16666666666666666, 'third': 0.16666666666666666, 'one': 0.16666666666666666}
Document 4 TF: {'is': 0.2, 'this': 0.2, 'the': 0.2, 'first': 0.2, 'document': 0.2}


# Step 3: Compute Inverse Document Frequency (IDF)

In this step, we will calculate the inverse document frequency for each word. IDF measures how important a word is by considering how many documents contain the word. Words that appear in many documents will have a lower IDF score.


In [3]:
# Compute Inverse Document Frequency (IDF)
import math

def compute_idf(preprocessed_corpus):
    idf_dict = {}
    total_docs = len(preprocessed_corpus)
    for doc in preprocessed_corpus:
        for term in set(doc):
            idf_dict[term] = idf_dict.get(term, 0) + 1
    # Calculate IDF for each term
    for term in idf_dict:
        idf_dict[term] = math.log(total_docs / idf_dict[term])
    return idf_dict

# Calculate IDF for the preprocessed corpus
idf_scores = compute_idf(preprocessed_corpus)

# Display the IDF scores
print("IDF Scores:", idf_scores)

IDF Scores: {'document': 0.28768207245178085, 'is': 0.0, 'this': 0.0, 'the': 0.0, 'first': 0.6931471805599453, 'second': 1.3862943611198906, 'and': 1.3862943611198906, 'one': 1.3862943611198906, 'third': 1.3862943611198906}


# Step 4: Compute TF-IDF Scores

In this step, we will calculate the TF-IDF scores for each word in each document. TF-IDF is computed by multiplying the term frequency (TF) of a word by its inverse document frequency (IDF).


In [4]:
# Compute TF-IDF Scores
def compute_tfidf(tf_scores, idf_scores):
    tfidf_list = []
    for doc_tf in tf_scores:
        tfidf_dict = {}
        for term, tf in doc_tf.items():
            tfidf_dict[term] = tf * idf_scores.get(term, 0)
        tfidf_list.append(tfidf_dict)
    return tfidf_list

# Calculate TF-IDF scores
tfidf_scores = compute_tfidf(tf_scores, idf_scores)

# Display the TF-IDF scores
for i, tfidf in enumerate(tfidf_scores):
    print(f"Document {i+1} TF-IDF:", tfidf)

Document 1 TF-IDF: {'this': 0.0, 'is': 0.0, 'the': 0.0, 'first': 0.13862943611198905, 'document': 0.05753641449035617}
Document 2 TF-IDF: {'this': 0.0, 'document': 0.09589402415059362, 'is': 0.0, 'the': 0.0, 'second': 0.23104906018664842}
Document 3 TF-IDF: {'and': 0.23104906018664842, 'this': 0.0, 'is': 0.0, 'the': 0.0, 'third': 0.23104906018664842, 'one': 0.23104906018664842}
Document 4 TF-IDF: {'is': 0.0, 'this': 0.0, 'the': 0.0, 'first': 0.13862943611198905, 'document': 0.05753641449035617}


# Step 5: Create TF-IDF Matrix

In this step, we will create a TF-IDF matrix where rows represent documents, columns represent terms, and the values are the TF-IDF scores. This matrix is useful for various text analysis tasks such as clustering and classification.


In [7]:
# Create TF-IDF Matrix
import pandas as pd

def create_tfidf_matrix(tfidf_scores):
    # Get all unique terms
    terms = set(term for doc in tfidf_scores for term in doc)
    # Create a DataFrame
    tfidf_matrix = pd.DataFrame(0, index=range(len(tfidf_scores)), columns=terms)
    for i, doc in enumerate(tfidf_scores):
        for term, score in doc.items():
            tfidf_matrix.at[i, term] = score
    return tfidf_matrix

# Generate the TF-IDF matrix
tfidf_matrix = create_tfidf_matrix(tfidf_scores)

# Display the TF-IDF matrix
print("TF-IDF Matrix:")
print(tfidf_matrix)

ValueError: columns cannot be a set

In [6]:
# Fix: Convert set to list for DataFrame columns
def create_tfidf_matrix(tfidf_scores):
    # Get all unique terms
    terms = list(set(term for doc in tfidf_scores for term in doc))  # Convert set to list
    # Create a DataFrame
    tfidf_matrix = pd.DataFrame(0, index=range(len(tfidf_scores)), columns=terms)
    for i, doc in enumerate(tfidf_scores):
        for term, score in doc.items():
            tfidf_matrix.at[i, term] = score
    return tfidf_matrix

# Regenerate the TF-IDF matrix with the fix
tfidf_matrix = create_tfidf_matrix(tfidf_scores)

# Display the corrected TF-IDF matrix
print("Corrected TF-IDF Matrix:")
print(tfidf_matrix)

Corrected TF-IDF Matrix:
   document    second  is  this  the       and       one     third     first
0  0.057536  0.000000   0     0    0  0.000000  0.000000  0.000000  0.138629
1  0.095894  0.231049   0     0    0  0.000000  0.000000  0.000000  0.000000
2  0.000000  0.000000   0     0    0  0.231049  0.231049  0.231049  0.000000
3  0.057536  0.000000   0     0    0  0.000000  0.000000  0.000000  0.138629


  tfidf_matrix.at[i, term] = score
  tfidf_matrix.at[i, term] = score
  tfidf_matrix.at[i, term] = score
  tfidf_matrix.at[i, term] = score
  tfidf_matrix.at[i, term] = score
  tfidf_matrix.at[i, term] = score


# Step 6: Explore TF-IDF Results

In this step, we will analyze the TF-IDF matrix to identify important and unique words in each document. High TF-IDF scores indicate terms that are important to a specific document, while low scores indicate common terms across documents.


In [8]:
# Analyze TF-IDF Results

def explore_tfidf_results(tfidf_matrix):
    # Find the top terms for each document
    for i, row in tfidf_matrix.iterrows():
        top_terms = row.sort_values(ascending=False).head(5)
        print(f"Document {i+1} Top Terms:")
        print(top_terms)
        print("\n")

# Explore the TF-IDF matrix
explore_tfidf_results(tfidf_matrix)

Document 1 Top Terms:
first       0.138629
document    0.057536
second      0.000000
is          0.000000
this        0.000000
Name: 0, dtype: float64


Document 2 Top Terms:
second      0.231049
document    0.095894
is          0.000000
this        0.000000
the         0.000000
Name: 1, dtype: float64


Document 3 Top Terms:
and         0.231049
one         0.231049
third       0.231049
document    0.000000
second      0.000000
Name: 2, dtype: float64


Document 4 Top Terms:
first       0.138629
document    0.057536
second      0.000000
is          0.000000
this        0.000000
Name: 3, dtype: float64




# Step 7: Download and Preprocess Dataset from UCI Sentiment Labelled Sentences

In this step, we will download the dataset from the UCI repository, load it into a DataFrame, and preprocess the text data to prepare it for TF-IDF analysis.


In [25]:
# Download and preprocess the UCI Sentiment Labelled Sentences dataset
import os
import pandas as pd
import requests

# URL of the dataset
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment labelled sentences.zip"

# Download the dataset
response = requests.get(dataset_url)
with open("sentiment_labelled_sentences.zip", "wb") as f:
    f.write(response.content)

# Extract the dataset
import zipfile
with zipfile.ZipFile("sentiment_labelled_sentences.zip", 'r') as zip_ref:
    zip_ref.extractall("sentiment_labelled_sentences")

# Load the dataset (example: using the 'imdb_labelled.txt' file)
# List all files in the extracted directory
extracted_files = os.listdir("sentiment_labelled_sentences")
print("Extracted Files:", extracted_files)

# Correct the dataset path if necessary 
if "imdb_labelled.txt" not in extracted_files:
    print("Error: 'imdb_labelled.txt' not found in the extracted files.")
else:
    dataset_path = "sentiment_labelled_sentences/imdb_labelled.txt"
    data = pd.read_csv(dataset_path, sep="\t", header=None, names=["sentence", "label"])
    print("Dataset successfully loaded.")

Extracted Files: ['sentiment labelled sentences', '__MACOSX']
Error: 'imdb_labelled.txt' not found in the extracted files.
