In [7]:
import numpy as np
import re
import math
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample Commentary Dataset
# Each entry is one free-text commentary line. In these samples the
# comma-separated parts read as "<bowler> to <batter>", the delivery type,
# the speed in km/h, and the outcome of the ball.
commentary = [
    "Bumrah to Kohli, Yorker, 145 km/h, smashed for a Four!",
    "Shami to Smith, Bouncer, 140 km/h, single taken.",
    "Starc to Rohit, Full Toss, 147 km/h, Six!",
    "Bumrah to Rohit, Yorker, 145 km/h, defended solidly.",
]

# ------------------ 1. Preprocess Text ------------------
def preprocess(text):
    """Normalize a commentary string and split it into tokens.

    The text is lowercased, then every character other than ``a-z``,
    ``0-9`` or whitespace is deleted (deleted, not replaced by a space, so
    ``"km/h"`` becomes ``"kmh"``), and what remains is split on runs of
    whitespace.

    Returns:
        The list of tokens; an empty or punctuation-only string gives ``[]``.
    """
    stripped = re.sub(r'[^a-z0-9\s]', '', text.lower())
    return stripped.split()

# One token list per commentary line, in the same order as `commentary`.
tokenized_docs = list(map(preprocess, commentary))

# ------------------ 2. Compute Term Frequency (TF) ------------------
def compute_tf(doc):
    """Return the relative term frequency of each token in one document.

    Args:
        doc: A tokenized document (list of tokens).

    Returns:
        dict mapping each distinct token to ``count / len(doc)``. An empty
        document gives an empty dict.
    """
    doc_length = len(doc)
    frequencies = {}
    for term, occurrences in Counter(doc).items():
        frequencies[term] = occurrences / doc_length
    return frequencies

# Per-document term-frequency dicts, aligned with `tokenized_docs`.
tf_documents = list(map(compute_tf, tokenized_docs))

# ------------------ 3. Compute Inverse Document Frequency (IDF) ------------------
def compute_idf(doc_list):
    """Compute the smoothed inverse document frequency of every corpus term.

    Args:
        doc_list: Sized collection of tokenized documents (each an iterable
            of hashable tokens).

    Returns:
        dict mapping each term to ``log((N + 1) / (df + 1)) + 1``, where
        ``N`` is the number of documents and ``df`` is the number of
        documents containing the term. Keys appear in order of first
        occurrence in the corpus. An empty corpus gives an empty dict.
    """
    N = len(doc_list)  # Total number of documents

    # Build document frequencies in one pass over the corpus instead of
    # rescanning every document for every vocabulary word.
    # dict.fromkeys dedupes tokens within a document while keeping order.
    doc_freq = Counter()
    for doc in doc_list:
        doc_freq.update(dict.fromkeys(doc, 1))

    return {
        word: math.log((N + 1) / (doc_count + 1)) + 1  # Smoothed IDF
        for word, doc_count in doc_freq.items()
    }

# Corpus-wide IDF table computed over all tokenized commentary lines.
idf_values = compute_idf(doc_list=tokenized_docs)

# ------------------ 4. Compute TF-IDF ------------------
def compute_tfidf(tf_docs, idf_values):
    """Weight each document's term frequencies by the corpus IDF.

    Args:
        tf_docs: List of per-document dicts mapping term -> TF value.
        idf_values: dict mapping term -> IDF value; it must contain every
            term that occurs in ``tf_docs``.

    Returns:
        A list, aligned with ``tf_docs``, of dicts mapping term -> TF * IDF.

    Raises:
        KeyError: If a term in ``tf_docs`` has no entry in ``idf_values``.
    """
    return [
        {term: weight * idf_values[term] for term, weight in tf_doc.items()}
        for tf_doc in tf_docs
    ]

# Manual TF-IDF scores: one {term: score} dict per commentary line.
tfidf_documents = compute_tfidf(tf_docs=tf_documents, idf_values=idf_values)

# ------------------ 5. Verify Using Scikit-learn ------------------
# Fit scikit-learn's vectorizer on the raw commentary strings with IDF
# smoothing and row normalization both switched off.
# NOTE(review): the resulting matrix is not expected to equal the manual scores:
#   * compute_idf applies the smoothed formula log((N + 1) / (df + 1)) + 1,
#     while smooth_idf=False makes scikit-learn use log(N / df) + 1;
#   * scikit-learn uses raw term counts for TF, compute_tf divides by the
#     document length;
#   * scikit-learn tokenizes the raw text with its own default token pattern
#     rather than preprocess(), so the vocabularies can differ.
vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
X_tfidf = vectorizer.fit_transform(commentary)

# ------------------ Display Outputs ------------------
# Each heading starts with a newline so the sections are separated by a blank line.
for heading, value in (
    ("\n Tokenized Documents:", tokenized_docs),
    ("\n TF Values:", tf_documents),
    ("\n IDF Values:", idf_values),
    ("\n TF-IDF Scores:", tfidf_documents),
    ("\n Feature Names:", vectorizer.get_feature_names_out()),
    ("\n TF-IDF Matrix:\n", X_tfidf.toarray()),
):
    print(heading, value)




 Tokenized Documents: [['bumrah', 'to', 'kohli', 'yorker', '145', 'kmh', 'smashed', 'for', 'a', 'four'], ['shami', 'to', 'smith', 'bouncer', '140', 'kmh', 'single', 'taken'], ['starc', 'to', 'rohit', 'full', 'toss', '147', 'kmh', 'six'], ['bumrah', 'to', 'rohit', 'yorker', '145', 'kmh', 'defended', 'solidly']]

 TF Values: [{'bumrah': 0.1, 'to': 0.1, 'kohli': 0.1, 'yorker': 0.1, '145': 0.1, 'kmh': 0.1, 'smashed': 0.1, 'for': 0.1, 'a': 0.1, 'four': 0.1}, {'shami': 0.125, 'to': 0.125, 'smith': 0.125, 'bouncer': 0.125, '140': 0.125, 'kmh': 0.125, 'single': 0.125, 'taken': 0.125}, {'starc': 0.125, 'to': 0.125, 'rohit': 0.125, 'full': 0.125, 'toss': 0.125, '147': 0.125, 'kmh': 0.125, 'six': 0.125}, {'bumrah': 0.125, 'to': 0.125, 'rohit': 0.125, 'yorker': 0.125, '145': 0.125, 'kmh': 0.125, 'defended': 0.125, 'solidly': 0.125}]

 IDF Values: {'kmh': 1.0, 'to': 1.0, 'taken': 1.916290731874155, 'smashed': 1.916290731874155, 'starc': 1.916290731874155, 'kohli': 1.916290731874155, 'bouncer': 1.9