## TF-IDF


In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np

# Toy corpus: four short sentences used for every table below.
documents = [
    "TF-IDF helps identify important words in a document.",
    "Text classification is a popular NLP application.",
    "Word embeddings improve semantic understanding.",
    "TF-IDF is a key technique in text analysis.",
]

# --- Raw term counts -------------------------------------------------------
# With the default tokenizer the hyphenated term "TF-IDF" ends up as the two
# separate features "tf" and "idf" (see the column headers in the output).
count_vectorizer = CountVectorizer()
raw_count_matrix = count_vectorizer.fit_transform(documents)
feature_names = count_vectorizer.get_feature_names_out()

# --- Term frequency --------------------------------------------------------
# Each row is divided by that document's total token count, so every row of
# `normalised_counts` sums to 1.
raw_count_array = raw_count_matrix.toarray()
tokens_per_document = raw_count_array.sum(axis=1, keepdims=True)
normalised_counts = raw_count_array / tokens_per_document

# --- TF-IDF ----------------------------------------------------------------
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# --- Tabular views, rounded to two decimals for readability -----------------
raw_count_df = pd.DataFrame(data=raw_count_array, columns=feature_names).round(2)
normalised_count_df = pd.DataFrame(data=normalised_counts, columns=feature_names).round(2)
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
).round(2)

# --- Display settings: never truncate columns or rows, use a wide console ---
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 1000),
    ("display.max_rows", None),
):
    pd.set_option(option_name, option_value)

# --- Report: heading followed by the complete table, without the index ------
for heading, table in (
    ("Raw Counts:", raw_count_df),
    ("\nNormalised Counts:", normalised_count_df),
    ("\nTF-IDF Matrix:", tfidf_df),
):
    print(heading)
    print(table.to_string(index=False))


Raw Counts:
 analysis  application  classification  document  embeddings  helps  identify  idf  important  improve  in  is  key  nlp  popular  semantic  technique  text  tf  understanding  word  words
        0            0               0         1           0      1         1    1          1        0   1   0    0    0        0         0          0     0   1              0     0      1
        0            1               1         0           0      0         0    0          0        0   0   1    0    1        1         0          0     1   0              0     0      0
        0            0               0         0           1      0         0    0          0        1   0   0    0    0        0         1          0     0   0              1     1      0
        1            0               0         0           0      0         0    1          0        0   1   1    1    0        0         0          1     1   1              0     0      0

Normalised Counts:
 analysis  application 

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np

# Sample text documents
documents = [
    "TF-IDF helps identify important words in a document.",
    "Text classification is a popular NLP application.",
    "Word embeddings improve semantic understanding.",
    "TF-IDF is a key technique in text analysis."
]

# Token pattern shared by both vectorizers so they always agree on the vocabulary.
# A token is a run of at least two word characters in which single hyphens may
# join the parts, so "TF-IDF" stays one feature ("tf-idf") and compounds with
# several hyphens (e.g. "state-of-the-art") are kept whole as well.
# Stand-alone single characters such as "a" are still dropped.
TOKEN_PATTERN = r"\b\w(?:\w|-\w)+\b"

# Compute raw counts using CountVectorizer with the custom token pattern
count_vectorizer = CountVectorizer(token_pattern=TOKEN_PATTERN)
raw_count_matrix = count_vectorizer.fit_transform(documents)

# Normalise raw counts (divide each word count by the total word count in the document)
raw_count_array = raw_count_matrix.toarray()  # Convert sparse matrix to dense NumPy array
row_totals = raw_count_array.sum(axis=1, keepdims=True)
# A document without any token has a total of 0; dividing by 1 instead keeps its
# row at all zeros rather than producing NaN from 0 / 0.
safe_row_totals = np.where(row_totals == 0, 1, row_totals)
normalised_counts = raw_count_array / safe_row_totals

# Convert raw counts and normalised counts to DataFrames (rounded for display)
feature_names = count_vectorizer.get_feature_names_out()
raw_count_df = pd.DataFrame(raw_count_array, columns=feature_names).round(2)
normalised_count_df = pd.DataFrame(normalised_counts, columns=feature_names).round(2)

# Compute TF-IDF values using TfidfVectorizer with the same token pattern
tfidf_vectorizer = TfidfVectorizer(token_pattern=TOKEN_PATTERN)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out()).round(2)

# Ensure all columns and rows are displayed in full when printing
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)  # Set the display width to prevent line breaks
pd.set_option("display.max_rows", None)

# Display results: each table is printed in full, without the row index
print("Raw Counts:")
print(raw_count_df.to_string(index=False))

print("\nNormalised Counts:")
print(normalised_count_df.to_string(index=False))

print("\nTF-IDF Matrix:")
print(tfidf_df.to_string(index=False))


Raw Counts:
 analysis  application  classification  document  embeddings  helps  identify  important  improve  in  is  key  nlp  popular  semantic  technique  text  tf-idf  understanding  word  words
        0            0               0         1           0      1         1          1        0   1   0    0    0        0         0          0     0       1              0     0      1
        0            1               1         0           0      0         0          0        0   0   1    0    1        1         0          0     1       0              0     0      0
        0            0               0         0           1      0         0          0        1   0   0    0    0        0         1          0     0       0              1     1      0
        1            0               0         0           0      0         0          0        0   1   1    1    0        0         0          1     1       1              0     0      0

Normalised Counts:
 analysis  application  clas

In [3]:
question = "Which document mentions the importance of TF-IDF?"

# Vectorise the question with the vectorizer that was already fitted on the
# documents, so the question and the documents share one feature space.
tfidf_question = tfidf_vectorizer.transform([question])

# Compute cosine similarity between the question and each document.
# A plain dot product is enough as long as the vectorizer keeps its default
# norm="l2": every row is then unit length, so the dot product is the cosine.
cosine_similarities = (tfidf_matrix @ tfidf_question.T).toarray().flatten()

# Identify the document with the highest similarity
most_relevant_index = np.argmax(cosine_similarities)
most_relevant_document = documents[most_relevant_index]
best_score = cosine_similarities[most_relevant_index]

# Display Results
print(f"Question: {question}\n")

# Unrounded TF-IDF table, rebuilt here for inspection (it is not printed).
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print("\nCosine Similarities:")
for i, score in enumerate(cosine_similarities):
    print(f"Document {i + 1}: {score:.4f}")

# np.argmax returns index 0 when every score is 0, which would wrongly present
# the first document as the answer; only announce a winner on a real match.
if best_score > 0:
    print(f"\nThe document that best answers the question is:\n'{most_relevant_document}'")
else:
    print("\nNo document shares any vocabulary with the question, so no best answer can be identified.")



Question: Which document mentions the importance of TF-IDF?


Cosine Similarities:
Document 1: 0.5096
Document 2: 0.0000
Document 3: 0.0000
Document 4: 0.2084

The document that best answers the question is:
'TF-IDF helps identify important words in a document.'
