In [4]:
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

data = ["He is William",
        "He is Peter",
        "He isn't Antony who study Mathematics"]

# fit and tranform your data
vectorizer = TfidfVectorizer()
vectorized_matrix = vectorizer.fit_transform(data)
vectorized_tokens = vectorizer.get_feature_names_out(data)

print(vectorized_tokens)
print("")
print(vectorized_matrix)
print("")
print(vectorizer.get_stop_words())


['antony' 'he' 'is' 'isn' 'mathematics' 'peter' 'study' 'who' 'william']

  (0, 8)	0.7203334490549893
  (0, 2)	0.5478321549274363
  (0, 1)	0.4254405389711991
  (1, 5)	0.7203334490549893
  (1, 2)	0.5478321549274363
  (1, 1)	0.4254405389711991
  (2, 4)	0.4323850887896905
  (2, 6)	0.4323850887896905
  (2, 7)	0.4323850887896905
  (2, 0)	0.4323850887896905
  (2, 3)	0.4323850887896905
  (2, 1)	0.25537359879528915

None


In [5]:
# Initialize the TfidfVectorizer with default parameters
vectorizer = TfidfVectorizer()

# Fit and transform the data to calculate TF-IDF
tfidf_matrix = vectorizer.fit_transform(data)

# Get the feature names (unique words)
feature_names = vectorizer.get_feature_names_out()

# Get the TF-IDF values in a dense matrix
dense_tfidf_matrix = tfidf_matrix.todense()

# Create a DataFrame for TF-IDF values
tfidf_df = pd.DataFrame(dense_tfidf_matrix, columns=feature_names)
print("TF-IDF Matrix:")
print(tfidf_df)


TF-IDF Matrix:
     antony        he        is       isn  mathematics     peter     study  \
0  0.000000  0.425441  0.547832  0.000000     0.000000  0.000000  0.000000   
1  0.000000  0.425441  0.547832  0.000000     0.000000  0.720333  0.000000   
2  0.432385  0.255374  0.000000  0.432385     0.432385  0.000000  0.432385   

        who   william  
0  0.000000  0.720333  
1  0.000000  0.000000  
2  0.432385  0.000000  


In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the small corpus
data = ["He is William",
        "He is Peter",
        "He isn't Antony who study Mathematics"]

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the data to calculate TF-IDF
tfidf_matrix = vectorizer.fit_transform(data)

# Get the feature names (unique words)
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a dense format
dense_tfidf_matrix = tfidf_matrix.todense()

# Create a DataFrame for TF-IDF values
tfidf_df = pd.DataFrame(dense_tfidf_matrix, columns=feature_names)
print("TF-IDF Matrix:")
print(tfidf_df)

# Calculate Term Frequency (TF)
def compute_tf(term, document):
    return document.count(term) / len(document.split())

tf_values = {term: [compute_tf(term, doc) for doc in data] for term in feature_names}
tf_df = pd.DataFrame(tf_values)
print("\nTerm Frequency (TF) Matrix:")
print(tf_df)

# Calculate Inverse Document Frequency (IDF)
def compute_idf(term, documents):
    num_docs_containing_term = sum(1 for doc in documents if term in doc.split())
    return np.log((1 + len(documents)) / (1 + num_docs_containing_term)) + 1

idf_values = {term: compute_idf(term, data) for term in feature_names}
idf_df = pd.DataFrame(idf_values, index=["IDF"]).T
print("\nInverse Document Frequency (IDF) Values:")
print(idf_df)

# Calculate TF-IDF manually and normalize
tfidf_manual_df = tf_df * idf_df.T.values
tfidf_manual_df_normalized = tfidf_manual_df.div(np.sqrt((tfidf_manual_df**2).sum(axis=1)), axis=0)
print("\nTF-IDF Matrix (Manual Calculation, Normalized):")
print(tfidf_manual_df_normalized)


TF-IDF Matrix:
     antony        he        is       isn  mathematics     peter     study  \
0  0.000000  0.425441  0.547832  0.000000     0.000000  0.000000  0.000000   
1  0.000000  0.425441  0.547832  0.000000     0.000000  0.720333  0.000000   
2  0.432385  0.255374  0.000000  0.432385     0.432385  0.000000  0.432385   

        who   william  
0  0.000000  0.720333  
1  0.000000  0.000000  
2  0.432385  0.000000  

Term Frequency (TF) Matrix:
   antony        he        is       isn  mathematics  peter     study  \
0     0.0  0.000000  0.333333  0.000000          0.0    0.0  0.000000   
1     0.0  0.000000  0.333333  0.000000          0.0    0.0  0.000000   
2     0.0  0.166667  0.166667  0.166667          0.0    0.0  0.166667   

        who  william  
0  0.000000      0.0  
1  0.000000      0.0  
2  0.166667      0.0  

Inverse Document Frequency (IDF) Values:
                  IDF
antony       2.386294
he           2.386294
is           1.287682
isn          2.386294
mathematic