In [1]:
# pip install pandas scikit-learn

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
"Python is a great programming language for data science.",
"Data science and machine learning are popular fields.",
"Machine learning relies on mathematics and programming."
]

In [3]:
# Compute TF-IDF Scores (No Stop Words)

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Convert to a DataFrame for better readability
tfidf_df = pd.DataFrame(
tfidf_matrix.toarray(),
columns=tfidf_vectorizer.get_feature_names_out()
)
print(tfidf_df)
print("\n1. Which term has the highest TF-IDF score in each document?")
print("The term(s) with the highest TF-IDF score in each document are:")
print("Document 0 term(s) are python, great, is, for, language with a TF-IDF score of 0.385323")
print("Document 1 term(s) are popular, fields, are with a TF-IDF score of 0.411973")
print("Document 2 term(s) are mathematics, relies, on with a TF-IDF score of 0.433816")
print("\n2. Are there any terms with a score of 0? Why?")
print("Yes, there are many terms with a score of 0.000000. This is because term Frequency is 0 for some words in a document")
print("Example: The word 'python' has a score of 0.000000 in Document 2 because the word 'python' does not appear in the second sentence.")

        and       are      data    fields       for     great        is  \
0  0.000000  0.000000  0.293048  0.000000  0.385323  0.385323  0.385323   
1  0.313316  0.411973  0.313316  0.411973  0.000000  0.000000  0.000000   
2  0.329928  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

   language  learning   machine  mathematics        on   popular  programming  \
0  0.385323  0.000000  0.000000     0.000000  0.000000  0.000000     0.293048   
1  0.000000  0.313316  0.313316     0.000000  0.000000  0.411973     0.000000   
2  0.000000  0.329928  0.329928     0.433816  0.433816  0.000000     0.329928   

     python    relies   science  
0  0.385323  0.000000  0.293048  
1  0.000000  0.000000  0.313316  
2  0.000000  0.433816  0.000000  

1. Which term has the highest TF-IDF score in each document?
The term(s) with the highest TF-IDF score in each document are:
Document 0 term(s) are python, great, is, for, language with a TF-IDF score of 0.385323
Document 1 term(s) are p

In [8]:
# Customize the TF-IDF Vectorizer

# Modify the TfidfVectorizer to remove common English stop words.
# Use max_features=10 to limit the matrix to the top 10 terms.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10)

# Fit and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Convert to a DataFrame for better readability
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out()
)

print(tfidf_df)
print("\nIdentify the top 3 terms across the corpus with the highest importance.")
print("\nThe top 3 terms across the entire corpus with the highest importance are:")
print("Document 0: great and language, tied with a TF-IDF score of 0.51742 ")
print("Document 1: fields and popular, tied with a TF-IDF score of 0.481482")
print("Document 2: mathematics with the highest TF-IDF score of 0.604652")

       data    fields    great  language  learning   machine  mathematics  \
0  0.393511  0.000000  0.51742   0.51742  0.000000  0.000000     0.000000   
1  0.366180  0.481482  0.00000   0.00000  0.366180  0.366180     0.000000   
2  0.000000  0.000000  0.00000   0.00000  0.459854  0.459854     0.604652   

    popular  programming   science  
0  0.000000     0.393511  0.393511  
1  0.481482     0.000000  0.366180  
2  0.000000     0.459854  0.000000  

Identify the top 3 terms across the corpus with the highest importance.

The top 3 terms across the entire corpus with the highest importance are:
Document 0: great and language, tied with a TF-IDF score of 0.51742 
Document 1: fields and popular, tied with a TF-IDF score of 0.481482
Document 2: mathematics with the highest TF-IDF score of 0.604652


In [9]:
#Term Importance Analysis (from the customized version)

# Sum TF-IDF scores across all documents
term_importance = tfidf_matrix.sum(axis=0).A1
terms = tfidf_vectorizer.get_feature_names_out()
# Create a DataFrame of terms and their importance
importance_df = pd.DataFrame({"Term": terms, "Importance": term_importance})
importance_df = importance_df.sort_values(by="Importance", ascending=False)
print(importance_df)
print("\n1. Which terms are the most important in the corpus?")
print("\nThe top 3 most important terms are:")
print("programming (Total Importance: 0.853365)")
print("learning (Total Importance: 0.826034)")
print("machine (Total Importance: 0.826034)")
print("\n2. How do these terms relate to the overall content?")
print("The important terms, programming, learning, and machine, clearly define the subject as Machine Learning." 
        "Programming acts as the crucial link, showing that code is necessary for Machine Learning by appearing, "
        "in both the sentence about Python and the sentence about technical requirements."
)


          Term  Importance
8  programming    0.853365
4     learning    0.826033
5      machine    0.826033
0         data    0.759691
9      science    0.759691
6  mathematics    0.604652
2        great    0.517420
3     language    0.517420
1       fields    0.481482
7      popular    0.481482

1. Which terms are the most important in the corpus?

The top 3 most important terms are:
programming (Total Importance: 0.853365)
learning (Total Importance: 0.826034)
machine (Total Importance: 0.826034)

2. How do these terms relate to the overall content?
The important terms, programming, learning, and machine, clearly define the subject as Machine Learning.Programming acts as the crucial link, showing that code is necessary for Machine Learning by appearing, in both the sentence about Python and the sentence about technical requirements.


In [10]:
#Apply TF-IDF to Your Own Dataset

new_documents = [
"Books hold stories, and stories live in books.",
"Books build minds, and minds build books.",
"Through books and minds, stories never end."
]

#Exclude stop words
# Use max_features=10 to limit the matrix to the top 10 terms.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10)

# Fit and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(new_documents)

# Convert to a DataFrame for better readability
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out()
)

print(tfidf_df)

      books     build       end      hold      live     minds   stories
0  0.494378  0.000000  0.000000  0.418527  0.418527  0.000000  0.636601
1  0.425441  0.720333  0.000000  0.000000  0.000000  0.547832  0.000000
2  0.373119  0.000000  0.631745  0.000000  0.000000  0.480458  0.480458


In [11]:
# Sum TF-IDF scores across all documents
term_importance = tfidf_matrix.sum(axis=0).A1
terms = tfidf_vectorizer.get_feature_names_out()
# Create a DataFrame of terms and their importance
importance_df = pd.DataFrame({"Term": terms, "Importance": term_importance})
importance_df = importance_df.sort_values(by="Importance", ascending=False)
print(importance_df.head(5))
print(
    "\nThe most important term in my dataset is books (1.292937), "
    "because its high score shows it is the central subject of every sentence. "
    "       The text uses \"books\" as the recurring item that both holds stories and helps build minds."
)

      Term  Importance
0    books    1.292937
6  stories    1.117060
5    minds    1.028291
1    build    0.720333
2      end    0.631745

The most important term in my dataset is books (1.292937), because its high score shows it is the central subject of every sentence.        The text uses "books" as the recurring item that both holds stories and helps build minds.
