In [66]:
# Step 1: Set up

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
# Four-sentence toy corpus that the vectorizer cells below are fitted on.
documents = [
    "Data science is an interdisciplinary field.",
    "Machine learning is a subset of artificial intelligence.",
    "Data science uses machine learning algorithms.",
    "Artificial intelligence and data science are growing fields.",
]

# Emit the section banner, the label and the raw list in a single call; the
# newline separator reproduces the line breaks of three separate prints.
print("\nStep 1: Set up", "\nDocuments:", documents, sep="\n")


Step 1: Set up

Documents:
['Data science is an interdisciplinary field.', 'Machine learning is a subset of artificial intelligence.', 'Data science uses machine learning algorithms.', 'Artificial intelligence and data science are growing fields.']


In [68]:
# Step 2: Bag of Words

# Learn the vocabulary and count each term's occurrences per document.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)

# Convert the count matrix into a labelled DataFrame for readability
# (one row per document, one column per vocabulary term).
bow_df = pd.DataFrame(
    bow_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=[f"Doc {i + 1}" for i in range(len(documents))],
)

# Derive the findings from the counts instead of typing them by hand, so the
# printed answers cannot drift from the table if the corpus changes.
bow_term_totals = bow_df.sum(axis=0)
bow_top_count = int(bow_term_totals.max())
bow_top_terms = bow_term_totals[bow_term_totals == bow_top_count].index
bow_single_terms = bow_term_totals[bow_term_totals == 1].index
bow_top_text = ", ".join(f"'{term}'" for term in bow_top_terms)
bow_single_text = ", ".join(f"'{term}'" for term in bow_single_terms)

print("\nStep 2: Bag of Words")
print("\nBag of Words Model:")
print()
print(bow_df)
print()
print(f"\n1. The most frequent word(s) across all documents: {bow_top_text},"
      f" each with a total count of {bow_top_count}.")
print(f"\n2. There are {len(bow_single_terms)} word(s) that appear only once across all"
      f" {len(documents)} documents: {bow_single_text}.")


Step2: Bag of Words

Bag of Words Model:

       algorithms  an  and  are  artificial  data  field  fields  growing  \
Doc 1           0   1    0    0           0     1      1       0        0   
Doc 2           0   0    0    0           1     0      0       0        0   
Doc 3           1   0    0    0           0     1      0       0        0   
Doc 4           0   0    1    1           1     1      0       1        1   

       intelligence  interdisciplinary  is  learning  machine  of  science  \
Doc 1             0                  1   1         0        0   0        1   
Doc 2             1                  0   1         1        1   1        0   
Doc 3             0                  0   0         1        1   0        1   
Doc 4             1                  0   0         0        0   0        1   

       subset  uses  
Doc 1       0     0  
Doc 2       1     0  
Doc 3       0     1  
Doc 4       0     0  


1. The two words that appear most frequently across all documents ar

In [69]:
# Step 3: TF-IDF

# Create a TF-IDF representation of the documents using TfidfVectorizer.
vectorizer_tfidf = TfidfVectorizer()
tfidf_matrix = vectorizer_tfidf.fit_transform(documents)

# Convert the resulting TF-IDF matrix into a labelled DataFrame for readability
# (one row per document, one column per vocabulary term).
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer_tfidf.get_feature_names_out(),
    index=[f"Doc {i + 1}" for i in range(len(documents))],
)

# Rank Doc 1's terms on the same 3-decimal values that are displayed below, so
# the printed answer always agrees with the table and ties are reported together.
doc1_scores = tfidf_df.loc["Doc 1"].round(3)
doc1_top_score = doc1_scores.max()
doc1_top_terms = doc1_scores[doc1_scores == doc1_top_score].index
doc1_top_text = ", ".join(f"'{term}'" for term in doc1_top_terms)

print("\nStep 3: TF-IDF")
print("\nTF-IDF Representation")
print()
print(tfidf_df.round(3))
print(f"\n1. The term(s) with the highest TF-IDF score in the first document (Doc 1): {doc1_top_text},"
      f" each with a score of {doc1_top_score:.3f}.")
print("\n2. Some terms have a TF-IDF score of 0.000 in certain documents because the Term Frequency (TF) component"
      " of the calculation is zero, meaning that specific word does not appear in that particular document.")


Step 3: TF-IDF

TF-IDF Representation

       algorithms     an    and    are  artificial   data  field  fields  \
Doc 1       0.000  0.475  0.000  0.000       0.000  0.303  0.475   0.000   
Doc 2       0.000  0.000  0.000  0.000       0.349  0.000  0.000   0.000   
Doc 3       0.496  0.000  0.000  0.000       0.000  0.317  0.000   0.000   
Doc 4       0.000  0.000  0.406  0.406       0.320  0.259  0.000   0.406   

       growing  intelligence  interdisciplinary     is  learning  machine  \
Doc 1    0.000         0.000              0.475  0.374     0.000    0.000   
Doc 2    0.000         0.349              0.000  0.349     0.349    0.349   
Doc 3    0.000         0.000              0.000  0.000     0.391    0.391   
Doc 4    0.406         0.320              0.000  0.000     0.000    0.000   

          of  science  subset   uses  
Doc 1  0.000    0.303   0.000  0.000  
Doc 2  0.442    0.000   0.442  0.000  
Doc 3  0.000    0.317   0.000  0.496  
Doc 4  0.000    0.259   0.000  0.000 

In [71]:
# Step 4: N-grams

# Generate bigrams (2-word combinations) using CountVectorizer with ngram_range=(2, 2)
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2))
bigram_matrix = vectorizer_bigram.fit_transform(documents)

# Convert the resulting bigram matrix into a labelled DataFrame for readability
# (one row per document, one column per bigram).
bigram_df = pd.DataFrame(
    bigram_matrix.toarray(),
    columns=vectorizer_bigram.get_feature_names_out(),
    index=[f"Doc {i + 1}" for i in range(len(documents))],
)

# Derive the most frequent bigram(s) and the documents containing them from the
# counts, so the printed finding stays correct if the corpus changes.
bigram_totals = bigram_df.sum(axis=0)
bigram_top_count = int(bigram_totals.max())
bigram_top_notes = []
for bigram in bigram_totals[bigram_totals == bigram_top_count].index:
    docs_with_bigram = ", ".join(bigram_df[bigram_df[bigram] > 0].index)
    bigram_top_notes.append(f"'{bigram}' (in {docs_with_bigram})")
bigram_top_text = "; ".join(bigram_top_notes)

print("\nStep 4: N-grams")
print("\nBigram (N-gram) Representation")
print()
print(bigram_df)
print(f"\n1. The most frequent bigram(s) across all documents: {bigram_top_text},"
      f" with a total count of {bigram_top_count}.")
print("\n2. Bigrams provide additional context compared to unigrams (single words) by"
      " capturing the sequence and relationship between two adjacent words, which helps"
      " differentiate meanings (e.g., 'data science' versus just 'data' or 'science').")


Step4: N-grams

Bigram (N-gram) Representation

       an interdisciplinary  and data  are growing  artificial intelligence  \
Doc 1                     1         0            0                        0   
Doc 2                     0         0            0                        1   
Doc 3                     0         0            0                        0   
Doc 4                     0         1            1                        1   

       data science  growing fields  intelligence and  \
Doc 1             1               0                 0   
Doc 2             0               0                 0   
Doc 3             1               0                 0   
Doc 4             1               1                 1   

       interdisciplinary field  is an  is subset  learning algorithms  \
Doc 1                        1      1          0                    0   
Doc 2                        0      0          1                    0   
Doc 3                        0      0          0  

In [72]:
# Step 5: Analyze Combined Representations
#
# This cell only prints the written comparison; it computes nothing. Each
# answer is built from adjacent string literals, so every continuation piece
# starts with a space and every sentence boundary keeps a space after the period.

print("\nStep 5: Analyze Combined Representations")
print("\nCompare the results from BoW and TF-IDF:")
print("\n1. BoW simply counts how often a term appears (raw frequency), giving a high score"
      " to common words like 'is' (count of 2 in this corpus); in contrast, TF-IDF down-weights terms"
      " that appear frequently across many documents, resulting in a low TF-IDF score for 'is'"
      " (0.374 in Doc 1) because it's not unique.")
print("\n2. TF-IDF is preferred when you need to focus on the most distinctive and significant terms"
      " in each document, rather than common words that offer little unique information. For instance,"
      " 'interdisciplinary' only appears once, so TF-IDF gives it a high score (0.475 in Doc 1), highlighting"
      " its importance to that specific document. The high scores assigned to these unique terms help data analysts"
      " quickly identify the main topic or content of a document when performing tasks like information retrieval or"
      " document classification.")

print("\nCompare unigrams (single words) with bigrams:")
print("\n1. Bigrams capture relationships by treating an ordered pair of words as a single feature, showing words"
      " that commonly appear adjacent to each other. This is important to people that analyze the dataset because"
      " it helps them understand context and differentiate between specific concepts or phrases that individual words"
      " might miss.")
print("\n2. For example, the bigram 'data science' is a specific field, which adds more meaning than just analyzing"
      " the individual words 'data' and 'science' separately. The bigram 'data science' adds more meaning than the"
      " individual words 'data' and 'science' because it functions as a single, specific term that names an entire"
      " academic discipline and professional field.")


Step 5:Analyze Combined Representations

Compare the results from BoW and TF-IDF:

1. BoW simply counts how often a term appears (raw frequency), giving a high scoreto common words like 'is' (count of 2 in your data); in contrast, TF-IDF down-weights termsthat appear frequently across many documents, resulting in a low TF-IDF score for 'is'(0.374 in Doc 1) because it's not unique.

2. TF-IDF is preferred when you need to focus on the most distinctive and significant termsin each document, rather than common words that offer little unique information. For instance,'interdisciplinary' only appears once, so TF-IDF gives it a high score (0.475 in Doc 1), highlightingits importance to that specific document.The high scores assigned to these unique terms help data analystsquickly identify the main topic or content of a document when performing tasks like information retrieval ordocument classification.

Compare unigrams (single words) with bigrams:

1. Bigrams capture relationships by treat