In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np
from string import punctuation
from nltk.tokenize import word_tokenize

# Assign documents
d0 = 'the quick brown fox jumps over the lazy dog!!!'
d1 = 'the lazy dog sleeps in the warm sunlight.'
d2 = 'the quick brown fox is very agile and clever??'
d3 = 'the sunlight, makes the lazy dog feel warm.'
d4 = 'the agile fox jumps over the brown dog.'


def clean_txt(sent):
    tokens = word_tokenize(sent.lower())
    stop_updated = list(punctuation)
    final_word = [term for term in tokens if term not in stop_updated 
               and len(term) > 2]
    res = " ".join(final_word)
    return res

# Function to remove punctuation
#def remove_punctuation(text):
#    return re.sub(r'[^\w\s]', '', text)

# Remove punctuation
# d0 = remove_punctuation(d0)
# d1 = remove_punctuation(d1)
# d2 = remove_punctuation(d2)
# d3 = remove_punctuation(d3)
# d4 = remove_punctuation(d4)

d0 = clean_txt(d0)
d1 = clean_txt(d1)
d2 = clean_txt(d2)
d3 = clean_txt(d3)
d4 = clean_txt(d4)

# Merge documents into a single corpus
string = [d0, d1, d2, d3, d4]

# Define custom stop words
stop_words = ["the", "over", "in", "fox", "dog", "very", "and", "is"]

# Create TF-IDF vectorizer object with custom stop words
tfidf = TfidfVectorizer(stop_words=stop_words)


In [19]:
# Get TF-IDF values
result = tfidf.fit_transform(string)

# Get feature names
feature_names = tfidf.get_feature_names_out()

# Get TF-IDF values for "quick" in d0
index_quick = np.where(feature_names == 'quick')[0][0]
tfidf_values = result.toarray()
tfidf_quick_d0 = tfidf_values[0][index_quick]

print(f"TF-IDF value for 'quick' in d0: {tfidf_quick_d0}")



TF-IDF value for 'quick' in d0: 0.5440812430630017


In [24]:
# Get TF-IDF values for Sentence 2
tfidf_values = result.toarray()
tfidf_sentence_2 = tfidf_values[1]  # Sentence 2 is at index 1

# Find the word with the highest TF-IDF value in Sentence 2
max_tfidf_index = np.argmax(tfidf_sentence_2)
max_tfidf_word = feature_names[max_tfidf_index]
max_tfidf_value = tfidf_sentence_2[max_tfidf_index]

print(f"Word with the highest TF-IDF value in Sentence 2: '{max_tfidf_word}' with a value of {max_tfidf_value:.4f}")



Word with the highest TF-IDF value in Sentence 2: 'sleeps' with a value of 0.6030


In [25]:
# Get TF-IDF values for Sentence 4
tfidf_values = result.toarray()
tfidf_sentence_4 = tfidf_values[3]  # Sentence 4 is at index 3

# Get the TF-IDF value for 'lazy' in Sentence 4
index_lazy = np.where(feature_names == 'lazy')[0][0]
tfidf_lazy_d4 = tfidf_sentence_4[index_lazy]

print(f"TF-IDF value for 'lazy' in Sentence 4: {tfidf_lazy_d4:.4f}")

TF-IDF value for 'lazy' in Sentence 4: 0.3458


In [23]:
# Get TF-IDF values for each sentence
tfidf_values = result.toarray()

# Find the index of the word 'sunlight'
index_sunlight = np.where(feature_names == 'sunlight')[0][0]

# Extract TF-IDF values for 'sunlight' in all sentences
tfidf_sunlight = tfidf_values[:, index_sunlight]

# Find the sentence with the highest TF-IDF value for 'sunlight'
max_tfidf_index = np.argmax(tfidf_sunlight)
max_tfidf_sentence = string[max_tfidf_index]
max_tfidf_value = tfidf_sunlight[max_tfidf_index]

print(f"Sentence with the highest TF-IDF value for 'sunlight': '{max_tfidf_sentence}'")
print(f"TF-IDF value: {max_tfidf_value:.4f}")

Sentence with the highest TF-IDF value for 'sunlight': 'the lazy dog sleeps the warm sunlight'
TF-IDF value: 0.4865


In [26]:
# Get TF-IDF values for Sentence 5
tfidf_values = result.toarray()
tfidf_sentence_5 = tfidf_values[4]  # Sentence 5 is at index 4

# Get the TF-IDF value for 'brown' in Sentence 5
index_brown = np.where(feature_names == 'brown')[0][0]
tfidf_brown_d5 = tfidf_sentence_5[index_brown]

print(f"TF-IDF value for 'brown' in Sentence 5: {tfidf_brown_d5:.4f}")


TF-IDF value for 'brown' in Sentence 5: 0.5062


In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# Define the sentences
documents = [
    'the quick brown fox jumps over the lazy dog',
    'the lazy dog sleeps in the warm sunlight',
    'the quick brown fox is very agile and clever',
    'the sunlight makes the lazy dog feel warm',
    'the agile fox jumps over the brown dog'
]

# Define stop words
stop_words = {'the', 'over', 'in', 'fox', 'dog', 'very', 'and', 'is'}

def preprocess(document):
    # Remove punctuation and lowercase words
    words = document.lower().replace('.', '').replace(',', '').replace('!', '').replace('?', '').split()
    # Remove stop words
    return ' '.join(word for word in words if word not in stop_words)

# Preprocess documents
cleaned_documents = [preprocess(doc) for doc in documents]

# Create a CountVectorizer object for unigrams and bigrams
vectorizer = CountVectorizer(ngram_range=(1, 2))  # (1, 2) means unigrams and bigrams

# Fit and transform the cleaned documents
count_matrix = vectorizer.fit_transform(cleaned_documents)

# Get the feature names
feature_names = vectorizer.get_feature_names_out()

# Count the total number of features
total_features = len(feature_names)

# Print the result
print(f"Total number of features (unigrams + bigrams): {total_features}")


Total number of features (unigrams + bigrams): 25


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the documents
documents = [
    'the quick brown fox jumps over the lazy dog',
    'the lazy dog sleeps in the warm sunlight',
    'the quick brown fox is very agile and clever',
    'the sunlight makes the lazy dog feel warm',
    'the agile fox jumps over the brown dog'
]

# Define stop words
stop_words = {'the', 'over', 'in', 'fox', 'dog', 'very', 'and', 'is'}

def preprocess(document):
    # Remove punctuation and lowercase words
    words = document.lower().replace('.', '').replace(',', '').replace('!', '').replace('?', '').split()
    # Remove stop words
    return ' '.join(word for word in words if word not in stop_words)

# Preprocess documents
cleaned_documents = [preprocess(doc) for doc in documents]


# Create a TF-IDF vectorizer with both unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit and transform the documents
X = vectorizer.fit_transform(documents)

# Get the feature names
features = vectorizer.get_feature_names_out()

# Number of features
num_features = len(features)
print(f"Total number of features: {num_features}")


Total number of features: 46
