##  Step 1: Install & Import Required Libraries

In [20]:
# One-time install of the third-party packages (uncomment if they are missing)
#!pip install nltk spacy gensim

# --- Standard library ---
import string
from collections import Counter

# --- Third-party ---
import nltk
import spacy
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK resources this notebook relies on; nltk.download reports
# "already up-to-date" when the package is present locally.
nltk.download('punkt')
nltk.download('stopwords')

# Load spaCy's small English pipeline once; `nlp` is reused by later cells.
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gowtham.T\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gowtham.T\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# One-time setup: uncomment and run this if the "en_core_web_sm" model that
# the notebook loads with spacy.load(...) is not installed yet.
# !python -m spacy download en_core_web_sm


 ### Step 2: Read & Display the Text File

In [21]:
# Load the whole corpus into memory as a single UTF-8 string.
with open("sample.txt", "r", encoding="utf-8") as source_file:
    text = source_file.read()

# Preview only the first 500 characters so the output stays short.
print("Original Text:\n", text[:500], sep="\n")


Original Text:

Artificial Intelligence (AI) has evolved significantly over the past few decades. 
Early AI systems were rule-based, meaning they followed predefined instructions and couldn’t adapt to new information. 
However, with the rise of Machine Learning, AI began to learn from data instead of relying on fixed rules.

One major breakthrough was the development of Neural Networks, which mimic the human brain. 
These networks improved AI’s ability to recognize patterns, understand language, and generate hu


In [22]:
# Download the 'punkt' and 'averaged_perceptron_tagger' NLTK packages.
# 'punkt' is also requested by the setup cell, so this call only re-checks it.
nltk.download('punkt')
# NOTE(review): no visible cell uses the tagger package -- confirm it is needed.
# As the last expression, this call's return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gowtham.T\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Gowtham.T\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Step 3: Preprocess the Text (Lowercasing, Removing Punctuation, Tokenization, Stopwords, Lemmatization)

In [23]:
import os

# Register the per-user NLTK data directory without hard-coding a username.
# On Windows %APPDATA% points at ...\AppData\Roaming, which is where the
# download cells above stored their packages; on other platforms fall back to
# ~/nltk_data.
_appdata_dir = os.environ.get("APPDATA")
if _appdata_dir:
    nltk_data_dir = os.path.join(_appdata_dir, "nltk_data")
else:
    nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")

# Guard the append so re-running the cell does not add duplicate entries.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)


In [24]:
import nltk
import spacy
import string
from nltk.corpus import stopwords

# Download NLTK resources
nltk.download('stopwords')  # For stopwords

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")  # Load SpaCy's small English model

# Sample text
text = "This is an example sentence! The quick brown fox jumped over the lazy dog."

# Convert text to lowercase
text = text.lower()

# Remove ASCII punctuation (string.punctuation does not cover Unicode marks
# such as curly quotes)
text = text.translate(str.maketrans("", "", string.punctuation))

# Tokenization using SpaCy (instead of NLTK word_tokenize).
# The text is parsed exactly once; filtering and lemmatization below reuse the
# same Token objects.
doc = nlp(text)
tokens = [token.text for token in doc]

# Remove stopwords, and also drop whitespace-only tokens that spaCy emits for
# newlines / repeated spaces so they cannot leak into the vocabulary.
stop_words = set(stopwords.words("english"))
content_tokens = [
    token for token in doc
    if not token.is_space and token.text not in stop_words
]
filtered_tokens = [token.text for token in content_tokens]

# Lemmatization using SpaCy: lemmas come from the original parse, so they keep
# their sentence context and stay aligned one-to-one with filtered_tokens
# (re-parsing the stopword-stripped string guarantees neither).
lemmatized_tokens = [token.lemma_ for token in content_tokens]

# Display results
print("\nFiltered Tokens (First 20):", filtered_tokens[:20])
print("\nLemmatized Tokens (First 20):", lemmatized_tokens[:20])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gowtham.T\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Filtered Tokens (First 20): ['example', 'sentence', 'quick', 'brown', 'fox', 'jumped', 'lazy', 'dog']

Lemmatized Tokens (First 20): ['example', 'sentence', 'quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


### Step 4: Train a Word2Vec Model with Gensim

In [25]:
import gensim
from gensim.models import Word2Vec

# Word2Vec expects an iterable of tokenized sentences (each a list of str).
# Only one sentence is available here; extend the list for real training.
sentences = [filtered_tokens]

# Train the Word2Vec model (sg=0 selects CBOW).
# An explicit seed plus a single worker thread makes the vectors repeatable
# between runs; with several workers thread scheduling alters the result.
# Across interpreter restarts gensim additionally requires PYTHONHASHSEED to
# be fixed.
SEED = 42
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
    seed=SEED,
    workers=1,
)

# Word to inspect; it must be present in the training vocabulary.
query_word = 'dog'

# Get vector for the query word
word_vector = model.wv[query_word]
print(f"Vector for '{query_word}': \n", word_vector)

# Find the words closest to the query word
similar_words = model.wv.most_similar(query_word, topn=5)
print(f"\nMost similar words to '{query_word}':", similar_words)


Vector for 'dog': 
 [-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826881e-03
 -1.

### Step 5: Applying Bag of Words (BoW)

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer consumes raw strings, so the token list is re-joined into a
# single whitespace-separated document. Append more strings to grow the corpus.
joined_document = " ".join(filtered_tokens)
documents = [joined_document]

# Learn the vocabulary and encode the corpus as a sparse count matrix in one go.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Column order of the matrix follows this vocabulary array.
vocabulary = vectorizer.get_feature_names_out()
print("\nVocabulary:", vocabulary)

# Densify only for display; X itself stays sparse.
bow_counts = X.toarray()
print("\nBag of Words matrix:\n", bow_counts)



Vocabulary: ['brown' 'dog' 'example' 'fox' 'jumped' 'lazy' 'quick' 'sentence']

Bag of Words matrix:
 [[1 1 1 1 1 1 1 1]]


###  TF-IDF (Term Frequency-Inverse Document Frequency)


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus of raw strings built from the filtered tokens. It holds one document
# only, so add more documents to obtain informative weights.
joined_document = " ".join(filtered_tokens)
documents = [joined_document]

# Fit the vectorizer and produce the sparse TF-IDF matrix in a single step.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(documents)

# Terms in the same order as the matrix columns.
vocabulary_tfidf = tfidf_vectorizer.get_feature_names_out()

# Show the weights (densified for display only) ...
tfidf_weights = X_tfidf.toarray()
print("\nTF-IDF Matrix:\n", tfidf_weights)

# ... followed by the terms the columns correspond to (terms only, no scores).
print("\nTF-IDF Vocabulary:", vocabulary_tfidf)



TF-IDF Matrix:
 [[0.35355339 0.35355339 0.35355339 0.35355339 0.35355339 0.35355339
  0.35355339 0.35355339]]

TF-IDF Vocabulary: ['brown' 'dog' 'example' 'fox' 'jumped' 'lazy' 'quick' 'sentence']
