In [None]:
!pip install nltk  #natular language tool kit library offers prebuilt algori , datasets,and lot of module as below etc
# Major Components of NLTK
# Module	Description
# nltk.tokenize	Splits text into sentences or words.
# nltk.corpus	Access to large collections of text (like stopwords, movie reviews, names, etc.).
# nltk.stem	Performs stemming (reduces words to their root form).
# nltk.tag	Part-of-speech tagging for words.
# nltk.chunk	Named entity recognition and phrase chunking.
# nltk.parse	Parsing text using grammar rules.
# nltk.classify	Tools for text classification.
# nltk.metrics	Functions to evaluate NLP models.
# nltk.probability	Language modeling, frequency distributions.
# nltk.sem	Tools for representing and processing meaning (semantics).



In [None]:
!pip install scikit-learn #library



In [None]:
# 1 TOKENIZATION
# "I LIKE IBM SESSION" -> ["I", "LIKE", "IBM", "SESSION"]
# Each word is referred to as Tokens - 4

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+') #\w+ wont considee spaces, punctution etc , as  it didn consider  fullstop while tokenizing

text = "I LIKE IBM SESSION!"
print(text.split())
tokens = tokenizer.tokenize(text)

print(text)
print(tokens)


['I', 'LIKE', 'IBM', 'SESSION!']
I LIKE IBM SESSION!
['I', 'LIKE', 'IBM', 'SESSION']


In [None]:
# 2 stopwords removal

# The purpose of this code is to remove stopwords from a list of tokens.
# Stopwords are common words (like "the", "is", "in") that are usually filtered out in NLP tasks.

# Example input:  [Yesterday I went to shopping inorder to buy an Iphone]
# Expected output: [Yesterday, went, shopping, inorder, buy, Iphone]

from nltk.corpus import stopwords  # Import the stopwords list from NLTK
import nltk  # Import the NLTK library

nltk.download('stopwords')  # Download the stopwords data (only needs to be done once)

tokens = ['I', 'LIKE', 'IBM', 'SESSION']  # List of words (tokens) to process
stop_words = set(stopwords.words('english'))  # Create a set of English stopwords

# List comprehension to filter out stopwords:
# For each word in tokens, if its lowercase form is not in stop_words, keep it.
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

print(filtered_tokens)  # Output the filtered list of tokens (stopwords removed)


['LIKE', 'IBM', 'SESSION']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# stemming
# Converting the words to their root form - Stemming process

from nltk.stem import PorterStemmer  # Import the PorterStemmer from NLTK

stemmer = PorterStemmer()  # Create a stemmer object
words = ["playing", "played", "plays"]  # List of words to stem

# List comprehension to stem each word in the list
# For each 'word' in 'words', apply 'stemmer.stem(word)' and collect the result in a new list called 'stems'
stems = [stemmer.stem(word) for word in words]

print(stems)  # Output: ['play', 'play', 'play']


['play', 'play', 'play']


In [None]:
# 4 LEMMATIZATION
# Convert the words to their dictionary (base) form using lemmatization.
# Example: "Better" --> "good", "Running" --> "run"

from nltk.stem import WordNetLemmatizer  # Import WordNetLemmatizer from NLTK
import nltk  # Import the NLTK library

nltk.download('wordnet')    # Download the WordNet data needed for lemmatization
nltk.download('omw-1.4')    # Download the Open Multilingual Wordnet data (for better coverage)

lemmatizer = WordNetLemmatizer()  # Create a lemmatizer object

# Lemmatize the word "better" as an adjective ('a'), returns its dictionary form "good"
print(lemmatizer.lemmatize("better", pos="a"))

# Lemmatize the word "running" as a verb ('v'), returns its dictionary form "run"
print(lemmatizer.lemmatize("running", pos="v"))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


good
run


In [None]:
# 5 PART OF SPEECH TAGGING
# This code identifies the part of speech (POS) for each word in a sentence.
# Example: "run" --> VERB, "apple" --> NOUN, "beautiful" --> ADJECTIVE

from nltk.tokenize import TreebankWordTokenizer  # Import tokenizer for splitting sentences into words
import nltk  # Import NLTK library

nltk.download('averaged_perceptron_tagger_eng')  # Download POS tagging model
from nltk import pos_tag  # Import POS tagger

tokenizer = TreebankWordTokenizer()  # Create a tokenizer object

sentence = "I like to play football"  # Input sentence
tokens = tokenizer.tokenize(sentence)  # Split the sentence into tokens (words)

# Assign part of speech tags to each token
# pos_tag returns a list of tuples: (word, POS tag)
pos_tags = pos_tag(tokens)

print("TOKENS :", tokens)      # Output the list of tokens
print("POS_TAGS :", pos_tags)  # Output the list of (token, POS) pairs


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


TOKENS : ['I', 'like', 'to', 'play', 'football']
POS_TAGS : [('I', 'PRP'), ('like', 'VBP'), ('to', 'TO'), ('play', 'VB'), ('football', 'NN')]


In [None]:
# 6 NAMED ENTITY RECOGNITION (NER)
# This code identifies and classifies named entities (like people, places, organizations) in text.
# Example:
#   "I BOUGHT A APPLE COMPUTER" --> 'APPLE' as ORGANISATION
#   "I BOUGHT AN APPLE FROM THE MARKET TO MAKE JUICE" --> 'APPLE' as FRUIT

import spacy  # Import the spaCy library for NLP

# Load the small English model for spaCy, which includes NER capabilities
nlp = spacy.load("en_core_web_sm")

# Input sentence for entity recognition
sentence = nlp("Barack Obama was born in Hawaii.")

# Iterate over detected entities in the sentence
for ent in sentence.ents:
    # Print the entity text and its label (type)
    print((ent.text, ent.label_))

# Output:
# ('Barack Obama', 'PERSON')
# ('Hawaii', 'GPE')

# Explanation:
# - 'Barack Obama' is recognized as a PERSON.
# - 'Hawaii' is recognized as a GPE (Geo-Political Entity, such as a country, state, or city).


('Barack Obama', 'PERSON')
('Hawaii', 'GPE')


In [None]:
# 1. BAG OF WORDS
# This code demonstrates the Bag of Words technique, which converts text into numerical feature vectors.
# Each unique word in the corpus becomes a feature (a column), and each sentence/document is represented as a vector
# indicating the presence (1) or absence (0) of each word.

from sklearn.feature_extraction.text import CountVectorizer  # Import CountVectorizer from scikit-learn

texts = ["I LIKE FOOTBALL", "FOOTBALL IS GREAT"]  # Input sentences/documents

vectorizer = CountVectorizer()  # Create a CountVectorizer object

x = vectorizer.fit_transform(texts)  # Learn the vocabulary and transform the texts into vectors

# Get the list of feature names (unique words found, sorted alphabetically)
print(vectorizer.get_feature_names_out())  # Output: ['football' 'great' 'is' 'like']

# Convert the sparse matrix to a dense array and print it
print(x.toarray())  # Output: [[1 0 0 1] [1 1 1 0]]


['football' 'great' 'is' 'like']
[[1 0 0 1]
 [1 1 1 0]]


In [None]:
# 2. TF-IDF (Term Frequency-Inverse Document Frequency)
# This code converts text data into a matrix of TF-IDF scores.
# TF-IDF reflects how important a word is to a document in the context of the entire dataset.
# Words that appear frequently in one document but not in others get higher scores.

from sklearn.feature_extraction.text import TfidfVectorizer  # Import TfidfVectorizer

texts = ["I love NLP", "NLP is great"]  # Input sentences/documents

tfidf = TfidfVectorizer()  # Create a TfidfVectorizer object

x = tfidf.fit_transform(texts)  # Learn the vocabulary and transform the texts into TF-IDF vectors

# Print the unique words (features) found in the data, sorted alphabetically
print(tfidf.get_feature_names_out())  # Output: ['great' 'is' 'love' 'nlp']

# Print the TF-IDF matrix for each sentence
print(x.toarray())
# Output:
# [[0.         0.         0.81480247 0.57973867]
#  [0.6316672  0.6316672  0.         0.44943642]]

# Explanation:
# Each value represents the importance of a word in a sentence.
# For the first sentence ("I love NLP"):
#   - 'love' and 'nlp' have higher values, meaning they are more important in this sentence.
# For the second sentence ("NLP is great"):
#   - 'great', 'is', and 'nlp' have nonzero values, showing their importance in this context.


['great' 'is' 'love' 'nlp']
[[0.         0.         0.81480247 0.57973867]
 [0.6316672  0.6316672  0.         0.44943642]]


In [7]:
# 3. WORD2VEC - Convert words into vectors (using pre-trained embeddings)
# Word2Vec represents each word as a vector of numbers, capturing semantic meaning.
# Example: king - man + woman ≈ queen

# If gensim is not installed, uncomment the next line:
# !pip install gensim

import gensim.downloader as api  # Import downloader for pre-trained models

# Load the pre-trained Word2Vec model (Google News vectors)
model = api.load("word2vec-google-news-300")

# Find the word most similar to 'king' + 'woman' - 'man'
similar = model.most_similar(positive=["king", "woman"], negative=["man"])

print(similar[:1])  # Output: [('queen', similarity_score)]


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [10]:
!pip install transformers
#step1



              Transformeres
              

In [8]:
#2
from transformers import AutoTokenizer  # Import the AutoTokenizer class from Hugging Face Transformers

# Load the pre-trained GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [9]:
#3. TOKENIZATION USING TRANSFORMERS (GPT-2)
# This code demonstrates how to tokenize a sentence using a pre-trained transformer tokenizer (GPT-2).
# Each word or subword in the input is converted to a unique token ID used by the model.
# You can also decode each token ID back to its text representation.

from transformers import AutoTokenizer  # Import the AutoTokenizer class from Hugging Face Transformers

# Load the pre-trained GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Tokenize the input sentence and get the token IDs as PyTorch tensors
ids = tokenizer("It was a dark and stormy", return_tensors="pt").input_ids

# For each token ID in the first (and only) sentence
for t in ids[0]:
    # Print the token ID, a tab, and the decoded text for that token
    print(t, "\t:", tokenizer.decode(t))

# Explanation:
# - The input sentence is split into tokens as understood by the GPT-2 model.
# - Each token is mapped to a unique integer ID (used internally by the model).
# - Some words (like "stormy") may be split into multiple tokens ("storm", "y") due to the subword tokenization approach.
# - Decoding each token ID gives the corresponding text fragment.
#

tensor(1026) 	: It
tensor(373) 	:  was
tensor(257) 	:  a
tensor(3223) 	:  dark
tensor(290) 	:  and
tensor(6388) 	:  storm
tensor(88) 	: y


In [12]:
!pip install --upgrade transformers




In [13]:
# 4. LOADING THE GPT-2 MODEL AND PROCESSING INPUT
# This code loads the pre-trained GPT-2 language model and processes your tokenized input.
# It demonstrates how the model handles input sequences and produces output logits (prediction scores).

from transformers import AutoModelForCausalLM  # Import the GPT-2 model class

# Load the pre-trained GPT-2 model
# pad_token_id=tokenizer.eos_token_id sets the padding token to the end-of-sequence token
# This is needed because GPT-2 doesn't have a default padding token
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

# Pass the tokenized input (ids) to the model to get the outputs
outputs = gpt2(ids)

# Print the shape of the output logits
print(outputs.logits.shape)  # Output: torch.Size([1, sequence_length, 50257])

# Explanation of padding with example:
# If you have sentences of different lengths in a batch:
# - "I like NLP" → 3 tokens
# - "I like to play video games" → 6 tokens
#
# The model needs all inputs to be the same length, so padding is added:
# - "I like NLP [PAD] [PAD] [PAD]" → 6 tokens (padded to match longest)
# - "I like to play video games" → 6 tokens (no padding needed)
#
# This allows the model to process multiple sentences together efficiently.

# What the output means:
# outputs.logits.shape = [batch_size, sequence_length, vocab_size]
# - batch_size: number of input sequences (1 in this case)
# - sequence_length: number of tokens in your input sentence
# - vocab_size: size of the model's vocabulary (50,257 for GPT-2)
#
# Each value in logits represents the model's confidence score for each possible token
# at each position in the sequence.


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

torch.Size([1, 7, 50257])


In [14]:
# 5. PREDICT THE NEXT WORDS USING GPT-2 GENERATE FUNCTION
# This code uses the GPT-2 model to generate text continuation for your input sentence.
# The model predicts and appends new tokens (words/subwords) to the original input.

# Generate text: the model will predict up to 20 new tokens after your input
output_ids = gpt2.generate(ids, max_new_tokens=20)

# Print the generated token IDs
print(output_ids)

# Decode the generated token IDs back into readable text
print(tokenizer.decode(output_ids[0]))

# Explanation:
# - gpt2.generate(ids, max_new_tokens=20) takes your input tokens and generates up to 20 more tokens as a continuation.
# - output_ids contains both your original input and the newly generated tokens.
# - tokenizer.decode(output_ids[0]) converts the sequence of token IDs back into a human-readable sentence.
# - The model continues your story, creating new sentences or phrases that follow your prompt.
#
# Example output:
# 'It was a dark and stormy night. The wind was blowing, and the clouds were falling. The wind was blowing, and the'
#
# This shows how GPT-2 can be used to generate creative text based on your input.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


tensor([[ 1026,   373,   257,  3223,   290,  6388,    88,  1755,    13,   383,
          2344,   373, 19280,    11,   290,   262, 15114,   547,  7463,    13,
           383,  2344,   373, 19280,    11,   290,   262]])
It was a dark and stormy night. The wind was blowing, and the clouds were falling. The wind was blowing, and the
