In [10]:
# Lemmatizer
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Build a single WordNet lemmatizer and reduce each sample word to its verb ('v') lemma
lemmatizer = WordNetLemmatizer()
for sample_word in ("playing", "coagulation"):
    print(lemmatizer.lemmatize(sample_word, pos='v'))

play
coagulation


[nltk_data] Downloading package wordnet to /Users/ram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
# Normalization
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

# Fetch the NLTK resources used below (tokenizer models, WordNet, stop-word lists)
for resource_name in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource_name)

# Sample text
text = "Donal Trump has issued a flurry of executive orders on everything from immigration, climate to pardons after being sworn in as the 47th President of United States."

# Step 1: lowercase the whole text
text_lower = text.lower()
print("Lowercased text:", text_lower)

# Step 2: strip every character that is neither a word character nor whitespace
punctuation_pattern = re.compile(r'[^\w\s]')
text_no_punct = punctuation_pattern.sub('', text_lower)
print("Text without punctuation:", text_no_punct)

# Step 3: split the cleaned text into word tokens
words = nltk.word_tokenize(text_no_punct)
print("Tokenized words:", words)

# Step 4: discard English stop words
stop_words = set(stopwords.words('english'))
words_no_stop = list(filter(lambda token: token not in stop_words, words))
print("Text without stopwords:", words_no_stop)

# Step 5a: stem the remaining tokens with the Porter stemmer
ps = PorterStemmer()
words_stemmed = list(map(ps.stem, words_no_stop))
print("Stemmed words:", words_stemmed)

# Step 5b: lemmatize the same tokens (default part of speech) for comparison
lemmatizer = WordNetLemmatizer()
words_lemmatized = list(map(lemmatizer.lemmatize, words_no_stop))
print("Lemmatized words:", words_lemmatized)

Lowercased text: donal trump has issued a flurry of executive orders on everything from immigration, climate to pardons after being sworn in as the 47th president of united states.
Text without punctuation: donal trump has issued a flurry of executive orders on everything from immigration climate to pardons after being sworn in as the 47th president of united states
Tokenized words: ['donal', 'trump', 'has', 'issued', 'a', 'flurry', 'of', 'executive', 'orders', 'on', 'everything', 'from', 'immigration', 'climate', 'to', 'pardons', 'after', 'being', 'sworn', 'in', 'as', 'the', '47th', 'president', 'of', 'united', 'states']
Text without stopwords: ['donal', 'trump', 'issued', 'flurry', 'executive', 'orders', 'everything', 'immigration', 'climate', 'pardons', 'sworn', '47th', 'president', 'united', 'states']
Stemmed words: ['donal', 'trump', 'issu', 'flurri', 'execut', 'order', 'everyth', 'immigr', 'climat', 'pardon', 'sworn', '47th', 'presid', 'unit', 'state']
Lemmatized words: ['donal',

[nltk_data] Downloading package punkt to /Users/ram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# POS-Tagging
from nltk import pos_tag
from nltk import word_tokenize

text = "Narendra Modi is Prime Minister of India."
tokenized_text = word_tokenize(text)

# Attach a part-of-speech tag to every token; both names refer to the same list
tokens_tag = pos_tag(tokenized_text)
tags = tokens_tag
print(tags)

[('Narendra', 'NNP'), ('Modi', 'NNP'), ('is', 'VBZ'), ('Prime', 'NNP'), ('Minister', 'NNP'), ('of', 'IN'), ('India', 'NNP'), ('.', '.')]


In [22]:
# BoW Implementation
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: one string per document
documents = [
    "Issac Newton discoverd gravity.",
    "People thought the celestial objects followed different rules than that of every day rules",
    "Redefining this understanding was curicial to development of Physics",
]

vectorizer = CountVectorizer()

# Learn the vocabulary and count term occurrences per document in a single pass
bow_matrix = vectorizer.fit_transform(documents)

# Unique terms found in the corpus
feature_names = vectorizer.get_feature_names_out()

# Dense array view of the sparse count matrix
bow_array = bow_matrix.toarray()

# Show the vocabulary followed by the per-document counts
print("Feature Names (Words): ", feature_names)
print("\nBag of Words Representation:", bow_array, sep="\n")

Feature Names (Words):  ['celestial' 'curicial' 'day' 'development' 'different' 'discoverd'
 'every' 'followed' 'gravity' 'issac' 'newton' 'objects' 'of' 'people'
 'physics' 'redefining' 'rules' 'than' 'that' 'the' 'this' 'thought' 'to'
 'understanding' 'was']

Bag of Words Representation:
[[0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 1 0 1 0 1 1 0 0 0 1 1 1 0 0 2 1 1 1 0 1 0 0 0]
 [0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1 1 1]]


In [26]:
# Tfidf implementation
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: one string per document
documents = [
    "Issac Newton discoverd gravity.",
    "People thought the celestial objects followed different rules than that of every day rules",
    "Redefining this understanding was curicial to development of Physics",
]

vectorizer = TfidfVectorizer()

# Learn the vocabulary and compute the TF-IDF weights for every document
tfidf_matrix = vectorizer.fit_transform(documents)

# Unique terms found in the corpus
feature_names = vectorizer.get_feature_names_out()

# Dense array view of the sparse TF-IDF matrix
tfidf_array = tfidf_matrix.toarray()

print("Feature names:", feature_names)
# Label corrected: it previously read "IF-IDF array:"
print("TF-IDF array:")
print(tfidf_array)

Feature names: ['celestial' 'curicial' 'day' 'development' 'different' 'discoverd'
 'every' 'followed' 'gravity' 'issac' 'newton' 'objects' 'of' 'people'
 'physics' 'redefining' 'rules' 'than' 'that' 'the' 'this' 'thought' 'to'
 'understanding' 'was']
IF-IDF array:
[[0.         0.         0.         0.         0.         0.5
  0.         0.         0.5        0.5        0.5        0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.25336031 0.         0.25336031 0.         0.25336031 0.
  0.25336031 0.25336031 0.         0.         0.         0.25336031
  0.19268705 0.25336031 0.         0.         0.50672062 0.25336031
  0.25336031 0.25336031 0.         0.25336031 0.         0.
  0.        ]
 [0.         0.34142622 0.         0.34142622 0.         0.
  0.         0.         0.         0.         0.         0.
  0.25966344 0.         0.34142622 0.34142622 0.         0.
  0.         0.         0.341

In [32]:
# N-grams
import nltk
from nltk.util import ngrams
from collections import Counter

# Sample text data
text = "Twilight is a mater of disguise who has ten thousand faces."

# Tokenize
tokens = nltk.word_tokenize(text)


def ngrams_with_counts(token_list, n):
    """Return the n-grams of ``token_list`` and how often each one occurs.

    Args:
        token_list: Sequence of tokens to slide the window over.
        n: Window size (1 = unigrams, 2 = bigrams, 3 = trigrams, ...).

    Returns:
        tuple: ``(grams, freq)`` where ``grams`` is the list of n-gram tuples in
        order of appearance and ``freq`` is a ``Counter`` keyed by those tuples.
    """
    grams = list(ngrams(token_list, n))
    return grams, Counter(grams)


# Generate unigrams (1-grams)
unigrams, unigram_freq = ngrams_with_counts(tokens, 1)
print("Unigrams: ")
print(unigrams)
print("\nUnigram Frequencies: ")
print(unigram_freq)

# Generate bigrams (2-grams)
bigrams, bigram_freq = ngrams_with_counts(tokens, 2)
print("Bigrams: ")
print(bigrams)
print("\nBigram Frequencies: ")
print(bigram_freq)

# Generate trigrams (3-grams)
trigrams, trigram_freq = ngrams_with_counts(tokens, 3)
print("Trigrams: ")
print(trigrams)
print("\nTrigram Frequencies: ")
print(trigram_freq)

Unigrams: 
[('Twilight',), ('is',), ('a',), ('mater',), ('of',), ('disguise',), ('who',), ('has',), ('ten',), ('thousand',), ('faces',), ('.',)]

Unigram Frequencies: 
Counter({('Twilight',): 1, ('is',): 1, ('a',): 1, ('mater',): 1, ('of',): 1, ('disguise',): 1, ('who',): 1, ('has',): 1, ('ten',): 1, ('thousand',): 1, ('faces',): 1, ('.',): 1})
Bigrams: 
[('Twilight', 'is'), ('is', 'a'), ('a', 'mater'), ('mater', 'of'), ('of', 'disguise'), ('disguise', 'who'), ('who', 'has'), ('has', 'ten'), ('ten', 'thousand'), ('thousand', 'faces'), ('faces', '.')]

Bigram Frequencies: 
Counter({('Twilight', 'is'): 1, ('is', 'a'): 1, ('a', 'mater'): 1, ('mater', 'of'): 1, ('of', 'disguise'): 1, ('disguise', 'who'): 1, ('who', 'has'): 1, ('has', 'ten'): 1, ('ten', 'thousand'): 1, ('thousand', 'faces'): 1, ('faces', '.'): 1})
Trigrams: 
[('Twilight', 'is', 'a'), ('is', 'a', 'mater'), ('a', 'mater', 'of'), ('mater', 'of', 'disguise'), ('of', 'disguise', 'who'), ('disguise', 'who', 'has'), ('who', 'has',

In [38]:
# Word Embeddings from Word2Vec
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Toy corpus: one string per sentence
text = [
    "Issac Newton discoverd gravity and changed our understanding of physics.",
    "People thought the celestial objects followed different rules than that of every day rules of physics",
    "Redefining this understanding was curcial to development of Physics",
]

# Lowercase each sentence, then split it into word tokens
tokenized_text = [word_tokenize(lowered) for lowered in map(str.lower, text)]

# Fit Word2Vec on the tokenized sentences: 100-dimensional vectors,
# context window of 3, every word kept (min_count=1), sg=0
model = Word2Vec(tokenized_text, vector_size=100, window=3, min_count=1, sg=0)
word_vectors = model.wv

# Look up the learned vector for one word
physics_vector = word_vectors['physics']
print("Word Embeddings for 'physics': ")
print(physics_vector)

# Five nearest neighbours of that word in the embedding space
similar_words = word_vectors.most_similar('physics', topn=5)
print("\nWords most similar to 'physics': ")
print(similar_words)

Word Embeddings for 'physics': 
[-8.6189089e-03  3.6706424e-03  5.1883548e-03  5.7437806e-03
  7.4650729e-03 -6.1647915e-03  1.1066109e-03  6.0496745e-03
 -2.8434854e-03 -6.1756647e-03 -4.1246202e-04 -8.3710104e-03
 -5.5971048e-03  7.1038795e-03  3.3552283e-03  7.2207232e-03
  6.8033212e-03  7.5318124e-03 -3.7981474e-03 -5.6852534e-04
  2.3434735e-03 -4.5170793e-03  8.3869314e-03 -9.8585887e-03
  6.7623998e-03  2.9124599e-03 -4.9319174e-03  4.3944260e-03
 -1.7434919e-03  6.7167045e-03  9.9692075e-03 -4.3728347e-03
 -6.0278189e-04 -5.7023047e-03  3.8443655e-03  2.7906538e-03
  6.8925591e-03  6.1067273e-03  9.5394002e-03  9.2687886e-03
  7.8987256e-03 -6.9909682e-03 -9.1602998e-03 -3.4999350e-04
 -3.0998802e-03  7.8903008e-03  5.9339083e-03 -1.5478234e-03
  1.5130843e-03  1.7931361e-03  7.8171734e-03 -9.5184054e-03
 -2.0762160e-04  3.4672646e-03 -9.4560062e-04  8.3853193e-03
  9.0160174e-03  6.5322290e-03 -7.1871141e-04  7.7075576e-03
 -8.5300328e-03  3.2060805e-03 -4.6301228e-03 -5.0918

In [42]:
# Contextual word embeddings using BERT
from transformers import BertModel, BertTokenizer
import torch

# Pre-trained BERT checkpoint shared by the tokenizer and the model
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Example sentence
sentence = "Issac Newton discoverd gravity and changed our understanding of physics."

# Encode the sentence into a tensor of token ids
input_ids = tokenizer.encode(sentence, return_tensors='pt')

# Run the model without tracking gradients
with torch.no_grad():
    outputs = model(input_ids)
last_hidden_states = outputs.last_hidden_state

# Report the shape of the last hidden states tensor
print("shape of last hidden states: ", last_hidden_states.shape)

# Drop size-1 dimensions and move the embeddings into a numpy array
embeddings = last_hidden_states.squeeze().numpy()

# Recover the token strings so each embedding row can be labelled
tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze())

# Show each token next to the first four components of its contextual embedding
for token, embedding in zip(tokens, embeddings):
    preview = embedding[:4]
    print(f"Token: {token}")
    print(f"Embedding: {preview}...")
    print()

shape of last hidden states:  torch.Size([1, 15, 768])
Token: [CLS]
Embedding: [-0.75372785  0.21267381 -0.46011588  0.3534106 ]...

Token: iss
Embedding: [-1.0019723  -0.09209419  0.03805587 -1.1330551 ]...

Token: ##ac
Embedding: [ 0.04256701  0.7425329  -0.74694717 -0.48725405]...

Token: newton
Embedding: [ 0.3416494 -0.0373348 -0.9118464 -0.6818827]...

Token: discover
Embedding: [-1.1914814   0.9903251  -0.823267   -0.09535353]...

Token: ##d
Embedding: [-0.8980761   0.63520557 -0.4502475  -0.03369765]...

Token: gravity
Embedding: [ 0.0937224   0.56387097 -0.03989023 -0.763481  ]...

Token: and
Embedding: [-6.7916501e-01 -1.2992848e-01 -1.0779214e+00  8.6374022e-04]...

Token: changed
Embedding: [-0.05747977  0.09054674 -0.3289726   0.13091314]...

Token: our
Embedding: [ 0.3833255   0.37291035 -0.17615962  0.41101083]...

Token: understanding
Embedding: [ 0.17283192  1.0845681  -0.3424585  -0.18267825]...

Token: of
Embedding: [-0.19235352  0.9480484   0.10838635 -0.23513459]..

In [48]:
# NER
import spacy

# Load spaCy's pre-trained 'en_core_web_sm' pipeline
nlp = spacy.load('en_core_web_sm')

# Sentence to run named-entity recognition on
sentence = "Narendra Modi serverd for 3 consecutive terms as Prime Minister of India."

# Run the pipeline over the sentence
doc = nlp(sentence)

# List every recognized entity as "<text>:<label>"
print("Named Entities in the sentence: ")
for entity in doc.ents:
    print(entity.text, entity.label_, sep=":")

Named Entities in the sentence: 
Narendra Modi:PERSON
3:CARDINAL
India:GPE


In [77]:
# Sentence Generation using Bigram Model
import nltk
from collections import defaultdict, Counter
import random

# Sample text data (corpus)
corpus = [
    "I ate Sukuna finger",
    "Sukuna can control me",
    "Megumi tried to fight Sukuna",
    "Sukuna is really strong",
    "I fight with Sukuna to get control",
]

# Lowercase and tokenize every sentence
tokenized_corpus = [nltk.word_tokenize(lowered) for lowered in map(str.lower, corpus)]

# Collect bigrams sentence by sentence, so no pair spans a sentence boundary
bigrams = [pair for token_list in tokenized_corpus for pair in nltk.bigrams(token_list)]

# Count how often each second word follows each first word
bigram_freq = defaultdict(Counter)
for first, second in bigrams:
    bigram_freq[first][second] += 1

# Turn the counts into conditional probabilities P(second | first)
bigram_prob = defaultdict(dict)
for first, follower_counts in bigram_freq.items():
    total_count = float(sum(follower_counts.values()))
    bigram_prob[first] = {
        second: count / total_count for second, count in follower_counts.items()
    }

# Function to generate text using the bigram model
def generate_sentence(start_word, num_words=10, model=None):
    """Generate a space-joined word sequence by sampling from a bigram model.

    Starting from ``start_word``, the next word is repeatedly drawn at random
    from the followers of the current word, weighted by their probabilities.
    Generation stops early when the current word has no known followers.

    Args:
        start_word: First word of the sentence; always part of the result.
        num_words: Maximum number of words in the result, counting ``start_word``.
        model: Optional mapping ``{word: {next_word: weight}}`` to sample from.
            When omitted, the module-level ``bigram_prob`` table is used.

    Returns:
        str: The generated words joined by single spaces.
    """
    transitions = bigram_prob if model is None else model
    sentence = [start_word]
    current_word = start_word
    for _ in range(num_words - 1):
        followers = transitions.get(current_word)
        if not followers:
            # Unknown word, or a word that never had a successor: nothing to sample
            break
        current_word = random.choices(list(followers), weights=list(followers.values()))[0]
        sentence.append(current_word)
    return ' '.join(sentence)

# Generate a sentence of at most 10 words starting with "megumi".
# The next word is sampled at random at each step, so the result can differ between runs.
generated_sentence = generate_sentence("megumi", num_words=10)
print("Generated sentence: ", generated_sentence)

Generated sentence:  megumi tried to get control me


In [79]:
# Implementing a simple Transformer model
import torch
import torch.nn as nn
import math

class TransformerModel(nn.Module):
    """Transformer-encoder style sequence classifier.

    Token ids are embedded, summed with sinusoidal positional encodings, passed
    through ``num_layers`` ``TransformerLayer`` blocks, mean-pooled over the
    sequence axis and projected to ``num_classes`` outputs.

    Args:
        input_dim: Vocabulary size (number of rows in the embedding table).
        model_dim: Width of the token embeddings and of every layer.
        num_heads: Attention heads per layer.
        num_layers: Number of stacked ``TransformerLayer`` blocks.
        ffn_dim: Hidden width of each layer's feed-forward network.
        max_seq_len: Accepted for interface compatibility; currently unused,
            because positional encodings are generated for whatever sequence
            length arrives.
        num_classes: Size of the output layer.
    """

    def __init__(self, input_dim, model_dim, num_heads, num_layers, ffn_dim, max_seq_len, num_classes):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, model_dim)
        self.model_dim = model_dim
        self.layers = nn.ModuleList([
            TransformerLayer(model_dim, num_heads, ffn_dim)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(model_dim, num_classes)

    def forward(self, x):
        """Map token ids of shape ``(batch, seq_len)`` to ``(batch, num_classes)``."""
        seq_len = x.size(1)
        embedded = self.embedding(x)
        # Build the encoding on the input's device and in the embedding dtype so
        # the addition also works when the model has been moved or cast
        positional_encoding = self._generate_positional_encoding(seq_len, device=x.device)
        x = embedded + positional_encoding.to(embedded.dtype)
        for layer in self.layers:
            x = layer(x)
        # Mean-pool over the sequence axis before the output projection
        return self.fc_out(x.mean(dim=1))

    def _generate_positional_encoding(self, seq_len, device=None):
        """Return sinusoidal positional encodings of shape ``(1, seq_len, model_dim)``.

        Column ``2k`` holds ``sin(pos / 10000 ** (2k / model_dim))`` and column
        ``2k + 1`` holds the cosine of the *same* angle, so each sin/cos pair
        shares one frequency. An odd ``model_dim`` is supported: the last
        column then carries a sine with no paired cosine.

        Args:
            seq_len: Number of positions to encode.
            device: Optional device on which to create the tensor.
        """
        positions = torch.arange(seq_len, dtype=torch.float32, device=device).unsqueeze(1)
        even_dims = torch.arange(0, self.model_dim, 2, dtype=torch.float32, device=device)
        # 1 / 10000 ** (2k / model_dim), computed in log space
        inv_freq = torch.exp(-math.log(10000.0) * even_dims / self.model_dim)
        angles = positions * inv_freq

        positional_encoding = torch.zeros(seq_len, self.model_dim, device=device)
        positional_encoding[:, 0::2] = torch.sin(angles)
        # For odd model_dim there is one fewer cosine column than sine columns
        positional_encoding[:, 1::2] = torch.cos(angles[:, : self.model_dim // 2])
        return positional_encoding.unsqueeze(0)

class TransformerLayer(nn.Module):
    """One post-norm encoder block: self-attention, then a feed-forward network.

    Each sub-layer is wrapped in a residual connection followed by layer
    normalization. Inputs and outputs have shape ``(batch, seq_len, model_dim)``.

    Args:
        model_dim: Feature width of the input and output.
        num_heads: Number of attention heads.
        ffn_dim: Hidden width of the feed-forward network.
    """

    def __init__(self, model_dim, num_heads, ffn_dim):
        super().__init__()
        # batch_first=True makes attention read its input as (batch, seq, dim).
        # With the default layout the batch axis would be treated as the
        # sequence axis, and tokens would attend across different samples.
        self.multihead_attention = nn.MultiheadAttention(
            embed_dim=model_dim, num_heads=num_heads, batch_first=True
        )
        self.norm1 = nn.LayerNorm(model_dim)
        self.ffn = nn.Sequential(
            nn.Linear(model_dim, ffn_dim),
            nn.ReLU(),
            nn.Linear(ffn_dim, model_dim)
        )
        self.norm2 = nn.LayerNorm(model_dim)

    def forward(self, x):
        """Apply self-attention and the feed-forward network to ``x``."""
        # Self-attention: queries, keys and values all come from x
        attn_output, _ = self.multihead_attention(x, x, x)
        x = self.norm1(x + attn_output)
        ffn_output = self.ffn(x)
        x = self.norm2(x + ffn_output)
        return x

# Example usage: hyperparameters for the demo model
input_dim = 10000  # Vocabulary size
model_dim = 512
num_heads = 8
num_layers = 6
ffn_dim = 2048
max_seq_len = 100
num_classes = 10

# Build the model from the hyperparameters above
model = TransformerModel(
    input_dim=input_dim,
    model_dim=model_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    ffn_dim=ffn_dim,
    max_seq_len=max_seq_len,
    num_classes=num_classes,
)

# Random token ids standing in for a real batch (batch_size=32, sequence_length=50)
batch_size, seq_len = 32, 50
x = torch.randint(low=0, high=input_dim, size=(batch_size, seq_len))

# Single forward pass
output = model(x)
print("Output shape:", output.shape)

Output shape: torch.Size([32, 10])
