In [None]:
! pip install nltk==3.9.1



In [None]:
# Fetch spaCy's small English pipeline; later cells load it by name with
# spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

### Tokenization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# Make sure the 'punkt_tab' tokenizer data is available before tokenizing.
nltk.download('punkt_tab')

# Sentence used to demonstrate word-level tokenization.
text = "Tokenization is the process of breaking text into smaller units, such as words, phrases, or symbols, known as tokens."

# Split the sentence into word and punctuation tokens and show the list.
tokens = word_tokenize(text)
print(tokens)


['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'text', 'into', 'smaller', 'units', ',', 'such', 'as', 'words', ',', 'phrases', ',', 'or', 'symbols', ',', 'known', 'as', 'tokens', '.']


### Stopword Removal

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the data this cell relies on: the stopword lists and the
# 'punkt_tab' tokenizer tables. The NLTK release pinned at the top of the
# notebook (3.9.1) loads 'punkt_tab' in word_tokenize; the legacy 'punkt'
# package is no longer what the tokenizer reads.
nltk.download('stopwords')
nltk.download('punkt_tab')

# Sample text
text = "NLTK is a leading platform for building Python programs to work with human language data."

# Tokenize the text
tokens = word_tokenize(text)

# Remove stopwords. The comparison is done on the lowercased token so that
# capitalised stopwords are dropped too; a set keeps each lookup O(1).
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Join filtered tokens back into a sentence (punctuation tokens are kept, so
# they end up space-separated from the preceding word).
filtered_text = ' '.join(filtered_tokens)

print("Original text:")
print(text)
print("\nText after stopword removal:")
print(filtered_text)


Original text:
NLTK is a leading platform for building Python programs to work with human language data.

Text after stopword removal:
NLTK leading platform building Python programs work human language data .


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### NLP Normalization

In [None]:
import re

def normalize_text(text: str) -> str:
    """Lowercase ``text``, strip punctuation, and collapse whitespace.

    Args:
        text: The raw input string.

    Returns:
        The lowercased string with every character that is neither
        alphanumeric nor whitespace removed, runs of whitespace collapsed to
        a single space, and leading/trailing whitespace trimmed.
    """
    lowered = text.lower()

    # `\w` also matches the underscore, so `[^\w\s]` on its own would leave
    # '_' in place; list it explicitly so all punctuation is removed.
    without_punctuation = re.sub(r'[^\w\s]|_', '', lowered)

    # Removing punctuation can leave doubled spaces behind; squeeze them last.
    return re.sub(r'\s+', ' ', without_punctuation).strip()

# Run the normalizer on a sentence with punctuation and irregular spacing,
# then show the input and the result side by side.
original_text = "This is a sample text, with punctuation and  extra  spaces !"
normalized_text = normalize_text(original_text)
for caption, shown in (("Original text:", original_text), ("Normalized text:", normalized_text)):
    print(caption, shown)


Original text: This is a sample text, with punctuation and  extra  spaces !
Normalized text: this is a sample text with punctuation and extra spaces


### Rule-based POS Tagging

In [None]:
import nltk

sentence = "The quick brown fox jumps over the lazy dog."
tokens = nltk.word_tokenize(sentence)


def _rule_tag(word):
    """Return the tag the hand-written rules assign to a single token.

    Articles are tagged 'DET', any other token ending in 's' is tagged
    'NOUN', and everything else falls back to 'NN'.
    """
    if word.lower() in ['the', 'a', 'an']:
        return 'DET'
    return 'NOUN' if word.endswith('s') else 'NN'


# Pair every token with the tag chosen by the rules above.
tagged_words = [(word, _rule_tag(word)) for word in tokens]

print(tagged_words)


[('The', 'DET'), ('quick', 'NN'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'NOUN'), ('over', 'NN'), ('the', 'DET'), ('lazy', 'NN'), ('dog', 'NN'), ('.', 'NN')]


### Probabilistic POS Tagging

In [None]:
import nltk
# Download the resources that the pinned NLTK release (3.9.1) actually loads:
# word_tokenize reads 'punkt_tab' and pos_tag reads
# 'averaged_perceptron_tagger_eng'. Fetching only the legacy 'punkt' /
# 'averaged_perceptron_tagger' packages leaves this cell failing with a
# LookupError on a fresh kernel.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

sentence = "The quick brown fox jumps over the lazy dog."
tokens = nltk.word_tokenize(sentence)

# Tag each token with NLTK's pre-trained part-of-speech tagger.
tagged_words = nltk.pos_tag(tokens)
print(tagged_words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


### Rule-based NER

In [None]:
import re

def rule_based_ner(text):
    """Label every capitalised word in ``text`` as an organisation.

    A "capitalised word" is one uppercase ASCII letter followed by zero or
    more lowercase ASCII letters, delimited by word boundaries. Returns a
    list of ``(word, 'ORGANIZATION')`` tuples in order of appearance.
    """
    capitalised_word = re.compile(r'\b[A-Z][a-z]*\b')
    return [(hit.group(0), 'ORGANIZATION') for hit in capitalised_word.finditer(text)]

# Apply the capitalised-word heuristic to an example sentence and print the
# (word, label) pairs it produces.
text = "Apple Inc. is headquartered in Cupertino, California."
entities = rule_based_ner(text)
print(entities)


[('Apple', 'ORGANIZATION'), ('Inc', 'ORGANIZATION'), ('Cupertino', 'ORGANIZATION'), ('California', 'ORGANIZATION')]


### Machine Learning-based NER

In [None]:
import nltk
# Fetch, in this order, every NLTK package the NER example asks for.
for resource_name in (
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker_tab',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(resource_name)

def ml_based_ner(text):
    """Run NLTK's tokenizer, POS tagger and named-entity chunker on ``text``.

    Returns whatever ``nltk.chunk.ne_chunk`` produces for the POS-tagged
    tokens of ``text``.
    """
    pos_tagged = nltk.pos_tag(nltk.word_tokenize(text))
    return nltk.chunk.ne_chunk(pos_tagged)

# Run the NLTK-based pipeline on the same example sentence used for the
# rule-based version and print the resulting chunk structure.
text = "Apple Inc. is headquartered in Cupertino, California."
entities = ml_based_ner(text)
print(entities)


(S
  (PERSON Apple/NNP)
  (ORGANIZATION Inc./NNP)
  is/VBZ
  headquartered/VBN
  in/IN
  (GPE Cupertino/NNP)
  ,/,
  (GPE California/NNP)
  ./.)


### NER evaluation metrics and challenges

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Per-token system output and gold labels in B-/I-/O notation. The two lists
# are identical here, so every score below comes out as 1.0.
predicted_labels = ['O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC']
true_labels = ['O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC']

# Weighted-average precision, recall and F1, computed by applying each
# scikit-learn scorer to the same (gold, predicted) pair.
precision, recall, f1 = (
    scorer(true_labels, predicted_labels, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

for caption, score in (("Precision:", precision), ("Recall:", recall), ("F1-score:", f1)):
    print(caption, score)


Precision: 1.0
Recall: 1.0
F1-score: 1.0


### N-gram Models

In [None]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import reuters

# Download NLTK resources (if not already downloaded). word_tokenize in the
# NLTK release pinned by this notebook (3.9.1) reads 'punkt_tab', not the
# legacy 'punkt' package, so that is the tokenizer resource requested here.
nltk.download('reuters')
nltk.download('punkt_tab')

# Load the raw Reuters corpus, lowercase it, and split it into word tokens.
reuters_text = reuters.raw()
tokens = word_tokenize(reuters_text.lower())

def generate_ngrams(token_list, n):
    """Return all consecutive ``n``-token tuples from ``token_list`` as a list."""
    return [*ngrams(token_list, n)]

# Example usage:
# Unigrams (N=1)
unigrams = generate_ngrams(tokens, 1)

# Bigrams (N=2)
bigrams = generate_ngrams(tokens, 2)

# Trigrams (N=3)
trigrams = generate_ngrams(tokens, 3)

# N-grams (N>3)
# The result gets its own name on purpose: binding it to `ngrams` would
# overwrite the nltk.util.ngrams function that generate_ngrams calls, and any
# later call to generate_ngrams would then fail with
# "TypeError: 'list' object is not callable".
n = 4
n_grams = generate_ngrams(tokens, n)

# Print example outputs (first five of each order)
print("Example Unigrams (N=1):", unigrams[:5])
print("Example Bigrams (N=2):", bigrams[:5])
print("Example Trigrams (N=3):", trigrams[:5])
print(f"Example {n}-grams (N={n}):", n_grams[:5])


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Example Unigrams (N=1): [('asian',), ('exporters',), ('fear',), ('damage',), ('from',)]
Example Bigrams (N=2): [('asian', 'exporters'), ('exporters', 'fear'), ('fear', 'damage'), ('damage', 'from'), ('from', 'u.s.-japan')]
Example Trigrams (N=3): [('asian', 'exporters', 'fear'), ('exporters', 'fear', 'damage'), ('fear', 'damage', 'from'), ('damage', 'from', 'u.s.-japan'), ('from', 'u.s.-japan', 'rift')]
Example 4-grams (N=4): [('asian', 'exporters', 'fear', 'damage'), ('exporters', 'fear', 'damage', 'from'), ('fear', 'damage', 'from', 'u.s.-japan'), ('damage', 'from', 'u.s.-japan', 'rift'), ('from', 'u.s.-japan', 'rift', 'mounting')]


### Word2Vec

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import reuters
from nltk import sent_tokenize

# Read the raw Reuters corpus, split it into sentences, and turn each
# sentence into a list of lowercased word tokens.
reuters_text = reuters.raw()
sentences = [
    word_tokenize(sentence.lower())
    for sentence in sent_tokenize(reuters_text)
]

# Fit a skip-gram (sg=1) Word2Vec model with 100-dimensional vectors, a
# context window of 5, ignoring words seen fewer than 5 times.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=5,
    sg=1,
)

# Look up and display the learned embedding for one word.
vector = word2vec_model.wv['finance']
print("Vector representation of 'finance':", vector)


Vector representation of 'finance': [ 0.00555959 -0.13418159 -0.07054096  0.2715022  -0.46067762 -0.57456183
  0.36883485  0.07910629 -0.49893373 -0.45637354 -0.10879581  0.01425171
 -0.12155091  0.49947003 -0.20474729 -0.02343808  0.20650081  0.07367455
 -0.01277893 -0.10894115 -0.17248821 -0.03425761  0.31837705 -0.3576945
 -0.57092845 -0.21395148  0.19104598 -0.21550281 -0.6066371   0.08843739
 -0.05022264  0.38166136  0.37446606 -0.359757   -0.22131883  0.11892209
 -0.03559355 -0.42578197 -1.0066364   0.2839204   0.23510082 -0.23057513
 -0.3144356   0.25781754  0.7809881  -0.05510634  0.7119717  -0.09271256
  0.00404422  0.38225052  0.03035207 -0.25748873 -0.0915408  -0.14833567
 -0.29154584  0.25353318 -0.04460439  0.2662932  -0.16877227 -0.12203691
  0.26362306  0.1239363   0.5448731  -0.06204756 -0.4014762   0.59350646
 -0.0728      0.09463657  0.34893999  0.5589005   0.17783362  0.22128652
 -0.14458849  0.05690804  0.4040991  -0.09201758 -0.17391795  0.15579928
 -0.01704998  0.

### GloVe

In [None]:
import numpy as np

# Define the path to your downloaded GloVe file (adjust filename and dimensionality)
glove_file = "/content/glove.twitter.27B.25d.txt"


def load_glove_vectors(path):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Each non-empty line is expected to hold a token followed by its vector
    components, separated by single spaces.

    Args:
        path: Location of the GloVe ``.txt`` file (UTF-8 encoded).

    Returns:
        dict mapping each token (str) to a 1-D float NumPy array.
    """
    vectors = {}
    with open(path, encoding="utf8") as file:
        for line in file:
            # Split on the literal space only. A bare split() also splits on
            # Unicode whitespace, so a token that is itself such a character
            # would vanish and the first number would be taken as the token.
            fields = line.rstrip("\r\n ").split(" ")
            if len(fields) < 2:
                # Blank or value-less line: nothing to store.
                continue
            vectors[fields[0]] = np.asarray(fields[1:], dtype=float)
    return vectors


# Create a dictionary of word vectors from the GloVe file
word_vectors = load_glove_vectors(glove_file)

# Example usage: Find the vector representation of a word
word = "king"
if word in word_vectors:
  word_vector = word_vectors[word]
  print(f"Word: {word}, Vector Shape: {word_vector.shape}")
  # Access individual vector components (example: first 10 elements)
  print(f"First 10 elements: {word_vector[:10]}")
else:
  print(f"Word '{word}' not found in vocabulary")

# Example usage: Calculate word similarity using cosine similarity
# (dot product divided by the product of the two vector norms)
word1 = "king"
word2 = "queen"
if word1 in word_vectors and word2 in word_vectors:
  vector1 = word_vectors[word1]
  vector2 = word_vectors[word2]
  similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
  print(f"Similarity between '{word1}' and '{word2}': {similarity:.4f}")
else:
  print(f"At least one word not found in vocabulary")


Word: king, Vector Shape: (25,)
First 10 elements: [-0.74501 -0.11992  0.37329  0.36847 -0.4472  -0.2288   0.70118  0.82872
  0.39486 -0.58347]
Similarity between 'king' and 'queen': 0.9202


### TF-IDF NLP

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short documents to score.
corpus = [
    'The cat sat on the mat.',
    'The dog jumped over the fence.',
    'The cat and the dog are best friends.'
]

# Learn the vocabulary and compute the TF-IDF matrix (one row per document,
# one column per vocabulary term) in a single fit_transform call.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Column index -> term lookup.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Walk the matrix row by row and list only the terms with a positive score.
for i, doc_scores in enumerate(tfidf_matrix.toarray()):
    print(f"Document {i+1}:")
    for word, tfidf_score in zip(feature_names, doc_scores):
        if tfidf_score > 0:
            print(f"   {word}: {tfidf_score:.4f}")


Document 1:
   cat: 0.3410
   mat: 0.4484
   on: 0.4484
   sat: 0.4484
   the: 0.5297
Document 2:
   dog: 0.3410
   fence: 0.4484
   jumped: 0.4484
   over: 0.4484
   the: 0.5297
Document 3:
   and: 0.3907
   are: 0.3907
   best: 0.3907
   cat: 0.2971
   dog: 0.2971
   friends: 0.3907
   the: 0.4615


### Syntax and syntactic analysis in NLP

In [None]:
import spacy

# Load the small English spaCy pipeline and run it over one sentence.
nlp = spacy.load('en_core_web_sm')
text = "The cat sat on the mat."
doc = nlp(text)

# For every token show: its text, its dependency label, the text and
# part-of-speech of its head, and the list of its direct children.
for token in doc:
    head = token.head
    print(token.text, token.dep_, head.text, head.pos_, list(token.children))


The det cat NOUN []
cat nsubj sat VERB [The]
sat ROOT sat VERB [cat, on, .]
on prep sat VERB [mat]
the det mat NOUN []
mat pobj on ADP [the]
. punct sat VERB []


### Dependency parsing vs. constituency parsing

In [None]:
import spacy

# Load spaCy English model
nlp = spacy.load('en_core_web_sm')

# Sample text for parsing
text = "The cat sat on the mat."

# Parse the sentence once and print each token with its dependency label and
# the text of its head.
doc_dep = nlp(text)
print("Dependency Parsing:")
for token in doc_dep:
    print(token.text, token.dep_, token.head.text)

# NOTE: this is not a real constituency (phrase-structure) parse. spaCy's
# noun chunks are flat base noun phrases, shown here only as a rough stand-in
# for constituents. They come from the Doc parsed above, so the text is not
# run through the pipeline a second time.
doc_const = doc_dep
print("\nConstituency Parsing:")
for chunk in doc_const.noun_chunks:
    print(chunk.text)


Dependency Parsing:
The det cat
cat nsubj sat
sat ROOT sat
on prep sat
the det mat
mat pobj on
. punct sat

Constituency Parsing:
The cat
the mat


### Word embeddings and semantic similarity


In [None]:
import gensim.downloader as api

# Fetch the pre-trained "word2vec-google-news-300" vectors through gensim's
# downloader API.
model = api.load("word2vec-google-news-300")

# Nearest neighbours: list the words most similar to the query word together
# with their similarity scores.
word = "car"
similar_words = model.most_similar(word)
print(f"Words similar to '{word}':")
for similar_word, similarity_score in similar_words:
    print(f"{similar_word}: {similarity_score}")

print("=" * 35)

# Pairwise similarity: compare one word against two others.
word1, word2, word3 = "king", "man", "woman"
similarity1_2 = model.similarity(word1, word2)
similarity1_3 = model.similarity(word1, word3)

print(f"Semantic similarity between '{word1}' and '{word2}': {similarity1_2}")
print(f"Semantic similarity between '{word1}' and '{word3}': {similarity1_3}")


Words similar to 'car':
vehicle: 0.7821096181869507
cars: 0.7423831224441528
SUV: 0.7160962224006653
minivan: 0.6907036900520325
truck: 0.6735789775848389
Car: 0.6677608489990234
Ford_Focus: 0.667320191860199
Honda_Civic: 0.6626849174499512
Jeep: 0.651133120059967
pickup_truck: 0.6441438794136047
Semantic similarity between 'king' and 'man': 0.22942672669887543
Semantic similarity between 'king' and 'woman': 0.1284797340631485


### Sentiment analysis

In [None]:
! pip install spacy ==3.7.5
! pip install spacytextblob ==5.0.0

In [None]:
# Download the 'en_core_web_sm' pipeline that the following cell loads with
# spacy.load.
! python -m spacy download en_core_web_sm

In [None]:
# Import necessary libraries
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

# Build the pipeline: the small English model plus the 'spacytextblob'
# component, which exposes TextBlob results under doc._.blob.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

# Score each sample phrase in turn, printing a separator line between them.
for position, text in enumerate(("good product", "bad product")):
    if position:
        print("=" * 35)
    doc = nlp(text)
    print('text:', text)
    print('Polarity:', doc._.blob.polarity)
    print('Subjectivity:', doc._.blob.subjectivity)


text: good product
Polarity: 0.7
Subjectivity: 0.6000000000000001
text: bad product
Polarity: -0.6999999999999998
Subjectivity: 0.6666666666666666
