<a href="https://colab.research.google.com/github/ericmedlock/Building-Conversational-Generative-AI-Apps-with-Langchain-and-GPT/blob/main/Chapter%2002/Chapter_2_Natural_Language_Processing_(NLP)_Fundamentals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the NLTK release this notebook is pinned to (3.9.1)
! pip install nltk==3.9.1



In [2]:
# Download spaCy's en_core_web_sm model; later cells load it with spacy.load('en_core_web_sm')
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


### Tokenization

In [3]:
import nltk
from nltk.tokenize import word_tokenize
# Fetch the tokenizer data before calling word_tokenize
nltk.download('punkt_tab')

# Sentence used to demonstrate word-level tokenization
text = (
    "Tokenization is the process of breaking text into smaller units, "
    "such as words, phrases, or symbols, known as tokens."
)

# Split the sentence into word and punctuation tokens and display them
tokens = word_tokenize(text)
print(tokens)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'text', 'into', 'smaller', 'units', ',', 'such', 'as', 'words', ',', 'phrases', ',', 'or', 'symbols', ',', 'known', 'as', 'tokens', '.']


### Stopword Removal

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the resources this cell needs: the stopword lists and the tokenizer
# data. NLTK 3.9.x (the version pinned in the first cell) loads 'punkt_tab' for
# word_tokenize, so request that package instead of the legacy 'punkt' one.
nltk.download('stopwords')
nltk.download('punkt_tab')

# Sample text
text = "NLTK is a leading platform for building Python programs to work with human language data."

# Tokenize the text
tokens = word_tokenize(text)

# Remove stopwords; each token is lowercased before the lookup so that
# capitalised stopwords are dropped as well
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Join filtered tokens back into a sentence (punctuation tokens are kept and
# therefore end up space-separated in the result)
filtered_text = ' '.join(filtered_tokens)

print("Original text:")
print(text)
print("\nText after stopword removal:")
print(filtered_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Original text:
NLTK is a leading platform for building Python programs to work with human language data.

Text after stopword removal:
NLTK leading platform building Python programs work human language data .


### NLP Normalization

In [5]:
import re

def normalize_text(text):
    """Lowercase ``text``, strip punctuation, and collapse whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with every character that is neither a word
        character nor whitespace removed, underscores removed, runs of
        whitespace collapsed to a single space, and leading/trailing
        whitespace trimmed.
    """
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation. \w also matches "_", so [^\w\s] on its own would
    # leave underscores behind; the extra alternative removes them too.
    text = re.sub(r'[^\w\s]|_', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Example usage
original_text = "This is a sample text, with punctuation and  extra  spaces !"
normalized_text = normalize_text(original_text)
print("Original text:", original_text)
print("Normalized text:", normalized_text)


Original text: This is a sample text, with punctuation and  extra  spaces !
Normalized text: this is a sample text with punctuation and extra spaces


### Rule-based POS Tagging

In [6]:
import nltk

sentence = "The quick brown fox jumps over the lazy dog."
tokens = nltk.word_tokenize(sentence)


def rule_based_tag(token):
    """Return the tag the hand-written rules assign to a single token."""
    # Rule 1: articles are determiners (checked case-insensitively)
    if token.lower() in ['the', 'a', 'an']:
        return 'DET'
    # Rule 2: anything ending in "s" is labelled NOUN; everything else falls
    # back to NN
    return 'NOUN' if token.endswith('s') else 'NN'


# Apply the rules token by token, keeping the original order
tagged_words = []
for word in tokens:
    tagged_words.append((word, rule_based_tag(word)))

print(tagged_words)


[('The', 'DET'), ('quick', 'NN'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'NOUN'), ('over', 'NN'), ('the', 'DET'), ('lazy', 'NN'), ('dog', 'NN'), ('.', 'NN')]


### Probabilistic POS Tagging

In [8]:
import nltk
# Tokenizer and tagger resources. NLTK 3.9.x (the version pinned in the first
# cell) loads its tokenizer data from 'punkt_tab', so request that package
# rather than the legacy 'punkt' one, matching the Tokenization cell.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')


sentence = "The quick brown fox jumps over the lazy dog."
tokens = nltk.word_tokenize(sentence)

# pos_tag returns a list of (token, tag) pairs
tagged_words = nltk.pos_tag(tokens)
print(tagged_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


### Rule-based NER

In [None]:
import re

def rule_based_ner(text):
    """Label every capitalised word in ``text`` as an ORGANIZATION.

    A word qualifies when it consists of one uppercase ASCII letter followed
    by zero or more lowercase ASCII letters, delimited by word boundaries.

    Returns:
        A list of ``(matched_text, 'ORGANIZATION')`` tuples in order of
        appearance.
    """
    capitalised_word = re.compile(r'\b[A-Z][a-z]*\b')
    return [(hit.group(0), 'ORGANIZATION') for hit in capitalised_word.finditer(text)]


# Run the rule on a sample sentence and show what it picks up
text = "Apple Inc. is headquartered in Cupertino, California."
entities = rule_based_ner(text)
print(entities)


### Machine Learning-based NER

In [None]:
import nltk
# Fetch the tokenizer, tagger, chunker and word-list resources, in the same
# order as before
for resource in (
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker_tab',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(resource)


def ml_based_ner(text):
    """Tokenize and POS-tag ``text``, then return the result of NLTK's ne_chunk on it."""
    return nltk.chunk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))


# Same sample sentence as the rule-based example
text = "Apple Inc. is headquartered in Cupertino, California."
entities = ml_based_ner(text)
print(entities)


### NER evaluation metrics and challenges

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Toy tag sequences: the system output and the reference annotation
predicted_labels = ['O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC']
true_labels = ['O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC']

# Compute the three scores with the same weighted averaging
precision, recall, f1 = (
    metric(true_labels, predicted_labels, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report each score on its own line
for label, value in (("Precision:", precision), ("Recall:", recall), ("F1-score:", f1)):
    print(label, value)


### N-gram Models

In [None]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import reuters

# Download NLTK resources (if not already downloaded). NLTK 3.9.x (the version
# pinned in the first cell) loads tokenizer data from 'punkt_tab', so request
# that package rather than the legacy 'punkt' one.
nltk.download('reuters')
nltk.download('punkt_tab')

# Load and tokenize text data (e.g., from the Reuters corpus)
reuters_text = reuters.raw()
tokens = word_tokenize(reuters_text.lower())

# Define function to generate N-grams of given order
def generate_ngrams(token_list, n):
    """Return the n-grams of ``token_list`` as a list.

    Args:
        token_list: Sequence of tokens to slide the window over.
        n: Order of the n-grams (1 = unigrams, 2 = bigrams, ...).

    Returns:
        A list built from ``nltk.util.ngrams(token_list, n)``.
    """
    return list(ngrams(token_list, n))

# Example usage:
# Unigrams (N=1)
unigrams = generate_ngrams(tokens, 1)

# Bigrams (N=2)
bigrams = generate_ngrams(tokens, 2)

# Trigrams (N=3)
trigrams = generate_ngrams(tokens, 3)

# N-grams (N>3)
# The result is stored under its own name: binding it to `ngrams` would
# replace the imported nltk.util.ngrams function and make every later call to
# generate_ngrams fail with "'list' object is not callable".
n = 4
higher_order_ngrams = generate_ngrams(tokens, n)

# Print example outputs
print("Example Unigrams (N=1):", unigrams[:5])
print("Example Bigrams (N=2):", bigrams[:5])
print("Example Trigrams (N=3):", trigrams[:5])
print(f"Example {n}-grams (N={n}):", higher_order_ngrams[:5])


### Word2Vec

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import reuters
from nltk import sent_tokenize

# Pull the raw Reuters text, split it into sentences, lowercase each sentence
# and break it into word tokens
reuters_text = reuters.raw()
sentences = [
    word_tokenize(lowered_sentence)
    for lowered_sentence in map(str.lower, sent_tokenize(reuters_text))
]

# Fit a Skip-gram (sg=1) Word2Vec model on the tokenized sentences
word2vec_model = Word2Vec(
    sentences,
    sg=1,
    vector_size=100,
    window=5,
    min_count=5,
)

# Look up and display the learned embedding for one word
vector = word2vec_model.wv['finance']
print("Vector representation of 'finance':", vector)


### GloVe

In [None]:
import numpy as np

# Define the path to your downloaded GloVe file (adjust filename and dimensionality)
glove_file = "/content/glove.twitter.27B.25d.txt"

# Create a dictionary mapping each token to its embedding vector
word_vectors = {}

# Read the GloVe file and populate the dictionary
with open(glove_file, encoding="utf8") as file:
  for line in file:
    # Split on a literal space instead of calling split() with no argument:
    # the no-argument form treats every Unicode whitespace character as a
    # separator, so a token that is itself such a character would vanish and
    # the first number would be stored as the "word".
    # NOTE(review): assumes fields are separated by single spaces — confirm
    # against the GloVe file in use.
    line_split = line.rstrip().split(" ")
    if len(line_split) < 2:
      continue  # blank or malformed line: there is no vector to store
    word = line_split[0]
    vector = np.array([float(val) for val in line_split[1:]])
    word_vectors[word] = vector

# Example usage: Find the vector representation of a word
word = "king"
if word in word_vectors:
  word_vector = word_vectors[word]
  print(f"Word: {word}, Vector Shape: {word_vector.shape}")
  # Access individual vector components (example: first 10 elements)
  print(f"First 10 elements: {word_vector[:10]}")
else:
  print(f"Word '{word}' not found in vocabulary")

# Example usage: Calculate word similarity using cosine similarity
word1 = "king"
word2 = "queen"
if word1 in word_vectors and word2 in word_vectors:
  vector1 = word_vectors[word1]
  vector2 = word_vectors[word2]
  # Cosine similarity: dot product divided by the product of the two norms
  similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
  print(f"Similarity between '{word1}' and '{word2}': {similarity:.4f}")
else:
  # Plain string: there is nothing to interpolate in this message
  print("At least one word not found in vocabulary")


### TF-IDF NLP

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short documents to score
corpus = [
    'The cat sat on the mat.',
    'The dog jumped over the fence.',
    'The cat and the dog are best friends.'
]

# Build the vectorizer and compute the document-term TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Vocabulary terms, in the same order as the matrix columns
feature_names = tfidf_vectorizer.get_feature_names_out()

# Walk the matrix one document (row) at a time and list the terms with a
# positive score
for doc_number, row in enumerate(tfidf_matrix.toarray(), start=1):
    print(f"Document {doc_number}:")
    for word, tfidf_score in zip(feature_names, row):
        if tfidf_score > 0:
            print(f"   {word}: {tfidf_score:.4f}")


### Syntax and syntactic analysis in NLP

In [None]:
import spacy

# Load the small English pipeline and run it over a sample sentence
nlp = spacy.load('en_core_web_sm')
text = "The cat sat on the mat."
doc = nlp(text)

# For each token show: its text, its dependency label, the head token's text
# and part of speech, and the token's direct children
for token in doc:
    head = token.head
    print(token.text, token.dep_, head.text, head.pos_, list(token.children))


### Dependency parsing vs. constituency parsing

In [None]:
import spacy

# Load spaCy English model
nlp = spacy.load('en_core_web_sm')

# Sample text for parsing
text = "The cat sat on the mat."

# Perform dependency parsing. The pipeline is run once; the resulting Doc
# carries both the dependency arcs and the noun chunks printed below.
doc_dep = nlp(text)
print("Dependency Parsing:")
for token in doc_dep:
    print(token.text, token.dep_, token.head.text)

# Approximate constituents with spaCy's noun chunks. These are flat base noun
# phrases taken from the same parse, not a full constituency tree, so the
# already-parsed Doc is reused instead of running the pipeline a second time.
doc_const = doc_dep
print("\nConstituency Parsing:")
for chunk in doc_const.noun_chunks:
    print(chunk.text)


### Word embeddings and semantic similarity


In [None]:
import gensim.downloader as api

# Fetch the pre-trained Google News Word2Vec vectors through gensim's downloader
model = api.load("word2vec-google-news-300")

# Nearest neighbours of a query word
word = "car"
similar_words = model.most_similar(word)
print(f"Words similar to '{word}':")
for neighbour, score in similar_words:
    print(f"{neighbour}: {score}")

print("=" * 35)

# Pairwise similarity of "king" against "man" and against "woman"
word1, word2, word3 = "king", "man", "woman"
similarity1_2 = model.similarity(word1, word2)
similarity1_3 = model.similarity(word1, word3)

# Report both comparisons in the same format
for other_word, pair_similarity in ((word2, similarity1_2), (word3, similarity1_3)):
    print(f"Semantic similarity between '{word1}' and '{other_word}': {pair_similarity}")


### Sentiment analysis

In [None]:
! pip install spacy ==3.7.5
! pip install spacytextblob ==5.0.0

In [None]:
# Download en_core_web_sm again after the spaCy install in the previous cell
# (presumably so the model matches the newly installed spaCy version — confirm)
! python -m spacy download en_core_web_sm

In [None]:
# Import necessary libraries
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

# Build the pipeline: the small English model plus the spacytextblob component
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

# Score each sample phrase in turn, printing a divider between the reports
for position, text in enumerate(("good product", "bad product")):
    if position:
        print("=" * 35)

    doc = nlp(text)
    blob = doc._.blob

    # Print the sentiment scores
    print('text:', text)
    print('Polarity:', blob.polarity)
    print('Subjectivity:', blob.subjectivity)
