<a href="https://colab.research.google.com/github/d0k7/Natural-Language-Processing-Lab-Code/blob/main/Natural_Language_Processing_Lab_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Convert the text into tokens**

In [None]:
# Import the necessary library
import nltk
nltk.download('punkt')  # Download the punkt tokenizer

# Function to convert text into tokens
def text_to_tokens(text):
    # Tokenize the text using the nltk word tokenizer
    tokens = nltk.word_tokenize(text)

    return tokens

# Example text
example_text = "Convert this text into tokens using Python."

# Call the function to get tokens
result_tokens = text_to_tokens(example_text)

# Print the original text
print("Original Text:")
print(example_text)

# Print the resulting tokens
print("\nTokens:")
print(result_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Original Text:
Convert this text into tokens using Python.

Tokens:
['Convert', 'this', 'text', 'into', 'tokens', 'using', 'Python', '.']


**2. Find the word frequency**

In [None]:
from collections import Counter

# Function to find word frequency
def word_frequency(tokens):
    # Convert all words to lowercase for case-insensitive counting
    lowercase_tokens = [word.lower() for word in tokens]

    # Use Counter to count the occurrences of each word in the list of lowercase tokens
    word_freq = Counter(lowercase_tokens)

    return word_freq

# Updated example tokens
example_tokens = ['Convert', 'this', 'text', 'into', 'tokens', 'using', 'Python', 'python', '.']

# Call the function to get word frequency
result_word_freq = word_frequency(example_tokens)

# Print the word frequency
print("Word Frequency:")
print(result_word_freq)


Word Frequency:
Counter({'python': 2, 'convert': 1, 'this': 1, 'text': 1, 'into': 1, 'tokens': 1, 'using': 1, '.': 1})


**3. Demonstrate a bigram language model**

In [None]:
from nltk import bigrams
from collections import defaultdict
import random

# Function to create a bigram language model
def create_bigram_model(tokens):
    bigram_model = defaultdict(list)

    # Generate bigrams from the list of tokens
    bigrams_list = list(bigrams(tokens))

    # Populate the bigram model with the next word for each current word
    for prev_word, next_word in bigrams_list:
        bigram_model[prev_word].append(next_word)

    return bigram_model

# Example tokens
example_tokens = ['Convert', 'this', 'text', 'into', 'tokens', 'using', 'Python', 'python', '.']

# Create a bigram language model
bigram_model = create_bigram_model(example_tokens)

# Function to generate text using the bigram model
def generate_text(bigram_model, start_word, length=5):
    generated_text = [start_word]

    # Generate text based on the bigram model
    for _ in range(length - 1):
        current_word = generated_text[-1]

        # Choose the next word randomly from the options in the bigram model
        next_word_options = bigram_model[current_word]
        if next_word_options:
            next_word = random.choice(next_word_options)
            generated_text.append(next_word)
        else:
            break

    return generated_text

# Generate text starting with a given word
start_word = 'Convert'
generated_text = generate_text(bigram_model, start_word, length=8)

# Print the generated text
print("Generated Text:")
print(" ".join(generated_text))


Generated Text:
Convert this text into tokens using Python python


**4. Demonstrate a trigram language model**

In [None]:
from nltk import trigrams
from collections import defaultdict
import random

# Function to create a trigram language model
def create_trigram_model(tokens):
    trigram_model = defaultdict(list)

    # Generate trigrams from the list of tokens
    trigrams_list = list(trigrams(tokens))

    # Populate the trigram model with the next word for each pair of current words
    for word1, word2, next_word in trigrams_list:
        trigram_model[(word1, word2)].append(next_word)

    return trigram_model

# Example tokens
example_tokens = ['Convert', 'this', 'text', 'into', 'tokens', 'using', 'Python', 'python', '.']

# Create a trigram language model
trigram_model = create_trigram_model(example_tokens)

# Function to generate text using the trigram model
def generate_text(trigram_model, start_words, length=5):
    generated_text = list(start_words)

    # Generate text based on the trigram model
    for _ in range(length - 2):
        current_words = tuple(generated_text[-2:])

        # Choose the next word randomly from the options in the trigram model
        next_word_options = trigram_model[current_words]
        if next_word_options:
            next_word = random.choice(next_word_options)
            generated_text.append(next_word)
        else:
            break

    return generated_text

# Generate text starting with two given words
start_words = ('Convert', 'this')
generated_text = generate_text(trigram_model, start_words, length=8)

# Print the generated text
print("Generated Text:")
print(" ".join(generated_text))


Generated Text:
Convert this text into tokens using Python python


**5. Generate regular expression for a given text**

In [None]:
import re

# Given text containing email addresses
text_with_emails = "Contact us at john.doe@example.com or jane.smith@email.com for more information."

# Regular expression for matching email addresses
email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'

# Find all matches in the text
matches = re.findall(email_regex, text_with_emails)

# Print the matches
print("Email Addresses:")
print(matches)


Email Addresses:
['john.doe@example.com', 'jane.smith@email.com']


**6. Perform Lemmatization**

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')  # Download the punkt tokenizer
nltk.download('wordnet')  # Download the WordNet database

# Function to perform lemmatization on a list of tokens
def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return lemmatized_tokens

# Example text
example_text = "Convert this text into tokens and perform lemmatization on each token."

# Tokenize the text
tokens = word_tokenize(example_text)

# Perform lemmatization on the tokens
lemmatized_tokens = lemmatize_tokens(tokens)

# Print the original tokens and the lemmatized tokens
print("Original Tokens:")
print(tokens)
print("\nLemmatized Tokens:")
print(lemmatized_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


Original Tokens:
['Convert', 'this', 'text', 'into', 'tokens', 'and', 'perform', 'lemmatization', 'on', 'each', 'token', '.']

Lemmatized Tokens:
['Convert', 'this', 'text', 'into', 'token', 'and', 'perform', 'lemmatization', 'on', 'each', 'token', '.']


**7. Perform Stemming**

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')  # Download the punkt tokenizer

# Function to perform stemming on a list of tokens
def stem_tokens(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    return stemmed_tokens

# Example text
example_text = "Convert this text into tokens and perform stemming on each token."

# Tokenize the text
tokens = word_tokenize(example_text)

# Perform stemming on the tokens
stemmed_tokens = stem_tokens(tokens)

# Print the original tokens and the stemmed tokens
print("Original Tokens:")
print(tokens)
print("\nStemmed Tokens:")
print(stemmed_tokens)


Original Tokens:
['Convert', 'this', 'text', 'into', 'tokens', 'and', 'perform', 'stemming', 'on', 'each', 'token', '.']

Stemmed Tokens:
['convert', 'thi', 'text', 'into', 'token', 'and', 'perform', 'stem', 'on', 'each', 'token', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**8. Identify parts-of Speech using Penn Treebank tag set.**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')  # Download the punkt tokenizer
nltk.download('averaged_perceptron_tagger')  # Download the tagger model

# Example text
example_text = "Identify parts of speech using the Penn Treebank tag set."

# Tokenize the text
tokens = word_tokenize(example_text)

# Perform part-of-speech tagging using the Penn Treebank tag set
pos_tags = nltk.pos_tag(tokens)

# Print the original tokens and their part-of-speech tags
print("Original Tokens:")
print(tokens)
print("\nPart-of-Speech Tags (Penn Treebank):")
print(pos_tags)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...


Original Tokens:
['Identify', 'parts', 'of', 'speech', 'using', 'the', 'Penn', 'Treebank', 'tag', 'set', '.']

Part-of-Speech Tags (Penn Treebank):
[('Identify', 'NNP'), ('parts', 'NNS'), ('of', 'IN'), ('speech', 'NN'), ('using', 'VBG'), ('the', 'DT'), ('Penn', 'NNP'), ('Treebank', 'NNP'), ('tag', 'NN'), ('set', 'NN'), ('.', '.')]


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


**9. Implement HMM for POS tagging**

In [None]:
import nltk
from nltk.corpus import brown
from nltk.tag import hmm

# Download the Brown corpus
nltk.download('brown')

# Get the tagged sentences from the Brown corpus
tagged_sentences = brown.tagged_sents(categories='news')

# Split the data into training and testing sets
train_size = int(0.8 * len(tagged_sentences))
train_data, test_data = tagged_sentences[:train_size], tagged_sentences[train_size:]

# Train the HMM POS tagger
hmm_tagger = hmm.HiddenMarkovModelTrainer().train(train_data)

# Evaluate the performance on the test set
accuracy = hmm_tagger.evaluate(test_data)
print(f"HMM POS Tagger Accuracy: {accuracy:.2%}")

# Example of tagging a new sequence
new_sequence = "Implement a Hidden Markov Model for Part-of-Speech tagging."
tokenized_sequence = nltk.word_tokenize(new_sequence)
predicted_tags = hmm_tagger.tag(tokenized_sequence)

# Print the predicted POS tags for the new sequence
print("\nPredicted POS Tags:")
print(predicted_tags)


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  accuracy = hmm_tagger.evaluate(test_data)


HMM POS Tagger Accuracy: 28.96%

Predicted POS Tags:
[('Implement', 'AT'), ('a', 'AT'), ('Hidden', 'AT'), ('Markov', 'AT'), ('Model', 'AT'), ('for', 'AT'), ('Part-of-Speech', 'AT'), ('tagging', 'AT'), ('.', 'AT')]


**10. Build a Chunker**

In [None]:
import nltk
from nltk.corpus import treebank_chunk

# Download the 'treebank' corpus
nltk.download('treebank')

# Get the training data from the Treebank chunk corpus
train_data = treebank_chunk.chunked_sents()

# Define a simple NP chunking grammar using regular expressions
chunking_grammar = r"""
    NP: {<DT|JJ|NN.*>+}          # Chunk sequences of determiner, adjective, and noun
"""

# Create a chunk parser using the defined grammar
chunk_parser = nltk.RegexpParser(chunking_grammar)

# Example sentence
sentence = "The quick brown fox jumps over the lazy dog."

# Tokenize the sentence
tokens = nltk.word_tokenize(sentence)

# POS tagging
pos_tags = nltk.pos_tag(tokens)

# Perform chunking
tree = chunk_parser.parse(pos_tags)

# Print the resulting tree in a text-based format
print("Chunked Tree:")
print(tree)


Chunked Tree:
(S
  (NP The/DT quick/JJ brown/NN fox/NN)
  jumps/VBZ
  over/IN
  (NP the/DT lazy/JJ dog/NN)
  ./.)


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


**11. Find the synonym of a word using WordNet**

In [None]:
from nltk.corpus import wordnet

# Function to get synonyms of a word using WordNet
def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    return set(synonyms)

# Example word
word = "happy"

# Get synonyms of the word
synonyms = get_synonyms(word)

# Print the synonyms
print(f"Synonyms of '{word}':")
print(synonyms)


Synonyms of 'happy':
{'happy', 'well-chosen', 'glad', 'felicitous'}


**12. Find the antonym of a word**

In [None]:
from nltk.corpus import wordnet

# Function to get antonyms of a word using WordNet
def get_antonyms(word):
    antonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.antonyms():
                antonyms.append(lemma.antonyms()[0].name())
    return set(antonyms)

# Example word
word = "happy"

# Get antonyms of the word
antonyms = get_antonyms(word)

# Print the antonyms
print(f"Antonyms of '{word}':")
print(antonyms)


Antonyms of 'happy':
{'unhappy'}


**13. Implement semantic role labeling to identify named entities**

In [None]:
#!pip install -U spacy
#!python -m spacy download en_core_web_sm

import spacy

# Load the English spaCy model
nlp = spacy.load("en_core_web_sm")

# Example sentence
sentence = "Apple is considering opening a new store in Mumbai."

# Process the sentence with spaCy
doc = nlp(sentence)

# Print named entities
print("Named Entities:")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")

# Print semantic roles
print("\nSemantic Roles:")
for token in doc:
    print(f"{token.text}: {token.dep_} -> {token.head.text}")


Named Entities:
Apple: ORG
Mumbai: GPE

Semantic Roles:
Apple: nsubj -> considering
is: aux -> considering
considering: ROOT -> considering
opening: xcomp -> considering
a: det -> store
new: amod -> store
store: dobj -> opening
in: prep -> opening
Mumbai: pobj -> in
.: punct -> considering


**14. Resolve the ambiguity**

In [None]:
import nltk
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize

# Download the WordNet data
nltk.download('wordnet')

# Example sentence with ambiguity
sentence = "I went to the bank to deposit some money."

# Tokenize the sentence
tokens = word_tokenize(sentence)

# Perform word sense disambiguation using Lesk algorithm
ambiguous_word = "bank"
synset = lesk(tokens, ambiguous_word)

# Print the sense and definition
if synset:
    print(f"Ambiguous Word: {ambiguous_word}")
    print(f"Selected Sense: {synset.name()}")
    print(f"Definition: {synset.definition()}")
else:
    print(f"No suitable sense found for the word '{ambiguous_word}'.")


Ambiguous Word: bank
Selected Sense: savings_bank.n.02
Definition: a container (usually with a slot in the top) for keeping money at home


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**15. Implement RNN for sequence labeling**

In [None]:
!pip install tensorflow numpy




In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Generate dummy data
# Each sentence is represented as a sequence of word indices, and each label is represented as a sequence of label indices
sentences = [
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9],
    [10, 11, 12, 13, 14, 15],
]

labels = [
    [0, 1, 0, 1, 2],  # 0: O, 1: B-entity, 2: I-entity
    [0, 1, 0, 1],
    [0, 1, 2, 0, 1, 0],
]

# Pad sequences to have the same length
max_len = max(len(seq) for seq in sentences)
sentences_padded = tf.keras.preprocessing.sequence.pad_sequences(sentences, padding='post', maxlen=max_len)
labels_padded = tf.keras.preprocessing.sequence.pad_sequences(labels, padding='post', maxlen=max_len)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=16, output_dim=8, input_length=max_len))  # Adjust input_dim based on your vocabulary size
model.add(SimpleRNN(16, return_sequences=True))
model.add(Dense(3, activation='softmax'))  # Assuming 3 classes (O, B-entity, I-entity)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(sentences_padded, labels_padded, epochs=10, batch_size=1)

# Make predictions (you would typically use a validation set or test set)
predictions = model.predict(sentences_padded)

# Print predictions
for i in range(len(sentences)):
    print(f"Sentence: {sentences[i]}")
    print(f"Predicted Labels: {np.argmax(predictions[i], axis=-1)}")
    print(f"True Labels: {labels[i]}\n")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Sentence: [1, 2, 3, 4, 5]
Predicted Labels: [0 0 0 0 0 0]
True Labels: [0, 1, 0, 1, 2]

Sentence: [6, 7, 8, 9]
Predicted Labels: [0 0 0 0 0 0]
True Labels: [0, 1, 0, 1]

Sentence: [10, 11, 12, 13, 14, 15]
Predicted Labels: [0 0 0 0 0 0]
True Labels: [0, 1, 2, 0, 1, 0]



**16. Implement POS tagging using LSTM**

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Dummy data
sentences = [
    "The cat sat on the mat",
    "I love natural language processing",
    "POS tagging with LSTM is interesting",
]

pos_tags = [
    ["DT", "NN", "VBD", "IN", "DT", "NN"],
    ["PRP", "VBP", "JJ", "NN", "NN", "VBG"],
    ["NN", "VBG", "IN", "NN", "VBZ", "JJ"],
]

# Tokenize words and POS tags
tokenizer_words = Tokenizer(oov_token="<OOV>")
tokenizer_pos = Tokenizer()

tokenizer_words.fit_on_texts(sentences)
tokenizer_pos.fit_on_texts(pos_tags)

vocab_size_words = len(tokenizer_words.word_index) + 1
vocab_size_pos = len(tokenizer_pos.word_index) + 1

# Convert sentences and POS tags to sequences
sequences_words = tokenizer_words.texts_to_sequences(sentences)
sequences_pos = tokenizer_pos.texts_to_sequences(pos_tags)

# Pad sequences to have the same length
max_len = max(len(seq) for seq in sequences_words)
padded_sequences_words = pad_sequences(sequences_words, padding='post', maxlen=max_len)
padded_sequences_pos = pad_sequences(sequences_pos, padding='post', maxlen=max_len)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences_words, padded_sequences_pos, test_size=0.2, random_state=42)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size_words, output_dim=8, input_length=max_len, mask_zero=True))
model.add(LSTM(16, return_sequences=True))
model.add(Dense(vocab_size_pos, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=1, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Make predictions on new sentences
new_sentences = ["The dog barked loudly", "I enjoy reading books"]
new_sequences = tokenizer_words.texts_to_sequences(new_sentences)
new_padded_sequences = pad_sequences(new_sequences, padding='post', maxlen=max_len)
predictions = model.predict(new_padded_sequences)

# Print predictions
for i, sentence in enumerate(new_sentences):
    predicted_pos_tags = [tokenizer_pos.index_word[np.argmax(pred)] for pred in predictions[i]]
    print(f"Sentence: {sentence}")
    print(f"Predicted POS Tags: {predicted_pos_tags}\n")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 2.3012
Test Accuracy: 0.3333
Sentence: The dog barked loudly
Predicted POS Tags: ['nn', 'nn', 'nn', 'nn', 'nn', 'nn']

Sentence: I enjoy reading books
Predicted POS Tags: ['nn', 'nn', 'nn', 'nn', 'nn', 'nn']



**17. Implement Named Entity Recognizer**

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Dummy data
sentences = [
    "Apple is a company based in Cupertino.",
    "John works at Google.",
    "Paris is the capital of France.",
]

ner_labels = [
    ["B-org", "O", "O", "O", "B-geo", "O"],
    ["B-per", "O", "O", "O", "B-org", "O"],
    ["B-geo", "O", "O", "O", "B-geo", "O"],
]

# Tokenize words and NER labels
tokenizer_words = Tokenizer(oov_token="<OOV>")
tokenizer_labels = Tokenizer()

tokenizer_words.fit_on_texts(sentences)
tokenizer_labels.fit_on_texts(ner_labels)

vocab_size_words = len(tokenizer_words.word_index) + 1
vocab_size_labels = len(tokenizer_labels.word_index) + 1

# Convert sentences and NER labels to sequences
sequences_words = tokenizer_words.texts_to_sequences(sentences)
sequences_labels = tokenizer_labels.texts_to_sequences(ner_labels)

# Pad sequences to have the same length
max_len = max(len(seq) for seq in sequences_words)
padded_sequences_words = pad_sequences(sequences_words, padding='post', maxlen=max_len)
padded_sequences_labels = pad_sequences(sequences_labels, padding='post', maxlen=max_len)

# Convert NER labels to categorical format
categorical_labels = tf.keras.utils.to_categorical(padded_sequences_labels, num_classes=vocab_size_labels)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences_words, categorical_labels, test_size=0.2, random_state=42)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size_words, output_dim=8, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(16, return_sequences=True)))
model.add(TimeDistributed(Dense(vocab_size_labels, activation='softmax')))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=1, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Make predictions on new sentences
new_sentences = ["Microsoft is located in Redmond.", "Anna lives in Berlin."]
new_sequences = tokenizer_words.texts_to_sequences(new_sentences)
new_padded_sequences = pad_sequences(new_sequences, padding='post', maxlen=max_len)
predictions = model.predict(new_padded_sequences)

# Print predictions
for i, sentence in enumerate(new_sentences):
    predicted_ner_labels = [tokenizer_labels.index_word[np.argmax(pred)] for pred in predictions[i]]
    print(f"Sentence: {sentence}")
    print(f"Predicted NER Labels: {predicted_ner_labels}\n")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.5668
Test Accuracy: 0.5714
Sentence: Microsoft is located in Redmond.
Predicted NER Labels: ['o', 'o', 'o', 'o', 'o', 'o', 'o']

Sentence: Anna lives in Berlin.
Predicted NER Labels: ['o', 'o', 'o', 'o', 'o', 'o', 'o']



**18. Word sense disambiguation by LSTM/GRU**

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Dummy data
sentences = [
    "I went to the bank to deposit some money.",
    "The river bank was full of fish.",
    "He sat on the bank of the river and enjoyed the view.",
]

word_senses = [1, 2, 3]  # Assign unique labels for each sense

# Tokenize words and word senses
tokenizer_words = Tokenizer(oov_token="<OOV>")
tokenizer_senses = Tokenizer()

tokenizer_words.fit_on_texts(sentences)
tokenizer_senses.fit_on_texts(map(str, word_senses))

vocab_size_words = len(tokenizer_words.word_index) + 1
vocab_size_senses = len(tokenizer_senses.word_index) + 1

# Convert sentences and word senses to sequences
sequences_words = tokenizer_words.texts_to_sequences(sentences)
sequences_senses = tokenizer_senses.texts_to_sequences(map(str, word_senses))

# Pad sequences to have the same length
max_len = max(len(seq) for seq in sequences_words)
padded_sequences_words = pad_sequences(sequences_words, padding='post', maxlen=max_len)
padded_sequences_senses = pad_sequences(sequences_senses, padding='post', maxlen=max_len)

# Convert word senses to categorical format
categorical_senses = tf.keras.utils.to_categorical(padded_sequences_senses, num_classes=vocab_size_senses)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences_words, categorical_senses, test_size=0.2, random_state=42)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size_words, output_dim=8, input_length=max_len, mask_zero=True))
model.add(LSTM(16, return_sequences=True))
model.add(Dense(vocab_size_senses, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=1, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Make predictions on new sentences
new_sentences = ["I saw a bird at the river bank.", "He invested money in the bank."]
new_sequences = tokenizer_words.texts_to_sequences(new_sentences)
new_padded_sequences = pad_sequences(new_sequences, padding='post', maxlen=max_len)
predictions = model.predict(new_padded_sequences)

# Print predictions
for i, sentence in enumerate(new_sentences):
    predicted_word_senses = [tokenizer_senses.word_index.get(np.argmax(pred), 'Unknown') for pred in predictions[i]]
    print(f"Sentence: {sentence}")
    print(f"Predicted Word Senses: {predicted_word_senses}\n")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.2918
Test Accuracy: 0.8889
Sentence: I saw a bird at the river bank.
Predicted Word Senses: ['Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown']

Sentence: He invested money in the bank.
Predicted Word Senses: ['Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown']



**19. Develop a Movie review system**

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
import numpy as np

# Dummy data (replace this with your own dataset)
reviews = [
    "The movie was fantastic! I loved every moment of it.",
    "Terrible acting and a boring plot. I regret watching it.",
    "A masterpiece! The performances were outstanding.",
    "The film was okay, but nothing special.",
]

labels = [1, 0, 1, 0]  # 1 for positive, 0 for negative

# Tokenize words
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(reviews)

vocab_size = len(tokenizer.word_index) + 1

# Convert sentences to sequences
sequences = tokenizer.texts_to_sequences(reviews)

# Pad sequences to have the same length
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_len)

# Convert labels to numpy array
labels = np.array(labels)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=8, input_length=max_len))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=1, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Make predictions on new reviews
new_reviews = ["The best movie I've ever seen!", "Awful waste of time."]
new_sequences = tokenizer.texts_to_sequences(new_reviews)
new_padded_sequences = pad_sequences(new_sequences, padding='post', maxlen=max_len)
predictions = model.predict(new_padded_sequences)

# Print predictions
for i, review in enumerate(new_reviews):
    sentiment = "Positive" if predictions[i] > 0.5 else "Negative"
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment}\n")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.7074
Test Accuracy: 0.0000




Review: The best movie I've ever seen!
Predicted Sentiment: Positive

Review: Awful waste of time.
Predicted Sentiment: Positive

