### Statistical Language Models

In [None]:
from collections import defaultdict
import nltk

# Sample corpus
corpus = ["the cat sat on the mat", "the dog ate the bone"]

# word_tokenize needs the 'punkt' tokenizer data. Fetch it here so this cell
# does not depend on a later cell having downloaded it first.
nltk.download('punkt', quiet=True)

# Tokenize the corpus into one flat list of words (sentence order preserved)
tokens = [word for sentence in corpus for word in nltk.word_tokenize(sentence)]

# Function to build an N-gram language model
def build_ngram_model(corpus, n):
    """Build an N-gram model mapping each context to its observed next words.

    Args:
        corpus: Sequence of tokens (e.g. a list of words) in reading order.
        n: Order of the model; must be >= 1. Each context is the ``n - 1``
            tokens preceding a word (the empty tuple when ``n == 1``).

    Returns:
        A ``defaultdict(list)`` whose keys are context tuples and whose values
        list every token that followed that context, duplicates included, in
        order of appearance.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # With n <= 0 the slice/index arithmetic below wraps around to the
        # end of the sequence and silently produces meaningless entries.
        raise ValueError(f"n must be a positive integer, got {n!r}")

    ngrams = defaultdict(list)
    context_size = n - 1
    for start in range(len(corpus) - n + 1):
        split = start + context_size
        ngrams[tuple(corpus[start:split])].append(corpus[split])
    return ngrams

# Example usage: n=2 gives a bigram model (one-word contexts). `tokens` is one
# flat list, so pairs that straddle the two sample sentences are counted too.
ngram_model = build_ngram_model(corpus=tokens, n=2)
print(ngram_model)


defaultdict(<class 'list'>, {('the',): ['cat', 'mat', 'dog', 'bone'], ('cat',): ['sat'], ('sat',): ['on'], ('on',): ['the'], ('mat',): ['the'], ('dog',): ['ate'], ('ate',): ['the']})


In [None]:
import random

# Function to generate text using an N-gram language model
def generate_text(model, n, num_words):
    """Generate text by randomly walking an N-gram model.

    Args:
        model: Mapping from context tuples (``n - 1`` tokens each) to the
            list of tokens observed after that context.
        n: Order of the model that ``model`` was built with.
        num_words: Maximum number of tokens to append after the randomly
            chosen starting context.

    Returns:
        The generated tokens joined by single spaces. Fewer than
        ``num_words`` tokens are appended when the walk reaches a context
        that has no recorded continuation (for example the last word of the
        training text). An empty model yields an empty string.
    """
    if not model:
        return ''

    context_size = n - 1
    current_context = random.choice(list(model.keys()))
    generated_text = list(current_context)
    for _ in range(num_words):
        # .get() rather than indexing: indexing a defaultdict would insert an
        # empty entry for every unseen context and mutate the caller's model.
        candidates = model.get(current_context)
        if not candidates:
            # Dead end: nothing was ever observed after this context.
            break
        next_word = random.choice(candidates)
        generated_text.append(next_word)
        # generated_text[-0:] is the whole list, so the unigram case (n == 1)
        # needs the empty context spelled out explicitly.
        if context_size > 0:
            current_context = tuple(generated_text[-context_size:])
        else:
            current_context = ()
    return ' '.join(generated_text)

# Example usage: the order passed here (2) has to match the order the model
# was built with. The walk is random, so the output differs between runs.
generated_text = generate_text(model=ngram_model, n=2, num_words=10)
print(generated_text)


cat sat on the mat the cat sat on the bone


### Markov chain simulation

In [None]:
import numpy as np

# Transition matrix for a simple 2-state Markov chain.
# Row i holds the probabilities of moving from state i to each state.
transition_matrix = np.array([[0.7, 0.3],   # Probability of going from state 0 to state 0 or state 1
                              [0.4, 0.6]])  # Probability of going from state 1 to state 0 or state 1

# Initial state probabilities
initial_state = np.array([0.5, 0.5])  # Equal probability of starting in state 0 or state 1

# Each row of the matrix, and the initial distribution, must sum to 1
assert np.allclose(transition_matrix.sum(axis=1), 1.0), "transition rows must sum to 1"
assert np.isclose(initial_state.sum(), 1.0), "initial probabilities must sum to 1"

# Seeded generator so that re-running the notebook reproduces the same walk
SEED = 42
rng = np.random.default_rng(SEED)

# State labels 0..num_states-1 come from the matrix instead of being repeated
num_states = transition_matrix.shape[0]

# Simulate Markov Chain
num_steps = 10
current_state = rng.choice(num_states, p=initial_state)  # Start from initial state
for _ in range(num_steps):
    print("Current State:", current_state)
    # The row of the current state is the distribution of the next state
    current_state = rng.choice(num_states, p=transition_matrix[current_state])


Current State: 1
Current State: 0
Current State: 0
Current State: 0
Current State: 1
Current State: 0
Current State: 1
Current State: 1
Current State: 1
Current State: 1


### Markov Chain text generator

In [None]:
! pip install markovify

Collecting markovify
  Downloading markovify-0.9.4.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting unidecode (from markovify)
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: markovify
  Building wheel for markovify (setup.py) ... [?25l[?25hdone
  Created wheel for markovify: filename=markovify-0.9.4-py3-none-any.whl size=18608 sha256=4b1d3a5361ef192fa98481bd95a8a8f687fd2137f75c9122d8f86594988b6b5c
  Stored in directory: /root/.cache/pip/wheels/ca/8c/c5/41413e24c484f883a100c63ca7b3b0362b7c6f6eb6d7c9cc7f
Successfully built markovify
Installing collected packages: unidecode, markovify
Successfully installed markovify-0.9.4 unidecode-1.3.8


In [None]:
import markovify

# Open the text file that you wish to train the model with. The encoding is
# passed explicitly so reading does not depend on the platform default
# (assumes the file is UTF-8 encoded).
with open("/content/poetry.txt", encoding="utf-8") as f:
    text = f.read()

# Build the model.
text_model = markovify.Text(text)

# Print five randomly-generated sentences
# NOTE(review): markovify documents that make_sentence() returns None when it
# cannot produce a sentence, in which case "None" is printed here.
for i in range(5):
    print(text_model.make_sentence())

print("="*55)
# Print three randomly-generated sentences of no more than 280 characters
for i in range(3):
    print(text_model.make_short_sentence(280))

Sing a song full of the stairs, counting each drop of pearl made with water— blue on the thumb of our silent tears, Thou who hast by Thy might, Led us into the mountains of Japan.
It belongs to the masquerade.
The blonde has her eyes bestowed upon her warrior!
Her nights were dreams of the old days I have this trowel, these overalls, this ridiculous hat now.
Ere sleep comes down to soothe the weary eyes, How all the cark of time had flown, And I was crazy try’n’a dream us up a future even if I were descending into a new desire.
Like petals of a myth that Hard Rock did nothing.
You, Lord, as you have left is that middle finger around your God-given right to be listening anyway.
What kind of clemency, against the tall forest of sharp pines, the morning air fulvousWith a metallic syncopation,A key to a grim quietude.


### Naïve Bayes classifier for text classification

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Toy training set: four short reviews
train_text = [
    "I love this movie",
    "This movie is great",
    "I dislike this movie",
    "This movie is awful",
]

# One label per review, in the same order
# (1 for positive sentiment, 0 for negative sentiment)
train_labels = np.array([1, 1, 0, 0])

# Bag-of-words counts feeding a multinomial Naive Bayes classifier
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Fit the vectorizer and the classifier on the training reviews
model.fit(train_text, train_labels)

# Unseen sentences to classify
test_text = ["I love this product", "This product is terrible"]

# Predicted label for each test sentence
predicted_labels = model.predict(test_text)

# Report every test sentence next to its predicted sentiment
for text, label in zip(test_text, predicted_labels):
    if label == 1:
        sentiment = "positive"
    else:
        sentiment = "negative"
    print(f"Text: {text} -- Predicted Sentiment: {sentiment}")


Text: I love this product -- Predicted Sentiment: positive
Text: This product is terrible -- Predicted Sentiment: negative


### Probability distributions in an N-gram model

In [None]:
from collections import defaultdict

# Sample corpus
corpus = ["the cat sat on the mat", "the dog ate the bone"]

# Tally how often each whitespace-separated word occurs
unigram_counts = defaultdict(int)
for sentence in corpus:
    for word in sentence.split():
        unigram_counts[word] += 1

# The corpus size is the sum of all the individual tallies
total_words = sum(unigram_counts.values())

# Relative frequency of every word: its tally divided by the corpus size
unigram_probabilities = {
    word: occurrences / total_words
    for word, occurrences in unigram_counts.items()
}
print("Unigram Probabilities:", unigram_probabilities)


Unigram Probabilities: {'the': 0.36363636363636365, 'cat': 0.09090909090909091, 'sat': 0.09090909090909091, 'on': 0.09090909090909091, 'mat': 0.09090909090909091, 'dog': 0.09090909090909091, 'ate': 0.09090909090909091, 'bone': 0.09090909090909091}


### Smoothing in N-gram Language Models

In [None]:
from collections import Counter
import numpy as np

# Sample corpus
corpus = ["the cat sat on the mat", "the dog ate the bone"]

# Tokenize the corpus: glue the sentences together with a space, then split
# the result on whitespace to get one flat list of words
tokens = " ".join(corpus).split()

# Number of occurrences of each distinct token
unigram_counts = Counter(tokens)

# Vocabulary size: the Counter holds exactly one key per distinct token
V = len(unigram_counts)

# Laplace Smoothing
def laplace_smoothing(word, k=1):
    """Return the add-k smoothed unigram probability of ``word``.

    Reads the module-level ``unigram_counts``, ``tokens`` and ``V``: ``k`` is
    added to the word's count and ``k * V`` to the total number of tokens.
    With the default ``k=1`` this is add-one smoothing.
    """
    pseudo_total = len(tokens) + k * V
    return (unigram_counts[word] + k) / pseudo_total

# Additive Smoothing
def additive_smoothing(word, alpha=0.5):
    """Return the additively smoothed unigram probability of ``word``.

    Uses the module-level ``unigram_counts``, ``tokens`` and ``V``. The
    pseudo-count ``alpha`` is added to the word's count, and ``alpha * V`` is
    added to the total number of tokens.
    """
    numerator = unigram_counts[word] + alpha
    denominator = len(tokens) + alpha * V
    return numerator / denominator

# Good-Turing Smoothing
def good_turing_smoothing(word, counts=None, total=None):
    """Return a Good-Turing estimate of the unigram probability of ``word``.

    With ``N_c`` the number of distinct words seen exactly ``c`` times and
    ``N`` the total number of tokens:

    * unseen word (``c == 0``): returns ``N_1 / N``, the probability mass
      Good-Turing reserves for *all* unseen words together;
    * seen word: the adjusted count is ``c* = (c + 1) * N_{c+1} / N_c`` and
      the result is ``c* / N``;
    * seen word with ``N_{c+1} == 0``: the adjusted count is undefined (it
      would be 0), so the unsmoothed estimate ``c / N`` is returned instead.

    The returned values are not renormalized, so they need not sum to 1.

    Args:
        word: The word to score.
        counts: Mapping from word to occurrence count. Defaults to the
            module-level ``unigram_counts``.
        total: Total number of tokens ``N``. Defaults to ``len(tokens)`` when
            ``counts`` is omitted, otherwise to the sum of ``counts``.

    Returns:
        The estimate as a float; ``0.0`` when there are no tokens at all.
    """
    use_globals = counts is None
    if use_globals:
        counts = unigram_counts
    if total is None:
        total = len(tokens) if use_globals else sum(counts.values())
    if total <= 0:
        return 0.0

    # Count of counts: how many distinct words occur exactly c times
    counts_of_counts = Counter(counts.values())
    c = counts.get(word, 0)
    if c == 0:
        return counts_of_counts[1] / total

    n_c = counts_of_counts[c]
    n_c_plus_1 = counts_of_counts[c + 1]
    if n_c_plus_1 == 0:
        # No word occurs c + 1 times: keep the observed count rather than
        # assigning probability zero to a word that was actually seen.
        return c / total
    return (c + 1) * n_c_plus_1 / n_c / total

# Example usage: print the three smoothed estimates for the same word
word = "cat"
for label, smoother in (
    ("Laplace Smoothing:", laplace_smoothing),
    ("Additive Smoothing:", additive_smoothing),
    ("Good-Turing Smoothing:", good_turing_smoothing),
):
    print(label, smoother(word))


Laplace Smoothing: 0.10526315789473684
Additive Smoothing: 0.1
Good-Turing Smoothing: 0.0


### Pruning technique


In [None]:
from collections import defaultdict
import nltk
nltk.download('punkt')

# Sample corpus
corpus = ["the cat sat on the mat", "the dog ate the bone"]

# Tokenize each sentence with NLTK and lowercase every token, collecting
# everything into one flat list
tokens = []
for sentence in corpus:
    tokens.extend(word.lower() for word in nltk.word_tokenize(sentence))

# Function to build a unigram language model with pruning
def build_pruned_unigram_model(corpus, threshold):
    """Build a unigram model that keeps only sufficiently frequent words.

    Args:
        corpus: Iterable of tokens.
        threshold: Minimum number of occurrences a word needs to be kept.

    Returns:
        A dict mapping each kept word to its count divided by the total
        number of tokens in ``corpus``. The total includes the tokens of
        pruned words, so the values are not renormalized after pruning.
    """
    frequency = defaultdict(int)
    for token in corpus:
        frequency[token] += 1

    # Every token contributes exactly one increment above
    corpus_size = sum(frequency.values())

    kept = {}
    for token, occurrences in frequency.items():
        if occurrences >= threshold:
            kept[token] = occurrences / corpus_size
    return kept

# Build the pruned unigram language model; words seen fewer than `threshold`
# times are left out
threshold = 2  # Example threshold for pruning
pruned_unigram_model = build_pruned_unigram_model(corpus=tokens, threshold=threshold)

# Heading and model on separate lines
print("Pruned Unigram Model:", pruned_unigram_model, sep="\n")
