In [2]:
import re
import nltk
from nltk.tokenize import word_tokenize

# Sample text
corpus = "I am salman khan am salman"

# Tokenize and lowercase
tokens = [word.lower() for word in word_tokenize(corpus)]


In [3]:

from collections import Counter
from nltk import bigrams

# Get all bigrams
bigram_list = list(bigrams(tokens))

# Calculate frequency of each bigram
bigram_freq = Counter(bigram_list)


In [4]:
import numpy as np
from nltk import FreqDist

# Get unique words in the corpus
words = list(set(tokens))

# Create a frequency distribution for single words
word_freq = FreqDist(tokens)

# Initialize a probability matrix
prob_matrix = np.zeros((len(words), len(words)))

# Fill the probability matrix
for (w1, w2), count in bigram_freq.items():
    i = words.index(w1)
    j = words.index(w2)
    prob_matrix[i][j] = count / word_freq[w1]  # Probability of w2 given w1


In [5]:
#print count_matrix as dataframe with words as index and columns
import pandas as pd
df = pd.DataFrame(prob_matrix, index=words, columns=words)
df


Unnamed: 0,i,am,khan,salman
i,0.0,1.0,0.0,0.0
am,0.0,0.0,0.0,1.0
khan,0.0,1.0,0.0,0.0
salman,0.0,0.0,0.5,0.0


### Multinomial Naive Bayes

In [21]:
!pip install openpyxl

Collecting openpyxl
  Obtaining dependency information for openpyxl from https://files.pythonhosted.org/packages/6a/94/a59521de836ef0da54aaf50da6c4da8fb4072fb3053fa71f052fd9399e7a/openpyxl-3.1.2-py2.py3-none-any.whl.metadata
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Obtaining dependency information for et-xmlfile from https://files.pythonhosted.org/packages/96/c2/3dd434b0108730014f1b96fd286040dc3bcb70066346f7e01ec2ac95865f/et_xmlfile-1.1.0-py3-none-any.whl.metadata
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
   ---------------------------------------- 0.0/250.0 kB ? eta -:--:--
   --------------------------------------- 250.0/250.0 kB 16.0 MB/s eta 0:00:00
Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2



[notice] A new release of pip is available: 23.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
#use IMDB dataset and apply multinomial naive bayes and then use the model to predict the sentiment of the review


import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the xlsx file as a DataFrame
df = pd.read_excel('Dataset C1.xlsx')
# preprocess each review by removing special characters and punctuations and then updated the dataframe
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['text'] = df['text'].apply(lambda x: x.lower())





In [24]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['type'], test_size=0.2, random_state=42)

# Apply Multinomial Naive Bayes on the train dataset
vectorizer = CountVectorizer() # Convert a collection of text documents to a matrix of token counts
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)




In [26]:
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict the sentiment of the test dataset
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy:', accuracy)

#use the model to predict the sentiment of the review
text = "This movie was amazing! I loved every minute of it."
text = re.sub(r'[^\w\s]', '', text)
text = text.lower()
text_vector = vectorizer.transform([text])
type = model.predict(text_vector)[0]
type


Model accuracy: 0.8942307692307693


'entertainment'

In [30]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['type'], test_size=0.2, random_state=42)

# Create a Bag of Words matrix using X_train and Y_train without using vectorizer
words = []
for review in X_train:
    words.extend(review.split())
words = list(set(words))

# Initialize a bow matrix with rows as the number of unique sentiments and columns as the number of unique words
sentiments = y_train.unique().tolist()  # Get unique sentiments
print(sentiments)
bow_matrix = np.zeros((len(y_train.unique()), len(words)))
# Fill the bag of words matrix using the frequency of each word in the class of y_train
for i, sentiment in enumerate(sentiments):
    sentiment_words = ' '.join(X_train[y_train == sentiment])
    for j, word in enumerate(words):
        bow_matrix[i, j] = sentiment_words.count(word)

# Generate a conditional probability matrix where the (i, j) element is the probability of word j given sentiment i and use Laplace smoothing with alpha = 1
alpha = 1
prob_matrix = (bow_matrix + alpha) / (bow_matrix.sum(axis=1)[:, None] + alpha * len(words))





['sports', 'entertainment']


In [36]:
# Predict the sentiment of the test dataset using the conditional probability matrix and the chain rule of probability
y_pred = []
for review in X_test:
    review_words = review.split()
    probs = []
    for i, sentiment in enumerate(sentiments):
        #initial value of probability is the prior probability of the sentiment
        prob = len(X_train[y_train == sentiment]) / len(X_train)
        for word in review_words:
            if word in words:
                j = words.index(word)
                prob *= prob_matrix[i, j]
        probs.append(prob)
    y_pred.append(sentiments[np.argmax(probs)])

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy:', accuracy)

#use the model to predict the sentiment of the review
text = "This cricket match was amazing! I loved every minute of it."
text = re.sub(r'[^\w\s]', '', text)
text = text.lower()
text_words = text.split()
probs = []
for i, sentiment in enumerate(sentiments):
    prob = len(X_train[y_train == sentiment]) / len(X_train)
    for word in text_words:
        if word in words:
            j = words.index(word)
            prob *= prob_matrix[i, j]
    probs.append(prob)
type = sentiments[np.argmax(probs)]
type


Model accuracy: 0.9134615384615384


'sports'

### Byte Pair Encoding

In [None]:
#random text
import re 


text = "new new wider wider wider newer newer newer newer newer newer lowest lowest low low low low low"
#remove punctuations
text = re.sub(r'[^\w\s]','',text)
vocab = {}
for word in text.split():
    new_word = ' '.join(list(word))+ ' </w>'
    if new_word in vocab:
        vocab[new_word] += 1
    else:
        vocab[new_word] = 1
print(vocab)

{'n e w </w>': 2, 'w i d e r </w>': 3, 'n e w e r </w>': 6, 'l o w e s t </w>': 2, 'l o w </w>': 5}


In [None]:
def find_pairs(vocab):
    pairs = {}
    for word, freq in vocab.items():
        elems = word.split()
        for i in range(len(elems) - 1):
            if (elems[i], elems[i + 1]) in pairs:
                pairs[(elems[i], elems[i + 1])] += freq
            else:
                pairs[(elems[i], elems[i + 1])] = freq
    return pairs


In [None]:
def find_best_pair(pairs):
    for pair, freq in pairs.items():
        if freq == max(pairs.values()):
            return list(pair)

In [None]:
def merge(vocab, merge_pair):
    new_vocab = {}
    pat = ' '.join(list(merge_pair))
    regex = re.compile(r'(?<!\S)' + pat + r'(?!\S)')
    for word in vocab:
        new_word = regex.sub(''.join(merge_pair), word)
        new_vocab[new_word] = vocab[word]
    return new_vocab
    

In [None]:
epochs = 20
test_text = "new newer lowest wider wider"
test_text = re.sub(r'[^\w\s]','',test_text)
test_vocab = {}
for word in test_text.split():
    new_word = ' '.join(list(word))+ ' </w>'
    if new_word in test_vocab:
        test_vocab[new_word] += 1
    else:
        test_vocab[new_word] = 1

list_rules = []
for epoch in range(epochs):
    pairs = find_pairs(vocab)
    if(len(pairs) == 0):
        break
    best_pair = find_best_pair(pairs)
    list_rules.append(best_pair)
    rule = ' '.join(list(best_pair))
    print(f'Rule : {rule}', end = " ")
    vocab = merge(vocab, best_pair)
    print(f'Epoch {epoch + 1}: {vocab}')


Rule : e r Epoch 1: {'n e w </w>': 2, 'w i d er </w>': 3, 'n e w er </w>': 6, 'l o w e s t </w>': 2, 'l o w </w>': 5}
Rule : er </w> Epoch 2: {'n e w </w>': 2, 'w i d er</w>': 3, 'n e w er</w>': 6, 'l o w e s t </w>': 2, 'l o w </w>': 5}
Rule : n e Epoch 3: {'ne w </w>': 2, 'w i d er</w>': 3, 'ne w er</w>': 6, 'l o w e s t </w>': 2, 'l o w </w>': 5}
Rule : ne w Epoch 4: {'new </w>': 2, 'w i d er</w>': 3, 'new er</w>': 6, 'l o w e s t </w>': 2, 'l o w </w>': 5}
Rule : l o Epoch 5: {'new </w>': 2, 'w i d er</w>': 3, 'new er</w>': 6, 'lo w e s t </w>': 2, 'lo w </w>': 5}
Rule : lo w Epoch 6: {'new </w>': 2, 'w i d er</w>': 3, 'new er</w>': 6, 'low e s t </w>': 2, 'low </w>': 5}
Rule : new er</w> Epoch 7: {'new </w>': 2, 'w i d er</w>': 3, 'newer</w>': 6, 'low e s t </w>': 2, 'low </w>': 5}
Rule : low </w> Epoch 8: {'new </w>': 2, 'w i d er</w>': 3, 'newer</w>': 6, 'low e s t </w>': 2, 'low</w>': 5}
Rule : w i Epoch 9: {'new </w>': 2, 'wi d er</w>': 3, 'newer</w>': 6, 'low e s t </w>': 2, 

In [None]:
for rule in list_rules:
    # pairs = find_pairs(test_vocab)
    curr_rule = ' '.join(list(rule))
    print(f'Rule : {curr_rule}', end = " ")
    test_vocab = merge(test_vocab, rule)
    print(f'{test_vocab}')

Rule : e r {'n e w </w>': 1, 'n e w er </w>': 1, 'l o w e s t </w>': 1, 'w i d er </w>': 2}
Rule : er </w> {'n e w </w>': 1, 'n e w er</w>': 1, 'l o w e s t </w>': 1, 'w i d er</w>': 2}
Rule : n e {'ne w </w>': 1, 'ne w er</w>': 1, 'l o w e s t </w>': 1, 'w i d er</w>': 2}
Rule : ne w {'new </w>': 1, 'new er</w>': 1, 'l o w e s t </w>': 1, 'w i d er</w>': 2}
Rule : l o {'new </w>': 1, 'new er</w>': 1, 'lo w e s t </w>': 1, 'w i d er</w>': 2}
Rule : lo w {'new </w>': 1, 'new er</w>': 1, 'low e s t </w>': 1, 'w i d er</w>': 2}
Rule : new er</w> {'new </w>': 1, 'newer</w>': 1, 'low e s t </w>': 1, 'w i d er</w>': 2}
Rule : low </w> {'new </w>': 1, 'newer</w>': 1, 'low e s t </w>': 1, 'w i d er</w>': 2}
Rule : w i {'new </w>': 1, 'newer</w>': 1, 'low e s t </w>': 1, 'wi d er</w>': 2}
Rule : wi d {'new </w>': 1, 'newer</w>': 1, 'low e s t </w>': 1, 'wid er</w>': 2}
Rule : wid er</w> {'new </w>': 1, 'newer</w>': 1, 'low e s t </w>': 1, 'wider</w>': 2}
Rule : new </w> {'new</w>': 1, 'newer</w

### Maximum Length

In [None]:
from nltk.corpus import words as tokens
#write a random paragraph

paragraph = "I am a student of computer science and engineering. I am currently studying at the University of Asia Pacific. I am a very good student. I am very much interested in programming. I am also interested in machine learning. I am also interested in data science. I am also interested in artificial intelligence. I am also interested in deep learning. I am also interested in computer vision. I am also interested in natural language processing. I am also interested in robotics. I am also interested in computer graphics. I am also interested in computer networks. I am also interested in cyber security. I am also interested in web development. I am also interested in mobile application development. I am also interested in game development. I am also interested in software development. I am also interested in hardware development. I am also interested in computer architecture. I am also interested in computer organization. I am also interested in computer design. I am also interested in computer engineering. I am also interested in computer science"

#remove punctuations from the paragraph
punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
for x in paragraph.lower():
    if x in punctuations:
        paragraph = paragraph.replace(x, "")

paragraph = paragraph.lower()
#remove all the spaces from the paragraph
paragraph = paragraph.replace(" ", "")

words = []
lowerTokensSet = set([x.lower() for x in tokens.words()])
i = 0
while i < len(paragraph):
    mWord = ""
    for j in range(i, len(paragraph)):
        temp = paragraph[i:j+1]
        temp = temp.lower()
        if temp in lowerTokensSet and len(temp) > len(mWord):
            mWord = temp
    i = i+len(mWord)
    words.append(mWord)

newParagraph = " ".join(words)
print(newParagraph)


