# Loading of data


In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

input_dir = os.path.join(os.getcwd(), 'data')
input_text_file = os.path.join(input_dir, 'corpus.txt')
label_file = os.path.join(input_dir, 'labels.txt')

# One sentence per line in corpus.txt and one label per line in labels.txt.
# Only the line terminator is stripped: slicing with [:-1] would also remove the
# last real character of a final line that is not newline-terminated.
with open(input_text_file, 'r') as f:
    corpus = [line.rstrip('\n') for line in f]

with open(label_file, 'r') as f:
    labels = [line.rstrip('\n') for line in f]

In [3]:
len(corpus), len(labels)

(2400, 2400)

# Emotion score

In [4]:
from transformers import pipeline
import time

# Text-classification pipeline configured to report a score for every emotion label.
classifier = pipeline(
    "text-classification",
    model='bhadresh-savani/distilbert-base-uncased-emotion',
    return_all_scores=True,
)


def emotion_scores(sample):
    """Run the classifier on `sample` and return the first element of its output.

    For a single sentence this is a list of {'label': ..., 'score': ...} dicts,
    one per emotion class (see the printed output of this cell).
    """
    return classifier(sample)[0]


# Smoke test: score one sentence, print every class score and the elapsed time.
sample = "I am so happy to see you!"
start_time = time.time()
all_classes = emotion_scores(sample)
for info in all_classes:
    print(info)
end_time = time.time()
print(type(all_classes))
print("Time taken for emotion_scores: ", end_time - start_time)


{'label': 'sadness', 'score': 0.00029859962523914874}
{'label': 'joy', 'score': 0.9987986087799072}
{'label': 'love', 'score': 0.0004451328422874212}
{'label': 'anger', 'score': 0.0001878843759186566}
{'label': 'fear', 'score': 0.00012197871546959504}
{'label': 'surprise', 'score': 0.00014771465794183314}
<class 'list'>
Time taken for emotion_scores:  0.7797598838806152


# Bigram Model for Sentence Generation

In [87]:
from typing import List, Dict, Tuple
import random


class BigramLanguageModel:
    '''
    Word-level bigram language model with optional Laplace or Kneser-Ney smoothing.

    Every sentence is split on whitespace and wrapped in '</start>' / '</end>'
    markers before counting. `labels` is stored but not used by this model.
    '''

    def __init__(self, corpus, labels):
        self.corpus = corpus
        self.labels = labels
        self.bigram_counts = {}          # context -> {token: count of (context, token)}
        self.unigram_counts = {}         # token -> count (boundary markers included)
        self.vocabulary = set()          # distinct tokens, boundary markers excluded
        self.total_bigram_pairs = None   # lazily computed number of distinct bigrams
        self.continuation_counts = None  # lazily computed token -> number of distinct preceding contexts
        self.bigram_prob = {}            # cache: (smoothing, context) -> {token: probability}


    def train(self):
        '''
        Train the Bigram language model on the corpus and labels.

        Counts and caches are rebuilt from scratch, so calling train() more than
        once neither double-counts the corpus nor reuses stale probabilities.
        '''
        self.bigram_counts = {}
        self.unigram_counts = {}
        self.vocabulary = set()
        self.total_bigram_pairs = None
        self.continuation_counts = None
        self.bigram_prob = {}
        self.count_unigrams()
        self.count_bigrams()


    def _tokenize(self, sentence: str):
        '''
        Split a sentence on whitespace and add the boundary markers
        '''
        return ['</start>'] + sentence.split() + ['</end>']


    def count_unigrams(self):
        '''
        Count the unigrams in the corpus and store counts in the unigram_counts dictionary
        '''
        for sentence in self.corpus:
            for token in self._tokenize(sentence):
                self.unigram_counts[token] = self.unigram_counts.get(token, 0) + 1
                self.vocabulary.add(token)
        # discard (not remove) so that an empty corpus does not raise KeyError
        self.vocabulary.discard('</start>')
        self.vocabulary.discard('</end>')
        return


    def count_bigrams(self):
        '''
        Count the bigrams in the corpus and store counts in the bigram_counts dictionary
        '''
        for sentence in self.corpus:
            for context, token in self.get_bigrams(self._tokenize(sentence)):
                followers = self.bigram_counts.setdefault(context, {})
                followers[token] = followers.get(token, 0) + 1
        return


    def get_bigrams(self, tokens: List[str]):
        '''
        Given a list of tokens, return a list of possible bigrams
        (adjacent (previous, next) pairs, in order)
        '''
        return list(zip(tokens, tokens[1:]))


    def get_bigram_prob(self, context: str, token: str, smoothing: str = 'none'):
        '''
        Get the probability of the token given the context

        smoothing: 'none', 'laplace' or 'kneser-ney' (case-insensitive).
        Raises ValueError for any other smoothing name.
        '''
        smoothing = smoothing.lower()
        if smoothing == 'none':
            return self.__get_bigram_prob_normal(context, token)
        elif smoothing == 'laplace':
            return self.__get_bigram_prob_laplace(context, token)
        elif smoothing == 'kneser-ney':
            return self.__get_bigram_prob_kneser_ney(context, token)
        else:
            raise ValueError('Smoothing method not supported')


    def __get_bigram_prob_normal(self, context, token):
        '''
        Maximum-likelihood estimate: count(context, token) / count(context).
        A context that was never observed yields 0.0 instead of a KeyError.
        '''
        context_total = self.unigram_counts.get(context, 0)
        if context_total == 0:
            return 0.0
        return self.bigram_counts.get(context, {}).get(token, 0) / context_total


    def __get_bigram_prob_laplace(self, context, token):
        '''
        Add-one estimate: (count(context, token) + 1) / (count(context) + |vocabulary|).
        An unseen context is treated as having zero counts.
        '''
        context_token_cnt = self.bigram_counts.get(context, {}).get(token, 0)
        denominator = self.unigram_counts.get(context, 0) + len(self.vocabulary)
        if denominator == 0:
            # untrained model: nothing to estimate from
            return 0.0
        return (context_token_cnt + 1) / denominator


    def __get_bigram_prob_kneser_ney(self, context, token, avg_discount=0.7):
        '''
        Interpolated Kneser-Ney estimate with a fixed discount d = avg_discount:

            max(count(context, token) - d, 0) / count(context) + alpha * P_continuation

        alpha          = d * (distinct tokens following context) / count(context)
        P_continuation = (distinct contexts preceding token) / (distinct bigrams)

        An unseen context backs off entirely to P_continuation.
        '''
        d = avg_discount
        followers = self.bigram_counts.get(context, {})
        context_total = self.unigram_counts.get(context, 0)

        # Continuation probability depends only on the token; the per-token
        # counts are precomputed once instead of scanning every context per call.
        total_bigram_pairs = self.__count_total_bigram_pairs()
        if total_bigram_pairs == 0:
            P_continuation = 0.0
        else:
            P_continuation = self.__get_continuation_counts().get(token, 0) / total_bigram_pairs

        if context_total == 0:
            return P_continuation

        # alpha depends only on the context
        alpha = d * len(followers) / context_total
        return (max(followers.get(token, 0) - d, 0) / context_total) + (alpha * P_continuation)


    def __get_continuation_counts(self):
        '''
        Map every token to the number of distinct contexts it follows (computed once, then cached)
        '''
        if self.continuation_counts is None:
            counts = {}
            for followers in self.bigram_counts.values():
                for token in followers:
                    counts[token] = counts.get(token, 0) + 1
            self.continuation_counts = counts
        return self.continuation_counts


    def __count_total_bigram_pairs(self):
        '''
        Count the total number of unique bigram pairs in the corpus
        '''
        if self.total_bigram_pairs is None:
            self.total_bigram_pairs = sum(len(followers) for followers in self.bigram_counts.values())
        return self.total_bigram_pairs


    def __generate_bigram_prob_for_context(self, context, smoothing:str = 'none'):
        '''
        Generate bigram probabilities for all tokens for a given context

        Results are cached per (smoothing, context): a cache keyed by context
        alone would hand back probabilities computed with whichever smoothing
        method happened to be requested first.
        '''
        key = (smoothing.lower(), context)
        if key not in self.bigram_prob:
            self.bigram_prob[key] = {
                token: self.get_bigram_prob(context, token, smoothing)
                for token in self.bigram_counts.get(context, {})
            }
        return self.bigram_prob[key]


    def __generate_token(self, context:str, smoothing:str = 'none'):
        '''
        Generate a token given the context
        '''
        all_possible_tokens = self.__generate_bigram_prob_for_context(context, smoothing)
        if not all_possible_tokens:
            # nothing was ever observed after this context: end the sentence
            return '</end>'
        generated_token = random.choices(list(all_possible_tokens.keys()), weights=list(all_possible_tokens.values()), k=1)[0]
        return generated_token


    def generate_sentence(self, max_length: int = 10, smoothing: str = 'none', min_length: int = 10):
        '''
        Generate a sentence of the given max_length

        max_length: number of sampling steps; a sampled '</end>' uses up a step too.
        smoothing:  smoothing method used for the sampling weights.
        min_length: while fewer than this many words have been produced, a sampled
                    '</end>' closes the current clause with '.' and sampling restarts
                    from '</start>'; once reached, '</end>' stops generation.
        Returns the generated words joined by single spaces.
        '''
        sentence = []
        context = '</start>'
        length = 0
        for _ in range(max_length):
            token = self.__generate_token(context, smoothing)
            if token == '</end>':
                if length < min_length:
                    # Only punctuate when a word was produced since the last
                    # boundary; this avoids IndexError on an empty sentence
                    # and doubled periods.
                    if sentence and context != '</start>':
                        sentence[-1] = sentence[-1] + '.'
                    context = '</start>'
                    continue
                else:
                    break
            sentence.append(token)
            context = token
            length += 1
        return ' '.join(sentence)

### Initialisation of the model

In [94]:
bigram_lm = BigramLanguageModel(corpus, labels)
bigram_lm.train()

prob = bigram_lm.get_bigram_prob('i', 'fine', smoothing='kneser-ney')
prob

1.5862443846717553e-05

In [95]:
bigram_lm.generate_sentence(max_length=10, smoothing='kneser-ney')

'i dont even if i feel have been clubbed upside'

### Generating top 10 Bigrams


In [96]:
smoothing_methods = ['none', 'laplace', 'kneser-ney']

# For each smoothing method, score every observed bigram and list the most probable ones.
for smoothing in smoothing_methods:
    all_bigram = {}
    for context, followers in bigram_lm.bigram_counts.items():
        for token in followers:
            all_bigram[(context, token)] = bigram_lm.get_bigram_prob(context, token, smoothing)

    top_count = 10
    # Stable sort on the probability keeps insertion order among ties.
    ranked = sorted(all_bigram.items(), key=lambda pair: pair[1], reverse=True)
    top_bigrams = [bigram for bigram, _ in ranked[:top_count]]
    print(f"Top {top_count} bigrams with smoothing method: {smoothing}")
    print(f"Total bigrams: {len(all_bigram)}")
    for bigram in top_bigrams:
        print(f"{bigram}: {all_bigram[bigram]}")
    print()

Top 10 bigrams with smoothing method: none
Total bigrams: 25681
('href', 'http'): 1.0
('mooshilu', '</end>'): 1.0
('tychelle', 'to'): 1.0
('hang', 'out'): 1.0
('nonexistent', 'social'): 1.0
('alex', 'and'): 1.0
('marriage', 'and'): 1.0
('personifying', 'an'): 1.0
('progeny', 'who'): 1.0
('genuflecting', 'at'): 1.0

Top 10 bigrams with smoothing method: laplace
Total bigrams: 25681
('</start>', 'i'): 0.2693830629710052
('i', 'feel'): 0.11043610327619874
('feel', 'like'): 0.0350976507217662
('i', 'am'): 0.03189412019960946
('</start>', 'im'): 0.02720653978796781
('that', 'i'): 0.02650602409638554
('and', 'i'): 0.023103748910200523
('im', 'feeling'): 0.022454576619814877
('i', 'was'): 0.021913647211976566
('to', 'be'): 0.01861427094105481

Top 10 bigrams with smoothing method: kneser-ney
Total bigrams: 25681
('href', 'http'): 0.9720021806004439
('don', 't'): 0.9712049203427449
('didn', 't'): 0.9611413972283877
('sort', 'of'): 0.9594087640897253
('supposed', 'to'): 0.9238243578261491
('doe

# Bigram Model for Emotion Sentence Generation

In [1]:
import random
import numpy as np
from typing import List


class BigramLM_Emotion:
    '''
    Bigram language model whose sampling weights are biased towards a target emotion.

    Besides the usual bigram counts, every bigram accumulates the emotion score
    vectors of the sentences it occurs in. At generation time these vectors are
    turned into per-token "beta" scores for the requested emotion and combined
    with the (optionally smoothed) bigram statistics.
    '''

    def __init__(self, corpus, labels):
        self.corpus = corpus
        self.labels = labels
        self.bigram_counts = {}          # context -> {token: count of (context, token)}
        self.unigram_counts = {}         # token -> count (boundary markers included)
        self.vocabulary = set()          # distinct tokens, boundary markers excluded
        self.total_bigram_pairs = None   # lazily computed number of distinct bigrams
        self.continuation_counts = None  # lazily computed token -> number of distinct preceding contexts
        self.bigram_emotion_vector = {}  # context -> {token: summed emotion score vector}
        self.bigram_prob = {}            # cache: (smoothing, context, emotion) -> {token: weight}
        self.class_to_idx = {'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}
        self.num_labels = len(self.class_to_idx)


    def train(self, scorer=None):
        '''
        Train the Bigram language model on the corpus and labels.

        scorer: optional callable mapping a sentence to a list of
                {'label': ..., 'score': ...} dicts. When omitted, the
                notebook-level `emotion_scores` function is used.

        Counts and caches are rebuilt from scratch on every call.
        '''
        self.bigram_counts = {}
        self.unigram_counts = {}
        self.vocabulary = set()
        self.total_bigram_pairs = None
        self.continuation_counts = None
        self.bigram_emotion_vector = {}
        self.bigram_prob = {}
        self.count_unigrams()
        self.count_bigrams(scorer)


    def _tokenize(self, sentence: str):
        '''
        Split a sentence on whitespace and add the boundary markers
        '''
        return ['</start>'] + sentence.split() + ['</end>']


    def count_unigrams(self):
        '''
        Count the unigrams in the corpus and store counts in the unigram_counts dictionary
        '''
        for sentence in self.corpus:
            for token in self._tokenize(sentence):
                self.unigram_counts[token] = self.unigram_counts.get(token, 0) + 1
                self.vocabulary.add(token)
        # discard (not remove) so that an empty corpus does not raise KeyError
        self.vocabulary.discard('</start>')
        self.vocabulary.discard('</end>')
        return


    def count_bigrams(self, scorer=None):
        '''
        Count the bigrams in the corpus and store counts in the bigram_counts dictionary.
        Also adds each sentence's emotion score vector to every bigram of that sentence.

        scorer: see train(). Progress (the sentence index) is printed every 100 sentences.
        '''
        if scorer is None:
            # Looked up at call time so the helper defined earlier in the notebook is the default.
            scorer = emotion_scores

        for idx, sentence in enumerate(self.corpus):
            if idx % 100 == 0:
                print(idx, end=' ')

            # The emotion vector depends only on the sentence, so build it once
            # per sentence rather than once per bigram.
            sentence_vector = np.zeros(self.num_labels)
            for entry in scorer(sentence):
                sentence_vector[self.class_to_idx[entry['label']]] = entry['score']

            for context, token in self.get_bigrams(self._tokenize(sentence)):
                followers = self.bigram_counts.setdefault(context, {})
                followers[token] = followers.get(token, 0) + 1

                vectors = self.bigram_emotion_vector.setdefault(context, {})
                previous = vectors.get(token)
                # Always store a fresh array so entries never alias sentence_vector.
                vectors[token] = sentence_vector.copy() if previous is None else previous + sentence_vector
        return


    def get_bigrams(self, tokens:List[str]):
        '''
        Given a list of tokens, return a list of possible bigrams
        (adjacent (previous, next) pairs, in order)
        '''
        return list(zip(tokens, tokens[1:]))


    def get_bigram_prob(self, context:str, token:str, beta_score:float, smoothing:str='kneser-ney'):
        '''
        Get the (unnormalised) emotion-weighted score of the token given the context

        beta_score: emotion weight of this token for the current context.
        smoothing:  'none', 'laplace' or 'kneser-ney' (case-insensitive).
        Raises ValueError for any other smoothing name.
        '''
        smoothing = smoothing.lower()
        if smoothing == 'none':
            return self.__get_bigram_prob_normal(context, token, beta_score)
        elif smoothing == 'laplace':
            return self.__get_bigram_prob_laplace(context, token, beta_score)
        elif smoothing == 'kneser-ney':
            return self.__get_bigram_prob_kneser_ney(context, token, beta_score)
        else:
            raise ValueError('Smoothing method not supported')


    def __get_bigram_prob_normal(self, context, token, beta_score):
        '''
        Heuristic unsmoothed score combining beta_score with the raw bigram count.
        '''
        context_token_cnt = self.bigram_counts[context].get(token, 0)
        # NOTE(review): np.log(beta_score) is <= 0 for beta_score in (0, 1], so the
        # denominator can be zero or negative and this score can come out negative
        # or non-finite. random.choices assumes non-negative weights - confirm the
        # formula is intended before relying on smoothing='none'.
        return beta_score**2 + 2*beta_score*context_token_cnt/(self.unigram_counts[context]*np.log(beta_score) + 1)


    def __get_bigram_prob_laplace(self, context, token, beta_score):
        '''
        beta_score * (1 + add-one estimate of P(token | context))
        '''
        context_token_cnt = self.bigram_counts[context].get(token, 0)
        return beta_score + beta_score*(context_token_cnt+1)/(self.unigram_counts[context]+len(self.vocabulary))


    def __get_bigram_prob_kneser_ney(self, context, token, beta_score, avg_discount=0.7):
        '''
        beta_score * (1 + discounted bigram estimate) + alpha * P_continuation, where

        alpha          = d * (distinct tokens following context) / count(context)
        P_continuation = (distinct contexts preceding token) / (distinct bigrams)
        '''
        d = avg_discount
        context_token_cnt = self.bigram_counts[context].get(token, 0)

        # Calculate alpha, which depends on the context
        alpha = d * len(self.bigram_counts[context]) / self.unigram_counts[context]
        # Continuation probability depends only on the token; the per-token counts
        # are precomputed once instead of scanning every context on every call.
        bigram_with_token_cnt = self.__get_continuation_counts().get(token, 0)
        total_bigram_pairs = self.__count_total_bigram_pairs()
        P_continuation = bigram_with_token_cnt / total_bigram_pairs

        return (beta_score + beta_score*max(context_token_cnt-d,0)/self.unigram_counts[context]) + (alpha * P_continuation)


    def __get_continuation_counts(self):
        '''
        Map every token to the number of distinct contexts it follows (computed once, then cached)
        '''
        # getattr: instances unpickled from a file written before this attribute existed lack it
        counts = getattr(self, 'continuation_counts', None)
        if counts is None:
            counts = {}
            for followers in self.bigram_counts.values():
                for token in followers:
                    counts[token] = counts.get(token, 0) + 1
            self.continuation_counts = counts
        return counts


    def __count_total_bigram_pairs(self):
        '''
        Count the total number of unique bigram pairs in the corpus
        '''
        if self.total_bigram_pairs is None:
            self.total_bigram_pairs = sum(len(followers) for followers in self.bigram_counts.values())
        return self.total_bigram_pairs


    def __generate_bigram_prob_for_context(self, context, emotion:str, smoothing:str='kneser-ney'):
        '''
        Generate bigram probabilities for all tokens for a given context

        Results are cached per (smoothing, context, emotion): without the smoothing
        method in the key, a later call with a different method would silently
        receive the weights computed for the first one.
        '''
        key = (smoothing.lower(), context, emotion)
        if key not in self.bigram_prob:
            emotion_idx = self.class_to_idx[emotion]
            followers = self.bigram_counts[context]
            vectors = self.bigram_emotion_vector[context]

            # calculate beta scores (accumulated score of the target emotion) and normalize them
            beta_scores = {}
            total_score = 0
            for token in followers:
                beta_scores[token] = vectors[token][emotion_idx]
                total_score += beta_scores[token]
            for token in beta_scores:
                beta_scores[token] = beta_scores[token] / total_score

            # calculate bigram probabilities
            weights = {
                token: self.get_bigram_prob(context, token, beta_scores[token], smoothing)
                for token in followers
            }

            # normalize bigram probabilities
            total_prob = sum(weights.values())
            self.bigram_prob[key] = {token: weight / total_prob for token, weight in weights.items()}

        return self.bigram_prob[key]


    def __generate_token(self, context:str, emotion:str, smoothing:str='kneser-ney'):
        '''
        Generate a token given the context
        '''
        all_possible_tokens = self.__generate_bigram_prob_for_context(context, emotion, smoothing)
        generated_token = random.choices(list(all_possible_tokens.keys()), weights=list(all_possible_tokens.values()))[0]
        return generated_token


    def generate_sentence(self, emotion:str, max_length:int=10, smoothing:str='kneser-ney', min_length:int=7):
        '''
        Generate a sentence of the given max_length

        emotion:    target emotion; must be a key of class_to_idx.
        max_length: number of sampling steps; a sampled '</end>' uses up a step too.
        smoothing:  smoothing method used for the sampling weights.
        min_length: while fewer than this many words have been produced, a sampled
                    '</end>' closes the current clause with '.' and sampling restarts
                    from '</start>'; once reached, '</end>' stops generation.
        Returns the generated words joined by single spaces.
        '''
        sentence = []
        context = '</start>'
        length = 0
        for _ in range(max_length):
            token = self.__generate_token(context, emotion, smoothing)
            if token == '</end>':
                if length < min_length:
                    # Only punctuate when a word was produced since the last
                    # boundary; this avoids IndexError on an empty sentence
                    # and doubled periods.
                    if sentence and context != '</start>':
                        sentence[-1] = sentence[-1] + '.'
                    context = '</start>'
                    continue
                else:
                    break
            length += 1
            sentence.append(token)
            context = token
        return ' '.join(sentence)

## Initialize the model

In [76]:
lm = BigramLM_Emotion(corpus, labels)
lm.train()

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 

In [68]:
lm.class_to_idx

{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}

In [103]:
lm.generate_sentence('joy', max_length=30, smoothing='none')

'i feel special gifts. i feel the fact that what i feel good person in the homes that if i feel so feel relieved that core of the drive'

## Save the model

In [81]:
import pickle
model_file = os.path.join(os.getcwd(), 'bigram_lm_emotion.pkl')
# The context manager flushes and closes the file even if pickling fails;
# a bare open() passed to pickle.dump leaves the handle open.
with open(model_file, 'wb') as f:
    pickle.dump(lm, f)

## Load the model

In [2]:
import pickle
import os

model_file = os.path.join(os.getcwd(), 'bigram_lm_emotion.pkl')
# NOTE: unpickling can execute arbitrary code - only load files written by this notebook.
# The class definition of the pickled object must already be available in the kernel.
with open(model_file, 'rb') as f:
    lm_saved = pickle.load(f)

## Generate different emotion sentences using the LM 

In [7]:
all_emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
output_dir = os.path.join(os.getcwd(), 'output')
os.makedirs(output_dir, exist_ok=True)

# For every emotion, keep sampling sentences until `samples` of them are
# classified as that emotion; rejected samples only count towards `trails`.
for emotion in all_emotions:
    print(f'Generating sentences for {emotion}')
    output_file = os.path.join(output_dir, f'gen_{emotion}.txt')
    label_file = os.path.join(output_dir, f'gen_label_{emotion}.txt')

    sentences = []
    outputs = []
    samples = 50
    generated = 0
    trails = 0

    # Both files are opened once per emotion instead of re-opening the label
    # file in append mode for every accepted sentence.
    with open(output_file, 'w') as f, open(label_file, 'w') as f_label:
        # two header lines; the reader of these files skips them
        f_label.write(f'Labels for {emotion}\n\n')

        # NOTE(review): there is no upper bound on the number of attempts; the loop
        # only ends once enough sentences have been accepted.
        while generated < samples:
            sentence = lm.generate_sentence(emotion, max_length=30, smoothing='none')
            emotions = emotion_scores(sentence)
            trails += 1

            # highest-scoring class (first one wins on ties)
            best = max(emotions, key=lambda info: info['score'])
            max_label = best['label']
            max_score = best['score']
            if max_label != emotion:
                continue

            f.write(sentence + '\n')
            f_label.write(max_label + ' ' + str(max_score) + '\n')
            sentences.append(sentence)
            outputs.append(emotions)
            generated += 1

    print(f'Generated {generated} sentences for {emotion} in {trails} trails')
    print(f'Accuracy for {emotion}: {generated/trails*100}\n')

Generating sentences for sadness
Generated 50 sentences for sadness in 51 trails
Accuracy for sadness: 98.0392156862745

Generating sentences for joy
Generated 50 sentences for joy in 54 trails
Accuracy for joy: 92.5925925925926

Generating sentences for love
Generated 50 sentences for love in 55 trails
Accuracy for love: 90.9090909090909

Generating sentences for anger
Generated 50 sentences for anger in 60 trails
Accuracy for anger: 83.33333333333334

Generating sentences for fear
Generated 50 sentences for fear in 50 trails
Accuracy for fear: 100.0

Generating sentences for surprise
Generated 50 sentences for surprise in 50 trails
Accuracy for surprise: 100.0



# Finding the Best SVC Classifier using GridSearchCV

### Vectorization of the data

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer # define vectorizer
import numpy as np

# Fit the TF-IDF vocabulary on the training corpus and vectorise it in one step.
# `vectorizer` is reused later to transform the generated test sentences.
vectorizer = TfidfVectorizer()
sparse_X = vectorizer.fit_transform(corpus)

In [6]:
sparse_X[0,:]

<1x5410 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

### Creating Training data

In [7]:
# true label to its index
labels_dict = {"sadness":0, "love":1, "anger":2, "joy":3, "fear":4, "surprise":5}

# Dense feature matrix (one row per sentence) and integer-encoded targets.
X = sparse_X.toarray()
Y = list(map(labels_dict.__getitem__, labels))

### Creating Testing Data

In [8]:
import os
# Directory holding the generated sentences (named test_dir so the builtin `dir` is not shadowed).
test_dir = os.path.join(os.getcwd(), 'output')

sentence_files = ['gen_sadness.txt', 'gen_love.txt', 'gen_anger.txt', 'gen_joy.txt', 'gen_fear.txt', 'gen_surprise.txt']
label_files = ['gen_label_sadness.txt', 'gen_label_love.txt', 'gen_label_anger.txt', 'gen_label_joy.txt', 'gen_label_fear.txt', 'gen_label_surprise.txt']
X_test = []
Y_test = []

# Local names (gen_sentences / gen_labels) are used on purpose: assigning to
# `labels` here would overwrite the training labels loaded at the top of the
# notebook and break any re-run of the training-data cell.
for sentence_file, gen_label_file in zip(sentence_files, label_files):
    with open(os.path.join(test_dir, sentence_file), 'r') as f:
        gen_sentences = [line.rstrip('\n') for line in f]
    with open(os.path.join(test_dir, gen_label_file), 'r') as f:
        # The first two lines are the header and a blank line; every following
        # line is "<label> <score>" and only the label is kept.
        gen_labels = [line.split()[0] for line in f.readlines()[2:]]

    X_test.extend(gen_sentences)
    Y_test.extend(gen_labels)

print(len(X_test), len(Y_test))

300 300


In [16]:
# Vectorise the generated sentences with the vocabulary fitted on the training corpus.
sparse_X_test = vectorizer.transform(X_test)
# Encode the string labels as integers.
# NOTE: this rebinds Y_test in place, so the cell is not idempotent - running it a
# second time looks up integers in labels_dict and raises KeyError.
Y_test = [labels_dict[label] for label in Y_test]

### Performing Grid Search

In [11]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
# Search grid: 3 kernels x 3 values of C = 9 candidate models.
parameters = {'kernel':('linear', 'rbf', 'poly'), 'C':[1, 10, 100]}
# Base estimator; kernel and C are filled in by the grid search.
svc = svm.SVC(gamma='scale', random_state=32)
# return_train_score=True additionally records training scores in cv_results_.
clf = GridSearchCV(svc, parameters, return_train_score=True)

In [12]:
clf.fit(X,Y)

In [13]:
clf.cv_results_

{'mean_fit_time': array([26.94461274, 25.1282506 , 23.86464734, 26.97547231, 26.74716873,
        28.79105825, 28.14219875, 28.04665427, 28.71511173]),
 'std_fit_time': array([2.95685828, 2.71396016, 0.82983448, 3.11018003, 2.33478695,
        0.21903727, 0.11040663, 0.25434177, 0.14740535]),
 'mean_score_time': array([4.15251136, 5.44599781, 3.64564052, 3.95802684, 7.07484355,
        4.37392488, 4.10864515, 7.6225163 , 4.36852808]),
 'std_score_time': array([0.52845696, 1.52498603, 0.14484277, 0.52546064, 1.46445381,
        0.07239296, 0.03956218, 0.08770424, 0.06181305]),
 'param_C': masked_array(data=[1, 1, 1, 10, 10, 10, 100, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'poly', 'linear', 'rbf', 'poly',
                    'linear', 'rbf', 'poly'],
              mask=[False, False, False, False, False, False,

In [14]:
clf.best_estimator_

In [21]:
clf.best_score_

0.7404166666666667

In [15]:
clf.best_params_

{'C': 1, 'kernel': 'linear'}

### Evaluation on Training data

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [18]:
# Predict on the data the grid search was fitted on (X, Y) and report
# accuracy and macro-averaged F1 on that same training data.
Y_predict_train = clf.predict(X)
acc_train = accuracy_score(Y, Y_predict_train)
f1_train = f1_score(Y, Y_predict_train, average = "macro")

print("Train Accuracy: ", acc_train)
print("Train F1: ", f1_train)

Train Accuracy:  0.98125
Train F1:  0.9812124418156302


### Evaluation on Testing Data


In [19]:
# Score the generated sentences with the grid-search model.
# The training matrix was dense, so the test matrix is densified as well.
Y_predict_test = clf.predict(sparse_X_test.toarray())

acc_test = accuracy_score(Y_test, Y_predict_test)
f1_test = f1_score(Y_test, Y_predict_test, average="macro")
print("Test Accuracy: ", acc_test)
print("Test F1: ", f1_test)

Test Accuracy:  0.98
Test F1:  0.9799277398635087


In [20]:
# Refit a standalone SVC with the hyper-parameters selected by the grid search.
# They are taken from clf.best_params_ rather than typed in by hand, so this cell
# stays consistent if the search grid or the data changes.
best_svc = svm.SVC(gamma='scale', random_state=32, **clf.best_params_)
best_svc.fit(X, Y)
Y_predict_test = best_svc.predict(sparse_X_test.toarray())

acc_test = accuracy_score(Y_test, Y_predict_test)
f1_test = f1_score(Y_test, Y_predict_test, average="macro")
print("Test Accuracy: ", acc_test)
print("Test F1: ", f1_test)

Test Accuracy:  0.98
Test F1:  0.9799277398635087
