In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import copy
import random

## Reading the data from file and splitting it into train and test parts

In [2]:
# Load the poem lines (the classifier below reads their `text` and `label` columns)
# and hold out 20% of the rows as a test set.
poams_data = pd.read_csv("Data/train_test.csv")
# A fixed random_state keeps the shuffled split reproducible between runs.
X_train, X_test = train_test_split(poams_data, test_size=0.2, random_state=12347, shuffle = True)


## Implementing Train class
This class has the methods `get_words`, `calculate_probabilities`, `get_max_result`, `find_unknown_indexes` and `estimate`.
First, `get_words` builds a dictionary of every word that occurs in Hafez's lines together with its number of occurrences, and a second dictionary of exactly the same form for Saadi.
Then `calculate_probabilities` computes the probability of each word in each dictionary from its number of occurrences in that dictionary.
Finally, `estimate` predicts the label of a line of poetry from the trained data. If a word exists in neither dictionary, we ignore it and do not take its probability into account; if a word is missing from only one of the two dictionaries, we use zero as its probability for that poet.

In [3]:
class Train:
    """Bag-of-words classifier that labels a line of poetry "hafez" or "saadi".

    Usage: ``Train(df)``, then ``get_words()``, then
    ``calculate_probabilities()``, then ``estimate(text)`` for each line.

    ``data`` must provide a ``text`` column (whitespace-separated words) and a
    ``label`` column. Only rows labelled "hafez" or "saadi" are counted.
    """

    # Zero-count placeholder that ``get_words`` stores in both vocabularies.
    # Its probability is what ``estimate`` uses for a word seen for only one
    # poet: 0 in this class, or whatever a subclass's
    # ``calculate_probabilities`` assigns to a zero count.
    # NOTE(review): a real token spelled exactly like this would have its
    # count reset to 0 by ``get_words``.
    UNKNOWN_TOKEN = "fatemeh"

    def __init__(self, data):
        # Work on a private copy so the caller's frame is never touched.
        self.data = copy.deepcopy(data)

        # word -> number of occurrences, per poet
        self.hafez_words = dict()
        self.saadi_words = dict()

        # total number of word occurrences, per poet
        self.saadi_word_number = 0
        self.hafez_word_number = 0

        # word -> probability of the word given the poet
        self.hafez_probabs = dict()
        self.saadi_probabs = dict()

        # prior probability of each poet (share of training lines)
        self.saadi_self_probab = 0
        self.hafez_self_probab = 0

        # number of training lines, per poet
        self.hafez_lines = 0
        self.saadi_lines = 0

    @staticmethod
    def _count_words(text, counts):
        """Add the words of ``text`` to ``counts``; return how many words it had."""
        words = text.split()
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        return len(words)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
        return numerator / denominator if denominator else 0.0

    def get_words(self):
        """Count lines and word occurrences for each poet.

        All counters are reset first, so calling this method again recounts
        from scratch instead of doubling every count.
        """
        self.hafez_words.clear()
        self.saadi_words.clear()
        self.hafez_word_number = 0
        self.saadi_word_number = 0
        self.hafez_lines = 0
        self.saadi_lines = 0

        for row in self.data.itertuples(index=False):
            if row.label == "hafez":
                self.hafez_lines += 1
                self.hafez_word_number += self._count_words(row.text, self.hafez_words)
            elif row.label == "saadi":
                self.saadi_lines += 1
                self.saadi_word_number += self._count_words(row.text, self.saadi_words)
            # rows with any other label contribute nothing

        # Placeholder entry with count 0; see UNKNOWN_TOKEN.
        self.hafez_words[self.UNKNOWN_TOKEN] = 0
        self.saadi_words[self.UNKNOWN_TOKEN] = 0

    def calculate_probabilities(self):
        """Turn the counts from ``get_words`` into priors and word probabilities.

        A poet with no training lines or no training words gets probability
        0.0 instead of raising ``ZeroDivisionError``.
        """
        total_lines = self.hafez_lines + self.saadi_lines
        self.hafez_self_probab = self._ratio(self.hafez_lines, total_lines)
        self.saadi_self_probab = self._ratio(self.saadi_lines, total_lines)

        self.hafez_probabs = {
            word: self._ratio(count, self.hafez_word_number)
            for word, count in self.hafez_words.items()
        }
        self.saadi_probabs = {
            word: self._ratio(count, self.saadi_word_number)
            for word, count in self.saadi_words.items()
        }

    def get_max_result(self, hafez_estimation, saadi_estimation):
        """Return the label with the larger score; a tie goes to "hafez"."""
        return "hafez" if hafez_estimation >= saadi_estimation else "saadi"

    def find_unknown_indexes(self, words):
        """Return the positions of words that occur in neither vocabulary."""
        return [
            position
            for position, word in enumerate(words)
            if word not in self.hafez_words and word not in self.saadi_words
        ]

    def estimate(self, text):
        """Predict "hafez" or "saadi" for one line of text.

        Words unknown to both poets are skipped. A word known to only one poet
        uses the placeholder probability for the other poet. Each score is the
        product of the word probabilities multiplied by the poet's prior.
        """
        words = text.split()
        hafez_estimation = 1
        saadi_estimation = 1
        # A set makes the per-word membership test constant time.
        skipped = set(self.find_unknown_indexes(words))

        for position, word in enumerate(words):
            if position in skipped:
                continue
            hafez_key = word if word in self.hafez_words else self.UNKNOWN_TOKEN
            saadi_key = word if word in self.saadi_words else self.UNKNOWN_TOKEN
            hafez_estimation *= self.hafez_probabs[hafez_key]
            saadi_estimation *= self.saadi_probabs[saadi_key]

        saadi_estimation *= self.saadi_self_probab
        hafez_estimation *= self.hafez_self_probab
        return self.get_max_result(hafez_estimation, saadi_estimation)


In [4]:
# Fit the baseline (unsmoothed) model on the training split:
# count the words per poet, then convert the counts into probabilities.
t = Train(X_train)
t.get_words()
t.calculate_probabilities()

## Defining a test function

In [5]:
def test(X_test, t):
    correct_detected_hafez = 0
    correct_detected_saadi = 0
    all_hafez = 0 
    all_saadi = 0
    detected_hafez = 0
    detected_saadi = 0
    total = 0
    for i, j in X_test.iterrows():
        total += 1
        out = t.estimate(j.text)

        if j.label == "saadi":
            all_saadi += 1
            if out == "saadi":
                correct_detected_saadi += 1
                detected_saadi += 1

            if out == "hafez":
                detected_hafez += 1

        if j.label == "hafez":
            all_hafez += 1
            if out == "hafez":
                correct_detected_hafez += 1
                detected_hafez += 1
            if out == "saadi":
                detected_saadi += 1
    return correct_detected_hafez, correct_detected_saadi, all_hafez, all_saadi, detected_hafez, detected_saadi, total


## Calculating recall for each label

In [6]:
# Score the baseline model `t` on the held-out split.
correct_detected_hafez,correct_detected_saadi,all_hafez,all_saadi,detected_hafez,detected_saadi,total = test(X_test, t)
# Recall = correctly predicted lines of a poet / all lines truly by that poet.
saadi_Recall = correct_detected_saadi / all_saadi
hafez_Recall = correct_detected_hafez / all_hafez
print("saadi Recall is = ", saadi_Recall)
print("hafez Recall is = ", hafez_Recall)

saadi Recall is =  0.8194888178913738
hafez Recall is =  0.7347670250896058


## Calculating precision for each label

In [7]:
# Precision = correctly predicted lines of a poet / all lines predicted as that poet.
# Uses whatever counters the most recently executed test(...) cell left in scope.
hafez_precision = correct_detected_hafez / detected_hafez
saadi_precision = correct_detected_saadi / detected_saadi
print("saadi precision is = ", saadi_precision)
print("hafez precision is = ", hafez_precision)

saadi precision is =  0.8221153846153846
hafez precision is =  0.7312722948870393


## Calculating accuracy 

In [8]:
# Accuracy = correctly predicted lines (both poets) / all evaluated lines.
accuracy = (correct_detected_hafez + correct_detected_saadi) / total
print(accuracy)

0.7855433221637147


In [14]:
# Predict a poet for every row of the evaluation file with the baseline model `t`.
eval_data = pd.read_csv("Data/evaluate.csv")
labels = [t.estimate(row.text) for _, row in eval_data.iterrows()]

In [15]:
# Attach the predictions and write the frame back to the evaluation file.
# NOTE(review): this overwrites the CSV that was read as input, so a re-run of the
# notebook loads a file that already contains an `estimated_label` column.
eval_data["estimated_label"] = labels
# `index=False` is the documented boolean way to omit the row index; the return
# value is not kept because `to_csv` returns None when it is given a path.
eval_data.to_csv("Data/evaluate.csv", index=False, header=True)

## Implementing Laplace smoothing

In [16]:
class Laplas(Train):
    """`Train` variant whose word probabilities use additive (Laplace) smoothing.

    Each probability is ``(count + smoothing) / (total_words + smoothing * V)``,
    where ``V`` is the number of entries in that poet's own word dictionary.
    Zero-count entries therefore receive a small non-zero probability instead
    of 0.
    """

    # Additive smoothing constant; override on a subclass or an instance to tune it.
    smoothing = 0.4

    def calculate_probabilities(self):
        """Compute the poet priors and the smoothed per-word probabilities."""
        total_lines = self.hafez_lines + self.saadi_lines
        # Guard against a model that has not counted any labelled line yet.
        self.hafez_self_probab = self.hafez_lines / total_lines if total_lines else 0.0
        self.saadi_self_probab = self.saadi_lines / total_lines if total_lines else 0.0

        a = self.smoothing

        # The denominators do not depend on the word, so compute them once.
        hafez_denominator = self.hafez_word_number + (a * len(self.hafez_words))
        saadi_denominator = self.saadi_word_number + (a * len(self.saadi_words))

        for w, count in self.hafez_words.items():
            self.hafez_probabs[w] = (count + a) / hafez_denominator if hafez_denominator else 0.0
        for w, count in self.saadi_words.items():
            self.saadi_probabs[w] = (count + a) / saadi_denominator if saadi_denominator else 0.0
        

In [17]:
# Fit the smoothed model on the same training split as the baseline.
l = Laplas(X_train)
l.get_words()
l.calculate_probabilities()

## Calculating recall for Train based on Laplace

In [18]:
# Score the smoothed model `l` on the held-out split; this rebinds the counters
# that the baseline evaluation cell created.
correct_detected_hafez,correct_detected_saadi,all_hafez,all_saadi,detected_hafez,detected_saadi,total = test(X_test, l)
# Recall = correctly predicted lines of a poet / all lines truly by that poet.
saadi_Recall = correct_detected_saadi / all_saadi
hafez_Recall = correct_detected_hafez / all_hafez
print("saadi Recall is = ", saadi_Recall)
print("hafez Recall is = ", hafez_Recall)

saadi Recall is =  0.8514376996805112
hafez Recall is =  0.7479091995221028


## Calculating precision for Train based on Laplace

In [19]:
# Precision = correctly predicted lines of a poet / all lines predicted as that poet.
# Uses whatever counters the most recently executed test(...) cell left in scope.
hafez_precision = correct_detected_hafez / detected_hafez
saadi_precision = correct_detected_saadi / detected_saadi
print("saadi precision is = ", saadi_precision)
print("hafez precision is = ", hafez_precision)

saadi precision is =  0.8347689898198903
hafez precision is =  0.770935960591133


## Calculating accuracy for Train based on Laplace

In [20]:
# Accuracy = correctly predicted lines (both poets) / all evaluated lines.
accuracy = (correct_detected_hafez + correct_detected_saadi) / total
print(accuracy)

0.8099569171852561


In [21]:
# Re-load the evaluation file, which an earlier cell rewrote with an `estimated_label` column.
data = pd.read_csv('Data/evaluate.csv');

In [22]:
# Remove the `text` column from `data` in place.
# NOTE(review): because the drop is in place, running this cell a second time
# without re-loading `data` looks like it would fail on the missing column — confirm.
data.drop(axis = 1 ,columns = 'text' , inplace = True)

In [23]:
# Write the remaining columns to output.csv; `index` is left at its default,
# so the row index is written as the first column.
data.to_csv("output.csv")