Importing the Libraries

In [None]:
import nltk
!pip install smart_open
from smart_open import open
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import *
from textblob.classifiers import NaiveBayesClassifier
from sklearn.model_selection import KFold
from nltk.classify.naivebayes import NaiveBayesClassifier
from gensim import corpora, models, similarities
from nltk.corpus import wordnet as wn
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Reading the Dataset

In [None]:
# Load the raw ISEAR data from 'ISEAR_raw.csv' into `Dataset`.
# header=None: the file is read without a header row, so pandas labels the
# columns with integers (the preview below shows column 0 = emotion label,
# column 1 = free-text sentence).
Dataset = pd.read_csv('ISEAR_raw.csv',header=None)

# Emotion labels to be detected - det_emo
det_emo = ['anger', 'disgust', 'fear', 'guilt', 'joy', 'sadness', 'shame']

# Negation / contrast cue words - neg_words
# NOTE(review): 'even though' is a two-word phrase and will never equal a single
# token if this list is matched against tokenized text - confirm intended use.
neg_words = ['not', 'neither', 'nor', 'never', 'but', 'however', 'although', 'nonetheless', 'despite', 'except', 'even though', 'yet']

Displaying Dataset

In [None]:
Dataset.head(10)

Unnamed: 0,0,1,2
0,joy,On days when I feel close to my partner and ot...,
1,fear,Every time I imagine that someone I love or I ...,
2,anger,When I had been obviously unjustly treated and...,
3,sadness,When I think about the short time that we live...,
4,disgust,At a gathering I found myself involuntarily si...,
5,shame,When I realized that I was directing the feeli...,
6,guilt,I feel guilty when when I realize that I consi...,
7,joy,After my girlfriend had taken her exam we went...,
8,fear,"When, for the first time I realized the meanin...",
9,anger,When a car is overtaking another and I am forc...,


Cleaning the Dataset, Stemming, POS-TAGGER

In [None]:
#Removes unnecessary characters from sentences
#Cleaning of Data
def removal(sentences):
  """Tokenize a sentence and drop unwanted tokens.

  Args:
    sentences: A single text string (passed to ``nltk.word_tokenize``).

  Returns:
    list: The tokens, in order, excluding any token that exactly equals one
    of the entries in the filter list (punctuation, brackets, newline and a
    few encoding-residue strings).
  """
  # Tokens are compared whole, so e.g. "," is dropped but "a,b" would be kept.
  unwanted = ["รก", "\xc3", "\xa1", "\n", ",", ".", "[", "]", ""]
  return [token for token in nltk.word_tokenize(sentences) if token not in unwanted]


#POS-TAGGER and returns NAVA words
def pos_tag(sentences):
  """POS-tag a token list and keep only the NAVA words.

  A token is kept when its tag starts with 'NN', 'JJ', 'VB' or 'RB'.

  Args:
    sentences: A list of tokens (passed to ``nltk.pos_tag``).

  Returns:
    tuple: ``(nava, nava_words)`` where ``nava`` is the list of kept
    ``(token, tag)`` pairs and ``nava_words`` is the list of kept tokens,
    both in input order.
  """
  nava_prefixes = ('NN', 'JJ', 'VB', 'RB')
  nava = [pair for pair in nltk.pos_tag(sentences) if pair[1].startswith(nava_prefixes)]
  nava_words = [pair[0] for pair in nava]
  return nava, nava_words


#Performs stemming
def stemming(sentences):
  """Lower-case, length-filter and Porter-stem a list of words.

  Words whose lower-cased form has fewer than 3 characters are dropped.

  Args:
    sentences: An iterable of word strings.

  Returns:
    tuple: ``(word_text, st, word_set)`` where ``st`` is the stems joined
    into one string (each stem followed by a single space), ``word_set`` is
    ``nltk.word_tokenize(st)`` and ``word_text`` is ``nltk.Text(word_set)``.
  """
  stemmer = PorterStemmer()
  lowered = (word.lower() for word in sentences)
  stems = [stemmer.stem(word) for word in lowered if len(word) >= 3]
  # Every stem is followed by a space, so a non-empty result ends with one.
  st = "".join(stem + " " for stem in stems)
  word_set = nltk.word_tokenize(st)
  return nltk.Text(word_set), st, word_set
#   return sentence_list, sen_string, sen_token


'''def removal(input_text):
    a=[]
    a1 = re.sub(r'@\w+', '', input_text)
    a2 = re.sub(r'http.?://[^\s]+[\s]?', '', a1)
    punct = string.punctuation
    trantab = str.maketrans(punct, len(punct)*' ')  # Every punctuation symbol will be replaced by a space
    a3 = a2.translate(trantab)
    a4 = re.sub('\d+', '', a3)
    a4 = a4.lower()
    return a4'''

"def removal(input_text):\n    a=[]\n    a1 = re.sub(r'@\\w+', '', input_text)\n    a2 = re.sub(r'http.?://[^\\s]+[\\s]?', '', a1)\n    punct = string.punctuation\n    trantab = str.maketrans(punct, len(punct)*' ')  # Every punctuation symbol will be replaced by a space\n    a3 = a2.translate(trantab)\n    a4 = re.sub('\\d+', '', a3)\n    a4 = a4.lower()\n    return a4"

Functions

In [None]:
#Write to file
def write_to_file(filename, text):
  """Write ``str(text)`` to ``filename``, replacing any existing content.

  Args:
    filename: Path passed to ``open`` in write mode.
    text: Any object; its ``str()`` form is what gets written.
  """
  # The context manager closes the handle even when write() raises, which the
  # previous open/write/close sequence did not guarantee.
  with open(filename, 'w') as out:
    out.write(str(text))


#Reads the emotion representative words file
def readfile(filename):
  """Read a word-list file, one entry per line, with all whitespace removed.

  Every newline, space, carriage return and tab is removed from each line
  (anywhere in the line, not only at the ends). Blank lines yield an empty
  string entry.

  Args:
    filename: Path passed to ``open`` in read mode.

  Returns:
    list: One cleaned string per line of the file, in file order.
  """
  strip_chars = ("\n", " ", "\r", "\t")
  # `with` guarantees the handle is closed; the original never closed it.
  with open(filename, 'r') as f:
    return [''.join(ch for ch in line if ch not in strip_chars) for line in f]


#Makes a list of all words semantically related to an emotion and Stemming
def affect_wordlist(words):
  """Lower-case and Porter-stem words, returning the unique stems.

  Args:
    words: An iterable of word strings.

  Returns:
    list: The distinct stems in order of first appearance.
  """
  stemmer = PorterStemmer()
  stems = (stemmer.stem(word.lower()) for word in words)
  # dict.fromkeys de-duplicates in insertion order with O(1) lookups, avoiding
  # the quadratic `x not in list` scan.
  return list(dict.fromkeys(stems))


#Creating an emotion wordset
def emotion_word_set(emotions):
  """Build a mapping from each emotion to its stemmed representative words.

  Each entry of ``emotions`` is passed to ``readfile`` as the file name, and
  the words read are passed through ``affect_wordlist``.

  Args:
    emotions: An iterable of emotion names.

  Returns:
    dict: ``{emotion: list_of_unique_stems}``.
  """
  return {name: affect_wordlist(readfile(name)) for name in emotions}


#Emotion Detector - Getting synonyms from wordnet synsets
def get_synonyms(emotions=None):
  """Collect WordNet lemma names for each emotion word.

  Args:
    emotions: Optional iterable of emotion words. Defaults to the
      notebook-level ``det_emo`` list (the original body referenced an
      undefined name, ``emotion_labels``).

  Returns:
    dict: ``{emotion: [lemma_name, ...]}`` with the lemma names of every
    synset of the emotion concatenated into one flat list, in synset order.
    An emotion with no synsets maps to an empty list.
  """
  if emotions is None:
    emotions = det_emo
  syn = {}
  for e in emotions:
    names = syn.setdefault(e, [])
    for s in wn.synsets(e):
      # extend (not append) keeps the list flat; the original stored the first
      # synset's names flat and every later synset as a nested list.
      names.extend(s.lemma_names())
  # The original built `syn` but never returned it.
  return syn

                
#Emotion Detector - Creating training/testing set for Naive Bayes classifier TextBlob -- Not used
def create_dataset_textblob(sentences, emotions):
  """Pair each sentence with its emotion label for a TextBlob classifier.

  This function was previously defined twice with identical bodies; the
  second definition silently shadowed the first, so one definition is kept.

  Args:
    sentences: An iterable of sentences; each is converted with ``str``.
    emotions: An iterable of labels, paired with sentences by position.
      Extra labels beyond the number of sentences are ignored.

  Returns:
    list: ``[(str(sentence), emotion), ...]`` in input order.

  Raises:
    IndexError: If there are fewer emotions than sentences.
  """
  sen = list(sentences)
  emo = list(emotions)
  return [(str(s), emo[i]) for i, s in enumerate(sen)]


#Create dataset for nltk Naive Bayes
def create_data(sentence, emotion):
  """Build ``(tokens, label)`` pairs for the nltk Naive Bayes classifier.

  Items are paired by position. The original indexed with ``sentence[i]``,
  which on a pandas Series is a label lookup and raises ``KeyError`` for a
  slice whose index does not start at 0; iterating positionally matches how
  ``create_test`` already handles such slices.

  Args:
    sentence: A sequence (list or pandas Series) of token sequences.
    emotion: A sequence of labels, paired with ``sentence`` by position.
      Extra labels beyond the number of sentences are ignored.

  Returns:
    list: ``[([str(token), ...], emotion), ...]`` in input order.

  Raises:
    IndexError: If there are fewer emotions than sentences.
  """
  token_lists = list(sentence)
  labels = list(emotion)
  return [([str(tok) for tok in tokens], labels[i])
          for i, tokens in enumerate(token_lists)]


#Get all words in dataset
def get_words_in_dataset(dataset):
  """Flatten the word lists of a labelled dataset into one list.

  Args:
    dataset: An iterable of ``(words, label)`` pairs.

  Returns:
    list: Every word from every pair, in order, duplicates included.
  """
  return [word for words, _label in dataset for word in words]


#Getting frequency dist of words
def get_word_features(wordlist):
  """Return the distinct words of ``wordlist`` as feature names.

  Args:
    wordlist: An iterable of words.

  Returns:
    The ``keys()`` view of ``nltk.FreqDist(wordlist)``.
  """
  frequencies = nltk.FreqDist(wordlist)
  return frequencies.keys()


#Testing for Naive Bayes Classifier
def testing(cl, test):
  for s, e in test:
    r = cl.classify(s)
    print(s, e, r)
    if r == e:
      print("*")
            
            
#Extracting features
def extract_features(document):
  """Turn a tokenized document into a bag-of-words presence feature dict.

  Reads the notebook-level ``word_features`` collection, which must be
  defined before this function is called.

  Args:
    document: An iterable of tokens.

  Returns:
    dict: ``{'contains(<word>)': bool}`` for every word in ``word_features``,
    True when the word occurs in ``document``.
  """
  present = set(document)
  return {'contains(%s)' % word: (word in present) for word in word_features}


#Create test data
def create_test(sentence, emotion):
  """Build ``[text, label]`` rows for evaluating the classifier.

  Args:
    sentence: An iterable of sentences; each is converted with ``str``.
    emotion: An iterable of labels, paired with sentences by position.
      Extra labels beyond the number of sentences are ignored.

  Returns:
    list: ``[[str(sentence), emotion], ...]`` in input order.

  Raises:
    IndexError: If there are fewer emotions than sentences.
  """
  texts = [str(s) for s in sentence]
  labels = list(emotion)
  return [[text, labels[i]] for i, text in enumerate(texts)]

Creating the Dataframe

In [None]:
#Creating the dataframe
def create_frame(Data):
  """Run the cleaning pipeline over every row and collect the results.

  For each row, column 0 is taken as the emotion label and column 1 as the
  raw text; the text goes through ``removal`` -> ``pos_tag`` -> ``stemming``.

  Args:
    Data: A DataFrame-like object indexable as ``Data[0][i]`` /
      ``Data[1][i]`` for ``i`` in ``range(len(Data))``.

  Returns:
    tuple: ``(df, sen_tok, labels, sen_str)`` where ``df`` has columns
    0 = label, 1 = first value returned by ``stemming``, 2 = stemmed string,
    3 = stemmed token list, 4 = label wrapped in a one-element list, and the
    other three items are the plain lists behind columns 3, 0 and 2.
  """
  labels, sen, sen_str, sen_tok = [], [], [], []
  for i in range(len(Data)):
    emotion = Data[0][i]
    raw_text = Data[1][i]
    labels.append(emotion)
    # pos_tag also returns the (word, tag) pairs; only the words are used here.
    _tagged, nava_words = pos_tag(removal(raw_text))
    text_obj, stem_string, stem_tokens = stemming(nava_words)
    sen.append(text_obj)
    sen_str.append(stem_string)
    sen_tok.append(stem_tokens)
  labelset = [[label] for label in labels]
  df = pd.DataFrame({0: labels, 1: sen, 2: sen_str, 3: sen_tok, 4: labelset})
  return df, sen_tok, labels, sen_str

Displaying result of calling create_frame function

In [None]:
# Run the cleaning pipeline over the whole dataset.
# c           - DataFrame of processed rows
# st          - stemmed token list per row
# labels      - emotion label per row
# review_sent - stemmed sentence string per row
c, st, labels, review_sent = create_frame(Dataset)

In [None]:
c

Unnamed: 0,0,1,2,3,4
0,joy,"(day, feel, close, partner, other, friend, fee...",day feel close partner other friend feel peac ...,"[day, feel, close, partner, other, friend, fee...",[joy]
1,fear,"(time, imagin, someon, love, contact, seriou, ...",time imagin someon love contact seriou ill eve...,"[time, imagin, someon, love, contact, seriou, ...",[fear]
2,anger,"(had, been, obvious, unjustli, treat, had, pos...",had been obvious unjustli treat had possibl el...,"[had, been, obvious, unjustli, treat, had, pos...",[anger]
3,sadness,"(think, short, time, live, relat, period, life...",think short time live relat period life think ...,"[think, short, time, live, relat, period, life...",[sadness]
4,disgust,"(gather, found, involuntarili, sit, next, peop...",gather found involuntarili sit next peopl expr...,"[gather, found, involuntarili, sit, next, peop...",[disgust]
...,...,...,...,...,...
7511,shame,"(year, back, someon, invit, tutor, grand-daugh...",year back someon invit tutor grand-daught gran...,"[year, back, someon, invit, tutor, grand-daugh...",[shame]
7512,shame,"(had, taken, respons, someth, had, prepar, how...",had taken respons someth had prepar howev fail...,"[had, taken, respons, someth, had, prepar, how...",[shame]
7513,fear,"(wa, home, heard, loud, sound, spit, door, tho...",wa home heard loud sound spit door thought fam...,"[wa, home, heard, loud, sound, spit, door, tho...",[fear]
7514,guilt,"(did, not, homework, teacher, had, ask, wa, sc...",did not homework teacher had ask wa scold immedi,"[did, not, homework, teacher, had, ask, wa, sc...",[guilt]


Defining Function for Classifier

In [None]:
#Classifier
def classify_dataset(data):
  """Classify one text string with the notebook-level ``classifier``.

  Args:
    data: A text string; it is tokenized with ``nltk.word_tokenize`` and
      converted to features with ``extract_features``.

  Returns:
    The label returned by ``classifier.classify``.
  """
  tokens = nltk.word_tokenize(data)
  features = extract_features(tokens)
  return classifier.classify(features)


#Get accuracy
def get_accuracy(test_data, classifier):
  """Return the percentage of test rows the classifier labels correctly.

  The ``classifier`` argument is the one actually used; the original ignored
  it and always went through the notebook-level ``classifier`` global.

  Args:
    test_data: A sized collection of rows where ``row[0]`` is the text and
      ``row[1]`` is the expected label.
    classifier: An object with a ``classify(features)`` method.

  Returns:
    float: Accuracy in percent (0-100). ``0.0`` for an empty ``test_data``,
    which previously raised ``ZeroDivisionError``.
  """
  total = float(len(test_data))
  if total == 0:
    return 0.0
  correct = total
  for data in test_data:
    # Same text -> tokens -> features path that classify_dataset uses.
    features = extract_features(nltk.word_tokenize(data[0]))
    if classifier.classify(features) != data[1]:
      correct -= 1
  return correct / total * 100

Training and Testing

In [None]:
# Create training and testing data
# c[3] holds the stemmed token lists, c[0] the emotion labels and c[2] the
# stemmed sentence strings (see create_frame).
sen = c[3]
emo = c[0]
l = len(c[3])
# First 90% of the rows (in file order, no shuffling) train the model; the
# remaining rows are held out for testing.
limit = (9*l)//10
sente = c[2]
# Training rows are (token_list, label) pairs; test rows are [string, label]
# lists whose string is re-tokenized at classification time.
Data = create_data(sen[:limit], emo[:limit])
test_data = create_test(sente[limit:], emo[limit:])


# extract the word features out from the training data
# (extract_features reads this module-level name, so it must exist before training)
word_features = get_word_features(get_words_in_dataset(Data))


# get the training set and train the Naive Bayes Classifier
# NaiveBayesClassifier here is the nltk class: the nltk import in the first
# cell comes after the textblob import of the same name and shadows it.
training_set = nltk.classify.util.apply_features(extract_features, Data)
classifier = NaiveBayesClassifier.train(training_set)

Final result (accuracy)

In [None]:
# Evaluate the trained classifier on the held-out rows; `res` is the accuracy
# as a percentage.
res = get_accuracy(test_data, classifier)
print("Accuracy using Naive Bayes Component  ", res, "%")

Accuracy using Naive Bayes Component   63.16489361702128 %
