In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
import sys

pd.options.display.max_columns = None
sys.path.append('../')
if os.path.abspath(os.pardir) not in sys.path:
    sys.path.append(os.path.abspath(os.pardir))

from preprocessing import Preprocessor
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

ROOT_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(ROOT_DIR, 'data', 'carer_emotion_dataset')
print(DATA_DIR)

label2int = {
  "sadness": 0,
  "joy": 1,
  "love": 2,
  "anger": 3,
  "fear": 4,
  "surprise": 5
}

/home/daphne/PycharmProjects/sentiment_task/data/carer_emotion_dataset


## Load dataset

In [2]:
training_data = pd.read_csv(os.path.join(DATA_DIR, 'training.csv'), encoding='utf8')
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), encoding='utf8')
validation_data = pd.read_csv(os.path.join(DATA_DIR, 'validation.csv'), encoding='utf8')

dataset = pd.concat([training_data, test_data, validation_data], ignore_index=True)
dataset = dataset.sample(frac=1)
print(dataset.head())
print(dataset.shape)
print(dataset.columns)

                                                    text  label
18169  i begged her to come in the house with me when...      0
440    i feel indecisive on whether or not i feel the...      4
17785  i am feeling the past few days a little distre...      4
17844  i must add the crowd was similar to last night...      2
5675   i often used the word poggy when we were growi...      0
(20000, 2)
Index(['text', 'label'], dtype='object')


In [3]:
corpus = []
for i, row in dataset.iterrows():
    preprocessor = Preprocessor(row['text'])
    words_list = preprocessor.get_preprocessed_list_words()
    text = ' '.join(words_list)
    corpus.append(text)
print(corpus[0:10])
# vectorizer = TfidfVectorizer()
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

print(X.shape)
y = dataset['label'].copy()

['begged come house got back left right away feeling distinctly unwelcome', 'feel indecisive whether feel book huckleberry finn censored', 'feeling past day little distressed writing much', 'must add crowd similar last night except much laid back stoner feel supporting cast parent escaped kid', 'often used word poggy growing together feeling particularly ugly generally good day want stay bed hide outside world', 'feel totally listless exam come gone whole five month front uni free time', 'feel insulted', 'feeling particularly discouraged little weve seen lately decided needed stop negative instead refocus thought remember many thing grateful right', 'feel honored amazing son celebrate', 'feel messy room']
(20000, 15080)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)

print(f'Training mean accuracy: {train_score}')
print(f'Testing mean accuracy: {test_score}')

Training mean accuracy: 0.7371333333333333
Testing mean accuracy: 0.677


In [5]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[1327,   86,    0,    0,    0,    0],
       [  49, 1708,    0,    0,    1,    0],
       [  65,  279,   26,    0,    0,    0],
       [ 245,  252,    0,  198,    3,    0],
       [ 227,  223,    0,    7,  126,    0],
       [  61,  108,    0,    0,    9,    0]])