In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
import sys
import numpy as np

# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

# Make the notebook's parent directory importable so the project-local
# `preprocessing` module can be found. Only the absolute path is registered,
# and only when it is missing, so re-running this cell no longer piles up
# duplicate, cwd-relative '../' entries in sys.path.
_parent_dir = os.path.abspath(os.pardir)
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from preprocessing import Preprocessor
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pickle as pkl

from keras.models import Sequential
from keras import layers
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf

import matplotlib.pyplot as plt


import warnings

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# The project root is taken to be the parent of the current working
# directory; the CSV files are read from <root>/data/carer_emotion_dataset.
ROOT_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(ROOT_DIR, 'data', 'carer_emotion_dataset')
print(DATA_DIR)

# Emotion name -> integer class id; ids follow the order of the names below.
label2int = {
    name: class_id
    for class_id, name in enumerate(
        ("sadness", "joy", "love", "anger", "fear", "surprise")
    )
}

2021-12-17 18:27:17.401663: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-17 18:27:17.401686: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


/home/daphne/PycharmProjects/sentiment_task/data/carer_emotion_dataset


## Load dataset

In [2]:
# Read the three CSV splits found in DATA_DIR.
training_data = pd.read_csv(os.path.join(DATA_DIR, 'training.csv'), encoding='utf8')
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), encoding='utf8')
validation_data = pd.read_csv(os.path.join(DATA_DIR, 'validation.csv'), encoding='utf8')

# Merge the splits into a single frame with a fresh 0..n-1 index.
dataset = pd.concat([training_data, test_data, validation_data], ignore_index=True)

# Shuffle the rows with a fixed seed. The previous shuffle drew from the
# unseeded global NumPy RNG, so the row order (and therefore every downstream
# split and score) changed on each run. As before, the shuffled rows keep
# their original index labels.
dataset = dataset.sample(frac=1, random_state=0)

print(dataset.head())
print(dataset.shape)
print(dataset.columns)
print(dataset.label.unique())

                                                    text  label
3216   im not afraid of going on my own but i feel li...      1
1409   i feel myself getting agitated over something ...      3
10356  i didnt feel threatened or concerned really bu...      4
7393   i was still feeling crappy but hoped it was ju...      0
12083  i feel even more strongly now that this can be...      1
(20000, 2)
Index(['text', 'label'], dtype='object')
[1 3 4 0 2 5]


In [4]:
# Clean every text with the project's Preprocessor and re-join the returned
# words into one space-separated string per document.
# The 'text' column is iterated directly: DataFrame.iterrows() would build a
# full Series for each row only to read this single field.
corpus = [
    ' '.join(Preprocessor(text).get_preprocessed_list_words())
    for text in dataset['text']
]
print(corpus[0:10])

# Bag-of-words counts: one row per document, one column per vocabulary term.
# (TfidfVectorizer, imported above, is a drop-in alternative.)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(X.shape)

# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split further down, so held-out texts contribute vocabulary
# terms. Confirm this is intended.
y = dataset['label'].copy()

['feel rich comment', 'want really love book social thought provoke personal history thing leave feeling disappointed one', 'left office feeling discourage', 'swear make feel lot good', 'feel selfish time want escape day day feel like think kid take instead', 'feel delight stay manila near end feel reason', 'havent feel homesick know get together enjoy mum cook make want teleporter', 'im feel rebellious need something relieve turmoil body', 'feel capture peaceful serenity relax invite pine lake', 'feel lethargic find reason move even full bladder threaten burst']
['aa' 'aaaaaaand' 'aaaaand' ... 'zum' 'zumba' 'zz']


TypeError: unhashable type: 'slice'

In [6]:
# Term -> column-index mapping learned by the fitted vectorizer.
vocab = vectorizer.vocabulary_
# Report only the size; printing the whole mapping floods the cell output.
print(f'Vocabulary size: {len(vocab)}')

# Persist the vocabulary next to the models. The context manager closes the
# file even if pickling raises, which the manual open()/close() pair did not.
with open(os.path.join(ROOT_DIR, 'models/multinomial_nb_vocabulary.pkl'), 'wb') as pickle_out:
    pkl.dump(vocab, pickle_out)




In [None]:
# Hold out 25% of the samples for testing; the fixed random_state makes the
# split repeatable for a given X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Fit a multinomial Naive Bayes classifier on the training counts
# (fit() returns the estimator itself).
bayes_model = MultinomialNB().fit(X_train, y_train)

# Mean accuracy on both partitions, plus the held-out predictions that the
# next cell feeds into the confusion matrix and classification report.
train_score = bayes_model.score(X_train, y_train)
test_score = bayes_model.score(X_test, y_test)
y_pred = bayes_model.predict(X_test)

print(f'Training mean accuracy: {train_score}')
print(f'Testing mean accuracy: {test_score}')

In [None]:
# Per-class error breakdown on the held-out test set.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 10-fold cross-validated accuracy on the training partition.
scores = cross_val_score(bayes_model, X_train, y_train, cv = 10, scoring='accuracy')

print('Cross-validation scores:{}'.format(scores))

# Persist the fitted classifier. The context manager closes the file even if
# pickling raises, which the manual open()/close() pair did not.
with open(os.path.join(ROOT_DIR,'models/multinomial_nb_model.pkl'), 'wb') as pickle_out:
    pkl.dump(bayes_model, pickle_out)

In [None]:
# One-hot encode the integer labels for the softmax network below.
# num_classes is pinned to the number of known emotions: without it,
# to_categorical sizes the output from the largest label present, so a split
# that happened to miss the highest class would yield fewer columns than the
# network's 6 output units.
num_classes = len(label2int)
y_train_modified = to_categorical(y_train, num_classes=num_classes)
y_test_modified = to_categorical(y_test, num_classes=num_classes)
print(y_train.shape)
print(y_train_modified)

In [None]:
# One input feature per bag-of-words column.
input_dim = X_train.shape[1]

# Reset Keras' global state before building, so re-running this cell starts
# from a clean session.
tf.keras.backend.clear_session()

# Fully connected classifier: three ReLU layers, each followed by 20%
# dropout, ending in a 6-way softmax.
model = Sequential([
    layers.Dense(32, input_dim=input_dim, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(6, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Keep only the weights of the epoch with the highest validation accuracy.
mc = ModelCheckpoint(os.path.join(ROOT_DIR, 'models/best_model_dnn_bow.h5'), monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
# Stop once validation loss has not improved for 5 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

# Keras needs dense input. toarray() yields a plain ndarray, whereas
# todense() returns the discouraged np.matrix type.
# 30% of the training data is held out for validation.
history = model.fit(X_train.toarray(), y_train_modified,
                    epochs=20,
                    batch_size=1024,
                    validation_split=0.3,
                    callbacks=[mc, es])

In [None]:
# Reload the checkpoint written by ModelCheckpoint during training.
best_model = tf.keras.models.load_model(os.path.join(ROOT_DIR, 'models/best_model_dnn_bow.h5'))

# Score the checkpoint on both partitions. toarray() yields a plain ndarray
# (todense() returns the discouraged np.matrix type); verbose=0 is the
# documented integer form of the previous verbose=False.
loss, accuracy = best_model.evaluate(X_train.toarray(), y_train_modified, verbose=0)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = best_model.evaluate(X_test.toarray(), y_test_modified, verbose=0)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
def plot_history(history):
    """Draw training-vs-validation accuracy and loss curves side by side.

    Parameters
    ----------
    history : object
        Anything exposing a ``history`` mapping with the keys ``'accuracy'``,
        ``'val_accuracy'``, ``'loss'`` and ``'val_loss'``, each holding one
        value per epoch (e.g. the object returned by ``model.fit``).

    Returns
    -------
    None
        The curves are drawn on a new 12x5 matplotlib figure.
    """
    # Fetch all four series up front so a missing key fails before any drawing.
    train_acc, valid_acc, train_loss, valid_loss = (
        history.history[key]
        for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
    )
    epochs = range(1, len(train_acc) + 1)

    # (training series, validation series, training label, validation label, title)
    panels = (
        (train_acc, valid_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        (train_loss, valid_loss, 'Training loss', 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (train_vals, valid_vals, train_label, valid_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_vals, 'b', label=train_label)
        plt.plot(epochs, valid_vals, 'r', label=valid_label)
        plt.title(title)
        plt.legend()

# Plot the learning curves recorded in `history`, the value returned by model.fit.
plot_history(history)