In [1]:
# resource used: https://towardsdatascience.com/sentiment-analysis-of-tweets-using-multinomial-naive-bayes-1009ed24276b

In [34]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [35]:
# constants
# All dataset files live in a `prop_data` folder under the current working
# directory. Paths are assembled with os.path.join so the separator is
# chosen by the platform instead of being hard-coded as '/'.
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'prop_data')

train_data_path = os.path.join(data_dir, 'train.csv')
test_data_path = os.path.join(data_dir, 'test.csv')
eval_data_path = os.path.join(data_dir, 'evaluation.csv')

In [36]:
# import data
# Read the three CSV splits in one pass; each path yields its own DataFrame.
df_train, df_test, df_eval = (
    pd.read_csv(csv_path)
    for csv_path in (train_data_path, test_data_path, eval_data_path)
)

In [37]:
# Class-balance check: count the non-null entries of every other column
# for each distinct value of `score` in the training split.
df_train.groupby('score').count()

Unnamed: 0_level_0,text
score,Unnamed: 1_level_1
0,3748
1,3752


In [38]:
import re
import string

# remove emojis: https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
def deEmojify(text):
    """Return `text` with every run of characters from four emoji ranges removed.

    The ranges are listed below; characters outside them are left untouched.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    # One character class matching one or more consecutive emoji characters.
    return re.sub("[" + emoji_ranges + "]+", "", text, flags=re.UNICODE)

# data preprocessing
def preprocess_text(text):
    """Normalize one raw text sample into lowercase, single-space-separated tokens.

    Steps, in order:
      1. URLs (``www.…`` / ``http(s)://…``) become the token ``url``.
      2. ``@mentions`` become the token ``user``.
      3. Everything is lowercased.
      4. HTML tags (``<...>``) are removed.
      5. Punctuation characters are replaced by a space.
      6. Any token containing a digit becomes the token ``number``.
      7. Runs of whitespace are collapsed to one space and the ends trimmed.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text; an empty string for missing input.
    """
    # A missing value would otherwise raise TypeError inside re.sub. Returning
    # an empty string keeps the output aligned one-to-one with the input rows.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Raw strings keep '\.' and '\s' as regex escapes instead of (invalid)
    # Python string escapes, which trigger warnings on current interpreters.
    # The placeholders are inserted in upper case and lowercased just below.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub(r'@[^\s]+', 'USER', text)
    text = text.lower()
    # Non-greedy match so each tag is removed separately.
    text = re.sub(r'<.*?>', '', text)
    # Punctuation becomes a space (not '') so "a,b" does not fuse into "ab".
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # A token with at least one digit is replaced entirely, e.g. "b2b" -> "number".
    text = re.sub(r'\w*\d+\w*', 'number', text)
    # Collapse whitespace last: the punctuation step above introduces new
    # runs of spaces, so collapsing earlier would leave them in the output.
    text = re.sub(r'\s+', ' ', text)

    # Emoji are left in place; the deEmojify step is not applied here.

    return text.strip()

# Clean the `text` column of every split; each result is a list of strings
# in the same row order as its source DataFrame.
tr_data = list(map(preprocess_text, df_train['text']))
test_data = list(map(preprocess_text, df_test['text']))
eval_data = list(map(preprocess_text, df_eval['text']))

In [39]:
# convert text data into a matrix of token counts
# ref: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# binary=True records whether a token occurs in a document (0/1), not how often.
vectorizer = CountVectorizer(binary = True)

# The vocabulary is learned from the training split only; test and evaluation
# texts are projected onto it. Results are kept as the scipy sparse matrices
# the vectorizer returns: with thousands of documents and tens of thousands
# of vocabulary terms, a dense array would need gigabytes for mostly zeros.
x_train = vectorizer.fit_transform(tr_data)
x_test = vectorizer.transform(test_data)
x_eval = vectorizer.transform(eval_data)

In [40]:
x_train.shape

(7500, 34017)

In [41]:
# transform a count matrix to a normalized tf-idf representation
# ref: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
# NOTE: x_train / x_test / x_eval are overwritten with their tf-idf versions,
# so re-running this cell on its own would re-weight already-weighted values.
# Re-run the vectorizer cell first if this cell needs to be executed again.
transformer = TfidfTransformer()
# IDF weights are fitted on the training split only and reused for the others.
# The output stays sparse: MultinomialNB accepts sparse input directly, so
# converting to a dense array only costs memory.
x_train = transformer.fit_transform(x_train)
x_test = transformer.transform(x_test)
x_eval = transformer.transform(x_eval)

In [42]:
# get output labels
# Copy each split's `score` column into its own NumPy array.
y_train, y_test, y_eval = (
    np.array(frame['score']) for frame in (df_train, df_test, df_eval)
)

In [43]:
# initialize and train classifier
# fit() returns the estimator itself, so construction and training are chained.
classifier = MultinomialNB().fit(x_train, y_train)
# Echo the fitted estimator as the cell's displayed value.
classifier

MultinomialNB()

In [44]:
# evaluation on test data
# Accuracy is the fraction of predicted labels that equal the true labels.
prediction_test = classifier.predict(x_test)
accuracy_test = np.mean(np.equal(prediction_test, y_test))
print(f"Accuracy (test): {accuracy_test * 100!s}%")

Accuracy (test): 83.36%


In [45]:
# evaluation on 'evaluation' data
# Accuracy is the fraction of predicted labels that equal the true labels.
prediction_eval = classifier.predict(x_eval)
accuracy_eval = np.mean(np.equal(prediction_eval, y_eval))
print(f"Accuracy (evaluation): {accuracy_eval * 100!s}%")

Accuracy (evaluation): 81.14%
