In [6]:
import numpy as np
import pandas as pd
import nltk
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from functions import handle_clean_text, map_sentiment
import pickle

# Shared scikit-learn helpers. `tfidf` is fitted on the training text further
# down and later pickled; `le` and `sc` are not referenced by the cells shown
# in this notebook (possibly leftovers -- confirm before removing).
tfidf = TfidfVectorizer()
le = LabelEncoder()
sc = StandardScaler()

# NLTK resources. Likewise not referenced by the cells shown here; text
# cleaning is delegated to `handle_clean_text` from `functions`.
STOP_WORDS = nltk.corpus.stopwords.words("english")
lemmatizer = nltk.WordNetLemmatizer()

In [7]:
# --- Training split --------------------------------------------------------
# Read the raw CSV, clean the text column, and fit the TF-IDF vocabulary on
# the training text only (the test split below reuses that fitted vocabulary).
train_data = pd.read_csv("datasets/train.csv")
x_train = handle_clean_text(train_data["text"])
x_train_tfidf = tfidf.fit_transform(x_train)
# `map_sentiment` encodes each sentiment label; the original note says the
# classes become the numbers 1, 2, 3 -- confirm against `functions.py`.
y_train = train_data["sentiment"].apply(map_sentiment)

# --- Test split ------------------------------------------------------------
# Same cleaning and label mapping, but only `transform` (no refit) so the
# feature columns line up with the training matrix.
test_data = pd.read_csv("datasets/test.csv")
x_test = handle_clean_text(test_data["text"])
x_test_tfidf = tfidf.transform(x_test)
y_test = test_data["sentiment"].apply(map_sentiment)


In [8]:
# 1. Gaussian Naive Bayes baseline.
# The TF-IDF matrices are converted with .toarray() because GaussianNB works
# on dense arrays; `fit` returns the estimator itself, so it can be chained.
GNB = GaussianNB().fit(x_train_tfidf.toarray(), y_train)

# Predict on the held-out split (name spelled `GBN_...` as in the original
# notebook so any other cell referring to it keeps working).
GBN_y_predicted = GNB.predict(x_test_tfidf.toarray())

# Fraction of test rows whose predicted class matches the true class.
GNB_accuracy = accuracy_score(y_true=y_test, y_pred=GBN_y_predicted)
print("Gaussian Naive Bayes model - accuracy score: ",GNB_accuracy)

Gaussian Naive Bayes model - accuracy score:  0.40492359932088284


In [9]:
# 2. Training Logistic Regression Model
# LogisticRegression accepts scipy sparse matrices directly, so the TF-IDF
# matrices are passed as-is instead of being densified with .toarray().
# This avoids allocating dense (n_samples x vocabulary) copies for both fit
# and predict; the learned model is the same up to floating-point rounding.
# random_state is pinned for reproducibility; max_iter=500 gives the solver
# more iterations than the library default before it stops.
lr = LogisticRegression(random_state=42, max_iter=500)

lr.fit(x_train_tfidf, y_train)

# Predict classes for the held-out split using the same sparse representation.
lr_y_predicted = lr.predict(x_test_tfidf)

# Fraction of test rows whose predicted class matches the true class.
lr_accuracy = accuracy_score(y_test, lr_y_predicted)
print("Logistic Regression model - accuracy score: ",lr_accuracy)


Logistic Regression model - accuracy score:  0.6963780418788907


In [10]:
# Persist the fitted artifacts with pickle, in this order:
#   1. the trained TF-IDF vectorizer
#   2. the trained Logistic Regression model
# Each file is opened in binary write mode and closed by the `with` block.
for artifact_path, artifact in (
    ("tfidf_vectorizer.pkl", tfidf),
    ("logistic_regression_model.pkl", lr),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)
