In [None]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from catboost import CatBoostClassifier
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
import re
from nltk.corpus import stopwords
import nltk
warnings.filterwarnings("ignore")

In [None]:
data = pd.read_csv("data/Twitter_Data.csv")

In [None]:
print(f"shape with NaN values: {data.shape}")
data = data.dropna()
data.shape
print(f"shape without NaN values: {data.shape}")

In [None]:
data.head()

In [None]:
reviews = ["-1", "0", "1"]
number_of_reviews = [data['category'].value_counts()[x] for x in range(-1, 2)]

#define Seaborn color palette to use
colors = sns.light_palette('seagreen')[0:3]

#create pie chart
plt.pie(number_of_reviews, labels = reviews, colors = colors, autopct='%.0f%%')
plt.show()

In [None]:
def delete_punctuation_from_string(s: str):
    return re.sub(r'[^\w\s]','', s)

In [None]:
data.loc[:, "clean_text"] = data["clean_text"].astype(str)
data.loc[:, "clean_text"] = data["clean_text"].apply(delete_punctuation_from_string)

In [None]:
train, test = train_test_split(data, random_state=239)

In [None]:
bow = CountVectorizer()

x_train = bow.fit_transform(train["clean_text"])
x_test = bow.transform(test["clean_text"])
y_train = train["category"]
    
model = CatBoostClassifier(verbose=0, task_type="GPU")
model.fit(x_train, y_train)

y_test = test["category"]

y_pred = model.predict(x_test)

balanced_accuracy_score(y_pred, y_test)

In [None]:
l = WordNetLemmatizer()

def lemmatize(x):
    x = map(lambda r:  ' '.join([l.lemmatize(i.lower()) for i in r.split()]), x)
    x = np.array(list(x))
    return x

stop_words = stopwords.words('english')

def delete_stop_word(s):
    words = s.split()
    return " ".join([word for word in words if word.lower() not in stop_words])

In [None]:
# apply lemmatization

train["clean_text"] = lemmatize(train["clean_text"])
test["clean_text"] = lemmatize(test["clean_text"])

In [None]:
# delete stop words

train["clean_text"] = train["clean_text"].apply(delete_stop_word)
test["clean_text"] = test["clean_text"].apply(delete_stop_word)

In [None]:
# Choose hyperparameters (TODO)

parameters = {"depth": [5],
          "iterations": [500],
          "learning_rate": [0.1]
}

In [None]:
bow = CountVectorizer()

x_train = bow.fit_transform(train["clean_text"])
x_test = bow.transform(test["clean_text"])
y_train = train["category"]

# clf = GridSearchCV(CatBoostClassifier(verbose=0), parameters)
clf = CatBoostClassifier(iterations=500, depth=5, learning_rate=0.1, verbose=0)
clf.fit(x_train, y_train)

y_test = test["category"]

y_pred = clf.predict(x_test)

In [None]:
balanced_accuracy_score(y_pred, y_test) # parameters are bad.

In [None]:
reviews = ["AMAZING I LOVE YOU ALL THERE", "I used your service once, but it was terrible"]

reviews = lemmatize(reviews)
reviews = bow.transform(reviews)

clf.predict(reviews)

# SAVE CATBOOST MODEL

In [1]:
import pickle

with open("models/model_catboost", 'wb') as model_file:
    pickle.dump((model, bow), model_file)

NameError: name 'model' is not defined

# USAGE

In [None]:
from interactor import Interactor

In [None]:
ModelsObj = Interactor(models_path="models")

In [None]:
ModelsObj.predict(model_name="catboost", sentence="AMAZING I LOVE YOU ALL THERE")