# 1. Data split train / test

In [1]:
import pandas as pd
dt = pd.read_csv("processed_data.csv")

In [2]:
dt = dt[dt["processed_text"].notnull()]

In [3]:
dt = pd.DataFrame(dt, columns=["label", "processed_text"])

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

cv = CountVectorizer()
tf = TfidfVectorizer()

In [5]:
# using count vectorizer
text = cv.fit_transform(dt["processed_text"])

In [6]:
# using tfidf vectorizer
tftext = tf.fit_transform(dt["processed_text"])

In [7]:
# split
x_train, x_test, y_train, y_test = train_test_split(tftext, dt["label"], test_size=0.2, random_state=1)

# 2. Naive Bayes

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
mnb = MultinomialNB()
bnb = BernoulliNB()

In [9]:
# multinomialNB
mnb.fit(x_train, y_train)
mnb_prediction = mnb.predict(x_test)

In [10]:
# multinomialNB scores
print("Accuracy score: {}".format(accuracy_score(y_test, mnb_prediction)))
print("Precision score: {}".format(precision_score(y_test, mnb_prediction)))
print("Recall score: {}".format(recall_score(y_test, mnb_prediction)))
print("f1 score: {}".format(f1_score(y_test, mnb_prediction)))

Accuracy score: 0.9291705498602051
Precision score: 0.9984152139461173
Recall score: 0.73512252042007
f1 score: 0.8467741935483871


In [11]:
# BernoulliNB
bnb.fit(x_train, y_train)
bnb_prediction = bnb.predict(x_test)

In [12]:
# BernoulliNB scores
print("Accuracy score: {}".format(accuracy_score(y_test, bnb_prediction)))
print("Precision score: {}".format(precision_score(y_test, bnb_prediction)))
print("Recall score: {}".format(recall_score(y_test, bnb_prediction)))
print("f1 score: {}".format(f1_score(y_test, bnb_prediction)))

Accuracy score: 0.9217148182665424
Precision score: 0.9590288315629742
Recall score: 0.7374562427071178
f1 score: 0.8337730870712401


# 3. Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [14]:
# Logistic Regression
lr.fit(x_train, y_train)
lr_prediction = lr.predict(x_test)

In [15]:
# Logistic Regression scores
print("Accuracy score: {}".format(accuracy_score(y_test, lr_prediction)))
print("Precision score: {}".format(precision_score(y_test, lr_prediction)))
print("Recall score: {}".format(recall_score(y_test, lr_prediction)))
print("f1 score: {}".format(f1_score(y_test, lr_prediction)))

Accuracy score: 0.9906803355079217
Precision score: 0.9758342922899885
Recall score: 0.9894982497082847
f1 score: 0.9826187717265353
