In [2]:
import math
import random
from collections import defaultdict
from pprint import pprint

In [3]:
# Prevent future/deprecation warnings from showing in output
import warnings
warnings.filterwarnings(action='ignore')

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [5]:
# Plot styling: white background, "notebook" context with text scaled by 1.3.
sns.set_style(style='white')
sns.set_context(context='notebook', font_scale=1.3)
# Default figure size. It is written straight into matplotlib's rcParams
# because set_context() only applies rc keys that are part of the plotting
# context, and 'figure.figsize' is not one of them -- passing it there is
# silently ignored.
plt.rcParams['figure.figsize'] = (16, 9)

In [6]:
# Load clean.csv; the cells below read its `text` and `title` columns.
df = pd.read_csv('clean.csv')

In [7]:
# Display the loaded frame as a quick sanity check.
df

Unnamed: 0.1,Unnamed: 0,title,text
0,0,1,farm veget farm fruit farm livestock farm vide...
1,1,1,victor mwamuy azolla farm kilifi counti joseph...
2,2,0,merci gakii muiruri head investor relat kenya ...
3,3,0,have lost subscrib first time more than decad ...
4,4,1,farm veget farm fruit farm livestock farm vide...
...,...,...,...
206,206,1,infospac ultim inform space thi profit carrot ...
207,207,0,american tech giant appl record fastest growth...
208,208,0,safaricom been alloc prime internet spectrum f...
209,209,0,ukrainian presid volodymyr zelenski left talk ...


In [8]:
from sklearn.model_selection import train_test_split

In [25]:
# Features are the `text` column; the target is the 0/1 `title` column.
X = df.text
y = df.title
# Hold out 30% of the rows for evaluation. The seed is fixed so that the split,
# and therefore every score reported below, is identical on each
# Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
# Binary bag-of-words over at most 2000 vocabulary terms: each feature records
# whether a term is present in a document, not how many times it occurs.
vect = CountVectorizer(max_features=2000, binary=True)

# Learn the vocabulary on the training split only and encode that split.
X_train_vect = vect.fit_transform(X_train)

In [28]:
# Class balance of the full dataset, used as a majority-class baseline:
# a model that always predicts one label scores that label's share of rows.
counts = df.title.value_counts()
print(counts)

total = counts.sum()
share_of_0 = counts.loc[0] / total * 100
share_of_1 = counts.loc[1] / total * 100

print("\nPredicting only 0 = {:.2f}% accuracy".format(share_of_0))
print("\nPredicting only 1 = {:.2f}% accuracy".format(share_of_1))

1    109
0    102
Name: title, dtype: int64

Predicting only 0 = 48.34% accuracy

Predicting only 1 = 51.66% accuracy


In [29]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
# Fit a multinomial Naive Bayes classifier on the encoded training split
# (fit() returns the estimator itself, so it can be chained).
nb = MultinomialNB().fit(X_train_vect, y_train)

# Accuracy on the data the model was trained on; shown as the cell output.
nb.score(X_train_vect, y_train)

1.0

In [31]:
# Encode the held-out texts with the vectorizer fitted on the training split
# (transform only -- the vocabulary is not re-learned here).
X_test_vect = vect.transform(X_test)

# Predicted labels for the held-out split.
y_pred = nb.predict(X_test_vect)

# Show the predictions as the cell output.
y_pred

array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0],
      dtype=int64)

In [32]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [33]:
# Report performance on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: {:.2f}%".format(accuracy * 100))
# F1 is scaled to a percentage just like accuracy, so it is labelled as one.
print("\nF1 Score: {:.2f}%".format(f1 * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 98.44%

F1 Score: 98.46

COnfusion Matrix:
 [[31  0]
 [ 1 32]]


In [34]:
import pickle

In [35]:
# Persist the fitted classifier to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises.
# NOTE(review): only `nb` is saved here. Predicting on raw text also requires
# the fitted `vect` that produced its input features -- confirm that the
# vectorizer is persisted somewhere as well.
with open('my_classifier.pickle', 'wb') as f:
    pickle.dump(nb, f)

In [36]:
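# Example: reload the pickled classifier later.
# Only unpickle files you trust -- pickle.load can execute arbitrary code.
#
# import pickle
#
# with open('my_classifier.pickle', 'rb') as f:
#     classifier = pickle.load(f)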

In [46]:
# Load the rows to classify from testing.csv.
testingd = pd.read_csv('testing.csv')

In [47]:
# Display the rows loaded from testing.csv.
testingd

Unnamed: 0.1,Unnamed: 0,text,title
0,0,veterinari offic vaccin chicken . vaccin help ...,
1,1,veterinari offic vaccin chicken . vaccin help ...,
2,2,veterinari offic vaccin chicken . vaccin help ...,


In [48]:
# Text and label columns of the rows loaded from testing.csv.
A = testingd.text
b = testingd.title
# Fixed seed so that the same rows land in A_test on every run; without it
# the row that gets predicted below changes between executions.
# NOTE(review): this splits the data that is about to be classified, so only
# the 30% test portion is ever predicted -- confirm whether all rows of
# `testingd` should be scored instead.
A_train, A_test, b_train, b_test = train_test_split(
    A, b, test_size=0.30, random_state=42
)

In [49]:
# Encode the new texts with the already-fitted vectorizer (transform only).
A_test_vect = vect.transform(A_test)

In [50]:
# Predict labels for the encoded new texts and show them as the cell output.
b_pred = nb.predict(A_test_vect)
b_pred

array([1], dtype=int64)

In [42]:
import joblib

In [43]:
# Serialise the fitted classifier with joblib; dump() returns the list of
# file names it wrote, which is what this cell displays.
filename = 'final.sav'
joblib.dump(value=nb, filename=filename)

['final.sav']

In [44]:
# Encode A_test with the fitted vectorizer.
# NOTE(review): this repeats the transform already stored in A_test_vect, and
# C_test_vect is not used by any cell visible here -- confirm whether it is
# still needed.
C_test_vect = vect.transform(A_test)

In [51]:
# Predict labels for A_test_vect and show them as the cell output.
# NOTE(review): this is the same call that produced b_pred (same model, same
# input). If the intent was to check the model saved to disk, it would need to
# be reloaded first -- confirm.
d_pred = nb.predict(A_test_vect)
d_pred

array([1], dtype=int64)