In [1]:
import math
import random
from collections import defaultdict
from pprint import pprint

In [2]:
# Prevent future/deprecation warnings from showing in output
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
sns.set_style(style='white')
sns.set_context(context='notebook', font_scale=1.3, rc={'figure.figsize': (16,9)})

In [6]:
# read cleaned data from file
df = pd.read_csv('clean.csv')
df

Unnamed: 0.1,Unnamed: 0,title,text
0,0,0,from left charl nasieku nation vice chair bahl...
1,1,1,editor will review what submit determin whethe...
2,2,0,vaccin most import invent field medicin fiona ...
3,3,0,wanjiku kibe faith muiruri ann wanjiku kibe ro...
4,4,0,have lost subscrib first time more than decad ...
...,...,...,...
206,206,1,time begin farm journey onli heifer need money...
207,207,1,infospac ultim inform space thi profit banana ...
208,208,0,kenya polic clifton miheso right celebr goal a...
209,209,0,simon mburu want avoid prostat cancer have wit...


## Split dataset to training and testing data

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = df.text
y = df.title
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

## Exctract features from the dataset text

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
#max_features=2000,
#vect = CountVectorizer(max_features=2000, binary=True)
vect = CountVectorizer(binary=True,max_features=2000)

X_train_vect = vect.fit_transform(X_train)

## Examine accuracy of the extracted features 

In [13]:
counts = df.title.value_counts()
print(counts)

print("\nPredicting only 0 = {:.2f}% accuracy".format(counts[0] / sum(counts) * 100))
print("\nPredicting only 1 = {:.2f}% accuracy".format(counts[1] / sum(counts) * 100))

1    109
0    102
Name: title, dtype: int64

Predicting only 0 = 48.34% accuracy

Predicting only 1 = 51.66% accuracy


# MultinomialNB

In [14]:
from sklearn.naive_bayes import MultinomialNB

## Train the model

In [15]:
nb = MultinomialNB()

nb.fit(X_train_vect, y_train)

nb.score(X_train_vect, y_train)

0.9931972789115646

## Test the model

In [16]:
X_test_vect = vect.transform(X_test)

y_pred = nb.predict(X_test_vect)

y_pred

array([0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
      dtype=int64)

## Inspect the model performance

In [18]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [19]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred) * 100))
print("\nCOnfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 98.44%

F1 Score: 98.31

COnfusion Matrix:
 [[34  0]
 [ 1 29]]


# Use model to classify new input

In [30]:
predictingd = pd.read_csv('predicting.csv')
predictingd

Unnamed: 0.1,Unnamed: 0,text,title
0,0,veterinari offic vaccin chicken . vaccin help ...,
1,1,veterinari offic vaccin chicken . vaccin help ...,
2,2,veterinari offic vaccin chicken . vaccin help ...,


In [31]:
A = predictingd.text
b = predictingd.title
A_train, A_test, b_train, b_test = train_test_split(A, b, test_size=0.30)

In [32]:
A_test_vect = vect.transform(A_test)

## Predict

In [33]:
b_pred = nb.predict(A_test_vect)
b_pred

array([1], dtype=int64)