In [1]:
import math
import random
from collections import defaultdict
from pprint import pprint

In [2]:
# Prevent future/deprecation warnings from showing in output
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
sns.set_style(style='white')
sns.set_context(context='notebook', font_scale=1.3, rc={'figure.figsize': (16,9)})

In [33]:
# read cleaned data from file
df = pd.read_csv('clean.csv')
df

Unnamed: 0.1,Unnamed: 0,title,text
0,0,0,caffein addict caus peopl becom irrit moodi wh...
1,1,1,avocado versatil fruit that eaten savouri swee...
2,2,1,editor will review what submit determin whethe...
3,3,0,famili member rel passeng board twin otter air...
4,4,0,kenya digit economi expect gener percent total...
...,...,...,...
208,208,0,shokupan japanes bakeri sarit center photo wen...
209,209,1,infospac ultim inform space thi avocado farm k...
210,210,1,editor will review what submit determin whethe...
211,211,0,kenya abel kipsang lead pack dure event dure i...


## Split dataset to training and testing data

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X = df.text
y = df.title
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

## Exctract features from the dataset text

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
#max_features=2000,
#vect = CountVectorizer(max_features=2000, binary=True)
vect = CountVectorizer(binary=False,max_features=3500,min_df=2,max_df=0.5)

X_train_vect = vect.fit_transform(X_train)

## Examine accuracy of the extracted features 

In [38]:
counts = df.title.value_counts()
print(counts)

print("\nPredicting only 0 = {:.2f}% accuracy".format(counts[0] / sum(counts) * 100))
print("\nPredicting only 1 = {:.2f}% accuracy".format(counts[1] / sum(counts) * 100))

1    110
0    103
Name: title, dtype: int64

Predicting only 0 = 48.36% accuracy

Predicting only 1 = 51.64% accuracy


# MultinomialNB

In [39]:
from sklearn.naive_bayes import MultinomialNB

## Train the model

In [40]:
nb = MultinomialNB()

nb.fit(X_train_vect, y_train)

nb.score(X_train_vect, y_train)

1.0

## Test the model

In [41]:
X_test_vect = vect.transform(X_test)

y_pred = nb.predict(X_test_vect)

y_pred

array([1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1],
      dtype=int64)

## Inspect the model performance

In [42]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [43]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred) * 100))
print("\nCOnfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 100.00%

F1 Score: 100.00

COnfusion Matrix:
 [[30  0]
 [ 0 34]]


## save model

In [44]:
from joblib import dump, load

In [45]:
dump(nb, 'newnbmodel.joblib')

['newnbmodel.joblib']

## load model and use it

In [46]:
my_nb = load('nbmodel.joblib')

# Use model to classify new input

In [19]:
predictingd = pd.read_csv('predicting.csv')
predictingd

Unnamed: 0.1,Unnamed: 0,text,title
0,0,veterinari offic vaccin chicken . vaccin help ...,
1,1,veterinari offic vaccin chicken . vaccin help ...,
2,2,veterinari offic vaccin chicken . vaccin help ...,


In [20]:
A = predictingd.text
b = predictingd.title
A_train, A_test, b_train, b_test = train_test_split(A, b, test_size=0.30)

In [21]:
A_test_vect = vect.transform(A_test)

## Predict

In [26]:
b_pred = nb.predict(A_test_vect)
b_pred

array([1], dtype=int64)

## Using loaded model

In [23]:
my_pred = my_nb.predict(A_test_vect[0:1])

In [27]:
my_pred

array([1], dtype=int64)

In [28]:
if my_pred == 1:
    print('Agriculture')
else:
    print('Not agriculture')

Agriculture
