In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import seaborn as sn
from wordcloud import WordCloud, STOPWORDS
import pickle

In [2]:
# Load the interim dataset: one `text` column and its `not_incarcerated` label.
# index_col=0 makes pandas use the first CSV column as the row index; without
# it that column is loaded as an extra, meaningless 'Unnamed: 0' data column.
df = pd.read_csv('../data/interim/incarcerated_df.csv', index_col=0)
df.head()

Unnamed: 0,text,not_incarcerated
0,20 dollars reward. ranaway from the subscriber...,1
1,15 dollars reward will be paid for the apprehe...,1
2,baltimore august 11 1777. 10 pounds reward. ra...,1
3,90 dollars reward. eloped from the hermitage o...,1
4,50 dollars reward. ranaway from the subscriber...,1


In [3]:
# Model input is the raw text; the target is the `not_incarcerated` label column.
x = df['text']
y = df['not_incarcerated']

# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [4]:
# Majority-class baseline fitted on the training split.
dummy = DummyClassifier(strategy="most_frequent").fit(x_train, y_train)
# Baseline score on the same training split (displayed as the cell output).
dummy.score(x_train, y_train)

0.8094736842105263

In [5]:
# Disabled experiment: manual CountVectorizer fit on x_train / transform of x_test.
# It is wrapped in a bare string literal, so nothing below is executed; the
# cell only echoes the string as its output.
# NOTE(review): appears superseded by the 'vect' step of clf_pipe — consider deleting.
'''vectorizer = CountVectorizer()
vecs = vectorizer.fit_transform(x_train)
valid_vecs = vectorizer.transform(x_test)'''

'vectorizer = CountVectorizer()\nvecs = vectorizer.fit_transform(x_train)\nvalid_vecs = vectorizer.transform(x_test)'

In [6]:
# Disabled experiment: manual TfidfTransformer over the count matrices
# (`vecs`, `valid_vecs`). It is wrapped in a bare string literal, so nothing
# below is executed; the cell only echoes the string as its output.
# NOTE(review): appears superseded by the 'tfidf' step of clf_pipe — consider deleting.
'''x_form = TfidfTransformer()
tfidf = x_form.fit_transform(vecs)
valid_tfidf = x_form.transform(valid_vecs)'''

'x_form = TfidfTransformer()\ntfidf = x_form.fit_transform(vecs)\nvalid_tfidf = x_form.transform(valid_vecs)'

In [7]:
# Disabled experiment: one-step TfidfVectorizer (English stop words, min_df=150)
# fitted on x_train and applied to x_test. It is wrapped in a bare string
# literal, so nothing below is executed; the cell only echoes the string.
# NOTE(review): looks like an abandoned alternative to clf_pipe — consider deleting.
'''tfidf = TfidfVectorizer(stop_words='english', min_df = 150)
train_tfidf = tfidf.fit_transform(x_train)
test_tfidf = tfidf.transform(x_test)'''

"tfidf = TfidfVectorizer(stop_words='english', min_df = 150)\ntrain_tfidf = tfidf.fit_transform(x_train)\ntest_tfidf = tfidf.transform(x_test)"

In [8]:
#test_tfidf.shape

In [9]:
#clf_pipe = Pipeline([('tfidf', TfidfVectorizer(min_df=150)),
                      #   ('clf', MultinomialNB())])

In [10]:
# Text-classification pipeline: token counts -> TF-IDF weighting -> naive Bayes.
# The step names ('vect', 'tfidf', 'clf') are the prefixes used when
# addressing step parameters, e.g. 'clf__alpha'.
pipeline_steps = [
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
clf_pipe = Pipeline(pipeline_steps)

In [11]:
# Candidate `alpha` values to try for the pipeline step named 'clf'.
alpha_candidates = (1e-1, 1e-2, 1e-3)
param_grid = {'clf__alpha': alpha_candidates}

In [12]:
# Cross-validated search over param_grid; n_jobs=-1 runs the fits in parallel.
clf_grid = GridSearchCV(estimator=clf_pipe, param_grid=param_grid, n_jobs=-1)
clf_grid.fit(x_train, y_train)

# Keep the best pipeline found by the search for the cells that follow.
best = clf_grid.best_estimator_

print(clf_grid.best_score_)
print(best)

0.991578947368421
Pipeline(steps=[('vect', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB(alpha=0.1))])


In [13]:
# Predict a label for every held-out text.
best_preds = best.predict(x_test)
# Share of held-out rows where the prediction equals the true label.
(best_preds == y_test).mean()

0.9873817034700315

In [22]:
# Class probabilities for the first held-out row only. The one-row positional
# slice (rather than a scalar lookup) keeps the input a Series of texts.
prob = best.predict_proba(x_test.iloc[:1])

In [24]:
# Display the predicted class probabilities computed for the first test row.
# NOTE(review): column order presumably follows best.classes_ — confirm.
prob

array([[0.00209474, 0.99790526]])

In [33]:
# Build a human-readable message from the second column of `prob`.
# The '%' presentation type multiplies the value by 100 and appends the
# percent sign, so the number reads e.g. "99.79%" instead of the misplaced
# "%99.79" the previous format string produced.
pred = 'This enslaved person is on the run, with {:.2%} probability'.format(prob[0][1])
pred

'This enslaved person is on the run, with %99.79 probability'

In [9]:
# 5-fold cross-validation of the selected pipeline over the whole dataset.
scores = cross_val_score(best, df['text'], df['not_incarcerated'], cv=5)
# Mean of the per-fold scores (displayed as the cell output).
scores.mean()

0.9905293952880395

In [12]:
# Persist the selected pipeline to disk, then read it straight back as `clf2`.
model_path = '../models/prediction_model.pkl'

with open(model_path, 'wb') as model_file:
    pickle.dump(best, model_file)

# load
with open(model_path, 'rb') as model_file:
    clf2 = pickle.load(model_file)