In [9]:
import pickle


In [10]:
# Load the pickled dataset that the later cells read as `df`
# (presumably a pandas DataFrame -- it is indexed by column name below).
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load "data.obj" when it comes from a trusted source.
# The `with` block closes the handle even if unpickling raises.
with open("data.obj", 'rb') as file:
    df = pickle.load(file)

In [15]:
# Small sample for inspection: the first 1,000 entries (by position)
# of the 'Description' column.
example = df['Description'].iloc[:1000]


In [16]:
# Write the sampled descriptions to 'example.csv' (path is relative to the
# kernel's working directory) so they can be inspected outside the notebook.
example.to_csv('example.csv')

In [14]:
# Show the sampled descriptions as this cell's output.
example

0                  AGGRAVATED: HANDGUN
1                     PAROLE VIOLATION
2              DOMESTIC BATTERY SIMPLE
3                               SIMPLE
4                       ARMED: HANDGUN
5                               SIMPLE
6              DOMESTIC BATTERY SIMPLE
7              DOMESTIC BATTERY SIMPLE
8         POSS: CANNABIS 30GMS OR LESS
9                               SIMPLE
10                          TO VEHICLE
11             HARASSMENT BY TELEPHONE
12            UNLAWFUL POSS OF HANDGUN
13                              SIMPLE
14                        RETAIL THEFT
15                      FORCIBLE ENTRY
16                          AUTOMOBILE
17                       FROM BUILDING
18                    RECKLESS CONDUCT
19                       FROM BUILDING
20                        RETAIL THEFT
22               STRONGARM - NO WEAPON
23       AGGRAVATED: OTHER DANG WEAPON
24                UNLAWFUL USE HANDGUN
25             DOMESTIC BATTERY SIMPLE
26             DOMESTIC B

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split -- and every score computed from it -- reproducible after a kernel
# restart; without it each run shuffles the rows differently.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [4]:
# Training labels: the 'Primary Type' column of the training split.
crimes = train.loc[:, 'Primary Type']

In [5]:
# Distinct label values present in the training split.
# NOTE(review): `crimeType` is not referenced by any later cell visible here.
crimeType = crimes.unique()

In [6]:
# Training inputs: the free-text 'Description' column of the training split.
description = train.loc[:, 'Description']

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words step: learn the vocabulary from the training descriptions and
# encode every description as a row of token counts in a single pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=description)

# (documents, vocabulary size)
X_train_counts.shape

(4816798, 490)

In [8]:

# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
# Learn the IDF weights from the count matrix, then re-weight that same
# matrix; the result has the same shape as the counts.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

X_train_tfidf.shape

(4816798, 490)

In [9]:
# Machine Learning
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
# Fit multinomial Naive Bayes on the TF-IDF features, using the training
# labels in `crimes` as targets.
clf = MultinomialNB().fit(X=X_train_tfidf, y=crimes)

In [10]:

# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# The step names 'vect', 'tfidf' and 'clf' are arbitrary, but they are reused later as parameter prefixes (e.g. clf__alpha).
# We will be using the 'text_clf' going forward.
from sklearn.pipeline import Pipeline

# Count -> TF-IDF -> Naive Bayes chained into a single estimator.
# Unlike the standalone CountVectorizer fitted earlier, this vectorizer
# also drops English stop words.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Pipeline.fit returns the pipeline itself, so `model` and `text_clf`
# name the same fitted object.
model = text_clf.fit(description, crimes)

In [11]:

# Performance of NB Classifier
import numpy as np
#twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
# Held-out inputs and labels, taken from the same columns used for training.
testDescription, testCrimes = test['Description'], test['Primary Type']

# Accuracy = fraction of test rows whose predicted type equals the true label.
predicted = model.predict(testDescription)
is_correct = predicted == testCrimes
np.mean(is_correct)

0.9301536289652882

In [12]:
# serialize model to JSON
# model_json = text_clf.to_json()
# with open("NaiveBais.json", "w") as json_file:
#     json_file.write(model_json)

# model.write.overwrite().save("naive-baise-model")

# from sklearn.externals import joblib
# joblib.dump(text_clf, 'naive-baise.pkl')
    

In [13]:
# Persist the Naive Bayes pipeline. The `with` block flushes and closes the
# file, so the pickle is complete on disk as soon as the cell finishes
# (the handle was previously left open for the life of the kernel).
with open("model-naive-bais.obj", "wb") as filehandler:
    pickle.dump(text_clf, filehandler)

In [52]:
# serialize weights to HDF5
# model2 = pickle.loads(s)

In [14]:
from sklearn.linear_model import SGDClassifier
# Linear SVM baseline: hinge loss with an L2 penalty, trained by SGD.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, where it
# raises a TypeError. `max_iter=5, tol=None` is the documented replacement and
# still runs exactly five passes over the data with no early stopping.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

# Fit on the training split, then report accuracy on the held-out split.
text_clf_svm = text_clf_svm.fit(description, crimes)
predicted_svm = text_clf_svm.predict(testDescription)
np.mean(predicted_svm == testCrimes)



0.9279073243647235

In [15]:
# Persist the SGD/SVM pipeline. The `with` block flushes and closes the file,
# so the pickle is complete on disk as soon as the cell finishes
# (the handle was previously left open for the life of the kernel).
with open("model-sgd-classifier.obj", "wb") as filehandler:
    pickle.dump(text_clf_svm, filehandler)

In [26]:
# Grid Search
# Here we define the set of parameter values to search over for performance tuning.
# Every parameter name starts with the name of its pipeline step (the arbitrary names we gave earlier).
# E.g. vect__ngram_range: here we ask the search to try unigrams and bigrams and pick whichever performs best.

# laptop hangs (grid search left commented out)
# from sklearn.model_selection import GridSearchCV
# parameters = {'vect__ngram_range': [(1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}
# parameters = {'clf__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

In [27]:

# Next, we create an instance of the grid search by passing the classifier, the parameters,
# and n_jobs=-1, which tells it to use all available cores on the user's machine.

# gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
# gs_clf = gs_clf.fit(description, crimes)



KeyboardInterrupt: 

In [15]:
# To see the best mean score and the params, run the following code

# gs_clf.best_score_
# gs_clf.best_params_

# Output for above should be: The accuracy has now increased to ~90.6% for the NB classifier (not so naive anymore! 😄)
# and the corresponding parameters are {‘clf__alpha’: 0.01, ‘tfidf__use_idf’: True, ‘vect__ngram_range’: (1, 2)}.

{'clf__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

In [None]:
# Similarly doing grid search for SVM
# from sklearn.model_selection import GridSearchCV
# parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}

# gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
# gs_clf_svm = gs_clf_svm.fit(description, crimes)


# gs_clf_svm.best_score_
# gs_clf_svm.best_params_

In [None]:
# NLTK
# # Removing stop words
# from sklearn.pipeline import Pipeline
# text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), 
#                      ('clf', MultinomialNB())])