In [17]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [18]:
# Load the labelled book titles into a DataFrame.
# `names` supplies the column labels explicitly instead of reading them from the file.
data_path = "data/training_data.csv"
d_set = pd.read_csv(
    data_path,
    names=["Title", "Category", "SubCategory"],
)
d_set.head(10)  # preview the first ten records

Unnamed: 0,Title,Category,SubCategory
0,Bioinformatics: The Machine Learning Approach,Engineering,BioTechnology
1,Fundamentals of Inorganic Chemistry: An Introd...,Mathematics & Sciences,Chemistry
2,"The Construction of Buildings, Volume 1",Engineering,Civil Engineering
3,Combinatorics: Theory and Applications,Mathematics & Sciences,Mathematics & Statistics
4,Basic Principle Measurements & Control System,Engineering,Mechanical Engineering
5,Software Testing,Engineering,Computer Science Engineering
6,"Distributed Computing: Principles, Algorithms...",Engineering,Computer Science Engineering
7,Quantum Mechanics for Scientists and Engineers,Mathematics & Sciences,Physics
8,Introduction to Information Retrieval,Engineering,Computer Science Engineering
9,KEY TO MECHANICS OF STRUCTURES VOL. I,Engineering,Civil Engineering


In [19]:
# examine the shape: (number of rows, number of columns)
d_set.shape

(51641, 3)

In [20]:
# Define X (input text) and the two targets for use with CountVectorizer.
# Each selection is a single column, i.e. a 1-dimensional Series.
X = d_set['Title']         # 1-dimensional: book titles to vectorize
y1 = d_set['Category']     # 1-dimensional: Category label per title
y2 = d_set['SubCategory']  # 1-dimensional: SubCategory label per title
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(X.shape)
print(y1.shape)
print(y2.shape)

(51641,)
(51641,)
(51641,)


In [21]:
# Split X and both targets into training and testing sets.
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer sklearn.model_selection and fall back only on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# One call splits X, y1 and y2 with the same shuffled indices, so the Category
# and SubCategory targets are guaranteed to stay aligned with X_train / X_test.
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(
    X, y1, y2, random_state=1)
# The print calls below produce the same text on Python 2 and Python 3.
print("%s %s" % (X_train.shape, X_test.shape))
print("Category")
print((y1_train.shape, y1_test.shape))
print("SubCategory")
# The SubCategory report uses the y2 split, not y1.
print((y2_train.shape, y2_test.shape))

(38730,) (12911,)
Category
((38730,), (12911,))
SubCategory
((38730,), (12911,))


### Vectorizing Our Dataset

In [22]:
# instantiate the vectorizer with its default settings
vect = CountVectorizer()

In [23]:
# Learn the training vocabulary and build the training document-term matrix.
# fit_transform is equivalent to fit followed by transform on the same data,
# but processes X_train only once; `vect` is left fitted for later transform calls.
X_train_dtm = vect.fit_transform(X_train)

In [24]:
# examine the training document-term matrix (one row per training title, one column per vocabulary term)
X_train_dtm

<38730x16205 sparse matrix of type '<type 'numpy.int64'>'
	with 211071 stored elements in Compressed Sparse Row format>

In [25]:
# transform testing data into a document-term matrix using the vocabulary
# already fitted on the training data (transform only; vect is not refitted here)
X_test_dtm = vect.transform(X_test)
X_test_dtm

<12911x16205 sparse matrix of type '<type 'numpy.int64'>'
	with 67675 stored elements in Compressed Sparse Row format>

### Building and Evaluating Model

In [26]:
# import and instantiate the Multinomial Naive Bayes model;
# `metrics` provides the evaluation functions used in later cells
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
nb = MultinomialNB()

### Predicting Category

In [27]:
# train the model on X_train_dtm with the Category labels (y1_train) as the target;
# %time is an IPython magic that reports how long the fit takes
%time nb.fit(X_train_dtm,y1_train)

CPU times: user 179 ms, sys: 0 ns, total: 179 ms
Wall time: 178 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [28]:
# make Category class predictions for the test document-term matrix
y1_pred_class = nb.predict(X_test_dtm)

In [29]:
# calculate the accuracy of the Category predictions
# y1_test : true values
# y1_pred_class : predicted values
metrics.accuracy_score(y1_test, y1_pred_class)

0.83301061110680819

In [30]:
# print the Category confusion matrix
# (scikit-learn convention: rows are true classes, columns are predicted classes)
metrics.confusion_matrix(y1_test,y1_pred_class)

array([[2017,  135,    6,  126,    3,   44,   51,    2],
       [ 138, 1518,    2,  190,    1,   26,   16,    2],
       [   5,   14,  184,   11,    0,   15,   10,    1],
       [  65,  156,    8, 4052,    2,  144,   28,    1],
       [  36,   65,    0,    8,  135,    1,    2,    0],
       [  75,   46,    1,  337,    0, 1340,   55,    1],
       [  54,   21,    3,   67,    1,   43, 1491,    0],
       [  44,   52,    0,   22,    0,    3,   17,   18]])

In [31]:
# Testing Category: vectorize one sample title with the already-fitted vocabulary
test = ["The monk Who sold his ferrari"]
test_dtm = vect.transform(test)

In [32]:
# Predict the Category of the sample title.
# The sparse matrix is passed directly, exactly as is done for X_test_dtm;
# converting it to a dense array first is unnecessary.
ans = nb.predict(test_dtm)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(ans)

['Arts & Humanities']


### Predicting SubCategory

In [33]:
# train the model using X_train_dtm and the SubCategory labels (y2_train).
# NOTE: this refits the same `nb` object, so from here on `nb` predicts
# SubCategory and the Category model fitted earlier is no longer available.
%time nb.fit(X_train_dtm,y2_train)

CPU times: user 330 ms, sys: 35.8 ms, total: 366 ms
Wall time: 365 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [34]:
# make SubCategory class predictions for the test document-term matrix
y2_pred_class = nb.predict(X_test_dtm)

In [35]:
# calculate the accuracy of the SubCategory predictions
# y2_test : true values
# y2_pred_class : predicted values
metrics.accuracy_score(y2_test, y2_pred_class)

0.54976376733018362

In [36]:
# print the SubCategory confusion matrix
# (scikit-learn convention: rows are true classes, columns are predicted classes)
metrics.confusion_matrix(y2_test,y2_pred_class)

array([[69,  0,  0, ...,  0,  0,  0],
       [ 0,  2,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ..., 
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [37]:
# Testing SubCategory: vectorize the same sample title with the already-fitted vocabulary
test = ["The monk Who sold his ferrari"]
test_dtm = vect.transform(test)

In [38]:
# Predict the SubCategory of the sample title (nb was last fitted on y2_train).
# The sparse matrix is passed directly, exactly as is done for X_test_dtm;
# converting it to a dense array first is unnecessary.
ans = nb.predict(test_dtm)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(ans)

['Management']
