This is a simple baseline for the Pizza project.

In [153]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

# pandas for reading json file
import pandas as pd


In [154]:
# Load the training records into a DataFrame. The path is relative, so this
# assumes the kernel's working directory contains data/train.json (TODO confirm).
data_all = pd.read_json('data/train.json')

In [155]:
# Split sizes, named once instead of repeating 1000/2000 in every slice.
# The first N_TRAIN rows are used for training and the following N_TEST rows
# for testing; rows are taken in file order (no shuffling is applied here).
N_TRAIN = 1000
N_TEST = 1000

# Columns used by the baseline: the two text fields and the target.
data_title = data_all['request_title']
data_text = data_all['request_text_edit_aware']
data_label = data_all['requester_received_pizza']

# .iloc makes the positional slicing explicit.
train_data_title = data_title.iloc[:N_TRAIN]
train_data_text = data_text.iloc[:N_TRAIN]
train_labels = data_label.iloc[:N_TRAIN]

test_data_title = data_title.iloc[N_TRAIN:N_TRAIN + N_TEST]
test_data_text = data_text.iloc[N_TRAIN:N_TRAIN + N_TEST]
test_labels = data_label.iloc[N_TRAIN:N_TRAIN + N_TEST]


In [156]:
def getCountVectorizerStats(cv, train_data):
    """Fit ``cv`` on ``train_data`` and print summary statistics of the result.

    Prints the shape of the fitted matrix, the vocabulary size, the number of
    stored non-zero entries, the average number of non-zero features per
    sample, the percentage of non-zero entries, and the first and last
    feature names.

    Parameters
    ----------
    cv : object exposing ``fit_transform`` and ``get_feature_names``
        (e.g. a CountVectorizer). It is fitted by this call.
    train_data : the documents handed to ``cv.fit_transform``.

    Returns
    -------
    None
    """
    # fit using training data
    doc_term = cv.fit_transform(train_data)
    numSamples, numFeatures = doc_term.shape
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("[n_samples, n_features]: %d, %d" % (numSamples, numFeatures))
    print("size of vocabulary: %d" % (numFeatures))

    nonZeroCount = doc_term.nnz
    print("the count of explicitly-stored values (nonzeros): %d" % (nonZeroCount))
    print("the average number of non-zero features per sample: %.2f" % (1.0 * nonZeroCount / numSamples))
    print("fraction of the entries in the matrix are non-zero: %.2f percent" % (100.0 * nonZeroCount / (numSamples * numFeatures)))

    featureNames = cv.get_feature_names()
    # Two separate lines instead of embedding a stray '\n\r' in the output.
    print('first feature name: %s' % (featureNames[0],))
    print('last feature name: %s' % (featureNames[-1],))

In [157]:
def P1(train_data):
    """Print vocabulary statistics of ``train_data`` for two vectorizer setups.

    The first run uses a CountVectorizer with its default vocabulary
    settings; the second uses word 1- to 3-grams with ``min_df=10``.
    """
    # input='content' because the data is expected to be a sequence of
    # string or bytes items
    runs = [
        ('-------------using default vocabulary',
         CountVectorizer(input='content')),
        ('-------------using 1-3 ngram with min_df = 10',
         CountVectorizer(input='content', analyzer='word', ngram_range=(1, 3), min_df=10)),
    ]
    for banner, vectorizer in runs:
        print(banner)
        getCountVectorizerStats(vectorizer, train_data)
    
# Report vocabulary statistics for the request titles of the training split.
P1(train_data_title)

-------------using default vocabulary
[n_samples, n_features]: 1000, 2070
size of vocabulary: 2070
the count of explicitly-stored values (nonzeros): 11567
the average number of non-zero features per sample: 11.57
fraction of the entries in the matrix are non-zero: 0.56 percent
first feature name: 000 
last feature name: zucchini
-------------using 1-3 ngram with min_df = 10
[n_samples, n_features]: 1000, 290
size of vocabulary: 290
the count of explicitly-stored values (nonzeros): 9710
the average number of non-zero features per sample: 9.71
fraction of the entries in the matrix are non-zero: 3.35 percent
first feature name: about 
last feature name: you


In [158]:
def preditStats(preds, labels):
    """Print accuracy and binary F1 for ``preds`` against ``labels``.

    Parameters
    ----------
    preds : sequence of predicted labels.
    labels : sequence of true labels, paired element-wise with ``preds``.

    Returns
    -------
    The value returned by ``metrics.f1_score(labels, preds, average='binary')``.
    """
    pairs = list(zip(preds, labels))
    total = len(pairs)
    correct = sum(1 for pred, label in pairs if pred == label)
    # Guard the ratio so empty input reports 0.00 instead of raising
    # ZeroDivisionError.
    accuracy = 1.0 * correct / total if total else 0.0
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('total: %3d  correct: %3d  accuracy: %3.2f' % (total, correct, accuracy))
    f1 = metrics.f1_score(labels, preds, average='binary')
    print('metrics.f1_score: %s' % (f1,))
    return f1

def myKNN(cv_train_data, cv_test_data, k=1, labels_train=None, labels_test=None):
    """Fit a k-nearest-neighbours classifier and print its test-set scores.

    Parameters
    ----------
    cv_train_data : feature matrix used to fit the classifier.
    cv_test_data : feature matrix to predict on.
    k : number of neighbours, passed as ``n_neighbors``.
    labels_train, labels_test : optional label sequences. When omitted, the
        notebook-level ``train_labels`` / ``test_labels`` are used, which is
        what this function always did before these arguments existed.

    Returns
    -------
    The predictions for ``cv_test_data``.
    """
    print('-------------using KNeighborsClassifier with k= %s' % (k,))
    if labels_train is None:
        labels_train = train_labels
    if labels_test is None:
        labels_test = test_labels

    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(cv_train_data, labels_train)
    # predict test labels using test data
    preds = clf.predict(cv_test_data)
    preditStats(preds, labels_test)
    return preds

def myMultinomialNB(cv_train_data, cv_test_data, a=1.0, labels_train=None, labels_test=None):
    """Fit a MultinomialNB classifier and print its test-set scores.

    Parameters
    ----------
    cv_train_data : feature matrix used to fit the classifier.
    cv_test_data : feature matrix to predict on.
    a : value passed as ``alpha`` to MultinomialNB.
    labels_train, labels_test : optional label sequences. When omitted, the
        notebook-level ``train_labels`` / ``test_labels`` are used, which is
        what this function always did before these arguments existed.

    Returns
    -------
    The predictions for ``cv_test_data``.
    """
    print('-------------using MultinomialNB with alpha= %s' % (a,))
    if labels_train is None:
        labels_train = train_labels
    if labels_test is None:
        labels_test = test_labels

    nb = MultinomialNB(alpha=a)
    nb.fit(cv_train_data, labels_train)
    # predict test labels using test data
    preds = nb.predict(cv_test_data)
    preditStats(preds, labels_test)
    return preds
    
def myLogisticRegression(cv_train_data, cv_test_data, c, labels_train=None, labels_test=None):
    """Fit an L2-penalised LogisticRegression and print its test-set scores.

    After scoring, also prints the shape of ``coef_`` and, for each row of
    ``coef_``, the sum of its squared coefficients.

    Parameters
    ----------
    cv_train_data : feature matrix used to fit the classifier.
    cv_test_data : feature matrix to predict on.
    c : value passed as ``C`` to LogisticRegression.
    labels_train, labels_test : optional label sequences. When omitted, the
        notebook-level ``train_labels`` / ``test_labels`` are used, which is
        what this function always did before these arguments existed.

    Returns
    -------
    The predictions for ``cv_test_data``.
    """
    print('-------------using LogisticRegression with penalty=l2 and C: %s' % (c,))
    if labels_train is None:
        labels_train = train_labels
    if labels_test is None:
        labels_test = test_labels

    lr = LogisticRegression(penalty='l2', C=c)
    lr.fit(cv_train_data, labels_train)
    preds = lr.predict(cv_test_data)
    preditStats(preds, labels_test)
    print('lr.coef_.shape: %s' % (lr.coef_.shape,))
    # One sum of squared weights per row of coef_, computed in a single
    # vectorised pass instead of a nested Python loop over every coefficient.
    sumOfSquareAllClasses = (lr.coef_ ** 2).sum(axis=1).tolist()
    print('sumOfSquareAllClasses %s' % (sumOfSquareAllClasses,))
    return preds
    

In [159]:
def empty_preprocessor(s):
    """Identity preprocessor: hand the text back exactly as it was received."""
    return s

def better_preprocessor(s):
    """Lower-case ``s`` and trim its leading and trailing whitespace."""
    # Also removing non-word characters (re.sub(r'[\W_]+', '', ...)) was
    # tried, but the result was not as good as lower-casing and stripping only.
    return s.lower().strip()
    
def P2(train_data, test_data, numOfRun = 3):
    """Vectorize the text and run KNN, MultinomialNB and LogisticRegression sweeps.

    Each classifier is run ``numOfRun`` times:
      * KNN with k = 1 .. numOfRun
      * MultinomialNB with alpha evenly spaced over [0.0, 1.0]
      * LogisticRegression with C = 1, 10, 100, ... (10**i)

    Parameters
    ----------
    train_data : documents used to build the vocabulary and fit the models;
        must expose ``.shape`` (it is printed).
    test_data : documents the models are evaluated on; must expose ``.shape``.
    numOfRun : number of settings tried per classifier.
    """
    cv1 = CountVectorizer(input='content', analyzer = 'word', ngram_range=(1,1), preprocessor = better_preprocessor)
    cv_train_data = cv1.fit_transform(train_data)
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('train_data.shape %s' % (train_data.shape,))
    print('cv_train_data.shape: %s' % (cv_train_data.shape,))

    # size of the vocabulary learned from the training data
    print('len(featureNames): %d' % len(cv1.vocabulary_))

    # Encode the test data with the vectorizer fitted on the training data,
    # so the vocabulary and preprocessing match and nothing is fitted on the
    # test set.
    cv_test_data = cv1.transform(test_data)
    print('test_data.shape %s' % (test_data.shape,))
    print('cv_test_data.shape %s' % (cv_test_data.shape,))
    for i in range(numOfRun):
        myKNN(cv_train_data, cv_test_data, i+1)

    # NOTE(review): the sweep starts at alpha=0.0, i.e. the first NB run asks
    # for no additive smoothing - confirm this is intended.
    for j in np.linspace(0.0, 1.0, num=numOfRun):
        myMultinomialNB(cv_train_data, cv_test_data, j)

    for k in range(numOfRun):
        myLogisticRegression(cv_train_data, cv_test_data, 10**k * 1.0)
# Run the three classifier sweeps on the request titles only.
P2(train_data_title, test_data_title)

train_data.shape (1000L,)
cv_train_data.shape: (1000, 2070)
len(featureNames): 2070
test_data.shape (1000L,)
cv_test_data.shape (1000, 2070)
-------------using KNeighborsClassifier with k= 1
total: 1000  correct: 676  accuracy: 0.68
metrics.f1_score: 0.114754098361
-------------using KNeighborsClassifier with k= 2
total: 1000  correct: 745  accuracy: 0.74
metrics.f1_score: 0.0229885057471
-------------using KNeighborsClassifier with k= 3
total: 1000  correct: 707  accuracy: 0.71
metrics.f1_score: 0.0928792569659
-------------using MultinomialNB with alpha= 0.0
total: 1000  correct: 701  accuracy: 0.70
metrics.f1_score: 0.250626566416
-------------using MultinomialNB with alpha= 0.5
total: 1000  correct: 710  accuracy: 0.71
metrics.f1_score: 0.198895027624
-------------using MultinomialNB with alpha= 1.0
total: 1000  correct: 737  accuracy: 0.74
metrics.f1_score: 0.120401337793
-------------using LogisticRegression with penalty=l2 and C: 1.0
total: 1000  correct: 700  accuracy: 0.70
met

In [160]:
# So far MultinomialNB with alpha=1.0 returns the best result. Next, we combine both the title and the text.

In [171]:
def myMultinomialNB2(cv_train_data1, cv_test_data1, cv_train_data2, cv_test_data2, a = 1.0):
    """Run MultinomialNB on two feature sets and OR their predictions.

    A sample is predicted positive (1.0) when either model predicts 1 for
    it, otherwise 0.0. The combined predictions are scored against the
    notebook-level ``test_labels``.

    Parameters
    ----------
    cv_train_data1, cv_test_data1 : train/test matrices of the first feature set.
    cv_train_data2, cv_test_data2 : train/test matrices of the second feature set.
    a : alpha forwarded to both ``myMultinomialNB`` calls.

    Returns
    -------
    numpy float array of combined 0/1 predictions.

    Raises
    ------
    ValueError
        If the two models return predictions of different shapes.
    """
    print('-------------using MultinomialNB2 -2 data sets- with alpha= %s' % (a,))
    preds1 = np.asarray(myMultinomialNB(cv_train_data1, cv_test_data1, a))
    preds2 = np.asarray(myMultinomialNB(cv_train_data2, cv_test_data2, a))

    # Fail loudly on a mismatch instead of silently combining only a prefix.
    if preds1.shape != preds2.shape:
        raise ValueError('prediction shapes differ: %s vs %s' % (preds1.shape, preds2.shape))

    # Element-wise OR of the two positive predictions, as a float 0/1 array.
    preds = np.logical_or(preds1 == 1, preds2 == 1).astype(float)
    preditStats(preds, test_labels)
    return preds

def processData(train_data, test_data):
    """Turn raw train/test documents into word-count matrices.

    A single CountVectorizer (word unigrams, ``better_preprocessor``) is
    fitted on ``train_data`` and then reused to encode ``test_data``, so both
    matrices share the training vocabulary. Shapes and the vocabulary size
    are printed along the way.

    Parameters
    ----------
    train_data : documents used to build the vocabulary; must expose ``.shape``.
    test_data : documents to encode with that vocabulary; must expose ``.shape``.

    Returns
    -------
    (cv_train_data, cv_test_data) : the two count matrices.
    """
    cv1 = CountVectorizer(input='content', analyzer = 'word', ngram_range=(1,1), preprocessor = better_preprocessor)
    cv_train_data = cv1.fit_transform(train_data)
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('train_data.shape %s' % (train_data.shape,))
    print('cv_train_data.shape: %s' % (cv_train_data.shape,))

    # size of the vocabulary learned from the training data
    print('len(featureNames): %d' % len(cv1.vocabulary_))

    # Encode the test data with the vectorizer fitted on the training data,
    # so the vocabulary and preprocessing match and nothing is fitted on the
    # test set.
    cv_test_data = cv1.transform(test_data)
    print('test_data.shape %s' % (test_data.shape,))
    print('cv_test_data.shape %s' % (cv_test_data.shape,))

    return cv_train_data, cv_test_data
    
def P3(train_data1, test_data1, train_data2, test_data2):
    """Vectorize two text sources and score their combined NB predictions.

    Each (train, test) pair goes through ``processData`` independently; the
    four resulting matrices are then passed to ``myMultinomialNB2`` with
    alpha fixed at 1.0.
    """
    first_train, first_test = processData(train_data1, test_data1)
    second_train, second_test = processData(train_data2, test_data2)

    myMultinomialNB2(first_train, first_test, second_train, second_test, 1.0)
  
# Combine the title-based and text-based MultinomialNB predictions.
P3(train_data_title, test_data_title, train_data_text, test_data_text)

train_data.shape (1000L,)
cv_train_data.shape: (1000, 2070)
len(featureNames): 2070
test_data.shape (1000L,)
cv_test_data.shape (1000, 2070)
train_data.shape (1000L,)
cv_train_data.shape: (1000, 6015)
len(featureNames): 6015
test_data.shape (1000L,)
cv_test_data.shape (1000, 6015)
-------------using MultinomialNB2 -2 data sets- with alpha= 1.0
-------------using MultinomialNB with alpha= 1.0
total: 1000  correct: 737  accuracy: 0.74
metrics.f1_score: 0.120401337793
-------------using MultinomialNB with alpha= 1.0
total: 1000  correct: 737  accuracy: 0.74
metrics.f1_score: 0.0899653979239
total: 1000  correct: 715  accuracy: 0.71
metrics.f1_score: 0.1642228739


In [None]:
# Needs more work: combining title and text does not improve the result.