In [13]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [14]:
# import the data
df_train = pd.read_json('train.json')
df_test = pd.read_json('test.json')


# drop the target column from the data and use it for the labels
classification_column_name = 'requester_received_pizza'

# keep the full labelled set under its own names so the split below
# does not overwrite its own inputs
all_data = df_train.drop([classification_column_name], axis=1)
all_labels = df_train[classification_column_name]


# use twenty-five percent of the training data for a dev data set
# (test_size is spelled out so the split matches this comment instead of
# silently depending on the library default)
# note that we cannot use the test data set here, because we are not given their labels
train_data, dev_data, train_labels, dev_labels = train_test_split(
    all_data, all_labels, test_size=0.25, random_state=42)

In [15]:
def decimal_to_percent(decimal):
    ''' Convert a fraction (e.g. 0.697) to a percentage rounded to two decimal places '''
    percent = decimal * 100
    return round(percent, 2)

def vectorize(text_column='request_text'):
    ''' Construct term-frequency matrices for use in models.

    Fits a CountVectorizer on one text column of the global ``train_data``
    and uses the learned vocabulary to encode that column of both
    ``train_data`` and ``dev_data``.

    Args:
        text_column: name of the column holding the raw text. Defaults to
            'request_text', i.e. just the text of the post.

    Returns:
        (tf_train, tf_dev): the term-count matrices for the train and dev
        text. The same two matrices are also stored in the globals
        ``tf_train`` and ``tf_dev``, which the model helpers read by name.
    '''

    # the matrices remain global variables because other cells use them directly
    global tf_train
    global tf_dev

    train_text = train_data[text_column]
    dev_text = dev_data[text_column]

    # learn the vocabulary from the training text only and encode it in the
    # same pass; the dev text is then encoded with that fixed vocabulary
    tf_vect = CountVectorizer()
    tf_train = tf_vect.fit_transform(train_text)
    tf_dev = tf_vect.transform(dev_text)

    return tf_train, tf_dev

# bind the result so the cell does not echo the returned matrices
tf_train, tf_dev = vectorize()

In [16]:
def train_and_evaluate_default_model(model):
    ''' Train a model on the training matrix and score it on the dev matrix.

    Reads the globals ``tf_train`` / ``train_labels`` for fitting and
    ``tf_dev`` / ``dev_labels`` for evaluation.

    Args:
        model: an unfitted classifier exposing ``fit`` and ``predict``.
            It is fitted in place.

    Returns:
        (accuracy, f1_score): ``accuracy`` is the fraction of dev examples
        predicted correctly; ``f1_score`` holds one F1 value per class
        (``average=None``). The reporting helper treats the two entries as
        False then True, in that order.
    '''

    clf = model
    clf.fit(tf_train, train_labels)

    # predict once and derive both metrics from the same predictions
    predicted = clf.predict(tf_dev)

    # scikit-learn metrics take the true labels first, then the predictions
    accuracy = metrics.accuracy_score(dev_labels, predicted)
    f1_score = metrics.f1_score(dev_labels, predicted, average=None)

    return accuracy, f1_score

def print_model_scores(model_type, accuracy, f1_score):
    ''' Print the accuracy and f1 scores as percentages.

    Args:
        model_type: display name of the model, inserted into the report text.
        accuracy: accuracy as a fraction (e.g. 0.697).
        f1_score: sequence of two per-class F1 fractions, reported as the
            False class followed by the True class.
    '''

    # single-argument print(...) parses and prints identically under
    # Python 2 and Python 3, unlike the print statement
    print('The accuracy of a default {} model is {}%\n'.format(model_type, decimal_to_percent(accuracy)))
    print('The F1 scores are:\nFalse: {}\nTrue: {}\n'.format(*[decimal_to_percent(score) for score in f1_score]))

In [17]:
def logistic_regression():
    ''' Baseline: train and evaluate a LogisticRegression with default settings '''

    baseline_model = LogisticRegression()
    return train_and_evaluate_default_model(baseline_model)

def naive_bayes():
    ''' Baseline: train and evaluate a BernoulliNB with default settings '''

    baseline_model = BernoulliNB()
    return train_and_evaluate_default_model(baseline_model)
    
def decision_tree():
    ''' Baseline: train and evaluate a DecisionTreeClassifier with default settings '''

    baseline_model = DecisionTreeClassifier()
    return train_and_evaluate_default_model(baseline_model)


# pair each baseline trainer with the name used in its printed report
baseline_runs = (
    (logistic_regression, 'Logistic Regression'),
    (naive_bayes, 'Naive Bayes'),
    (decision_tree, 'Decision Tree'),
)

# train, evaluate and report every baseline in turn
for function, model_name in baseline_runs:
    accuracy, f1_score = function()
    print_model_scores(model_name, accuracy, f1_score)

The accuracy of a default Logistic Regression model is 69.7%

The F1 scores are:
False: 81.02
True: 25.0

The accuracy of a default Naive Bayes model is 71.39%

The F1 scores are:
False: 82.08
True: 28.99

The accuracy of a default Decision Tree model is 64.36%

The F1 scores are:
False: 76.53
True: 25.93

