In [84]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn import metrics

In [85]:
# load the labelled training set and the test set
df_train = pd.read_json('train.json')
df_test = pd.read_json('test.json')


# the target column supplies the labels; every remaining column stays in the data
classification_column_name = 'requester_received_pizza'

all_labels = df_train[classification_column_name]
all_features = df_train.drop([classification_column_name], axis=1)


# hold out part of the training data as a dev data set; no test_size is passed,
# so train_test_split falls back to its default of twenty-five percent
# note that the test data set cannot be used here, because its labels are not given
dev_split = train_test_split(all_features, all_labels, random_state=42)
train_data, dev_data, train_labels, dev_labels = dev_split

In [124]:
def decimal_to_percent(decimal):
    ''' Scale a fraction by 100 and round the result to two decimal places '''
    percent = decimal * 100
    return round(percent, 2)

def logistic_regression(text_column='request_text'):
    ''' Use a default logistic regression model to determine baseline performance

    Word counts are built from one text column of the module-level train_data
    and dev_data frames; the model is fit against train_labels and evaluated
    against dev_labels.

    text_column: name of the column whose text is vectorized
                 (defaults to the text of the post)

    Returns a tuple (accuracy, f1): accuracy is the fraction of dev rows
    predicted correctly, f1 holds one F1 score per class (average=None).
    '''

    train_text = train_data[text_column]
    dev_text = dev_data[text_column]

    # construct the term-frequency count matrix; the vocabulary is fit on the
    # training text only, the dev text is merely transformed with it
    tf_vect = CountVectorizer()
    tf_train = tf_vect.fit_transform(train_text)
    tf_dev = tf_vect.transform(dev_text)

    # train a logistic regression model
    clf = LogisticRegression()
    clf.fit(tf_train, train_labels)

    # predict once and derive both metrics from the same predictions
    predicted = clf.predict(tf_dev)

    # scikit-learn metrics expect the true labels first, then the predictions
    accuracy = metrics.accuracy_score(dev_labels, predicted)
    f1 = metrics.f1_score(dev_labels, predicted, average=None)

    return accuracy, f1
    
# run the baseline model and report its dev-set metrics
accuracy, f1_score = logistic_regression()

# single-argument print(...) calls give the same output on Python 2 and Python 3;
# print('') replaces the bare Python 2 `print` that emitted the blank line
print('The accuracy of a default logistic regression model is {}%'.format(decimal_to_percent(accuracy)))
print('')
print('The F1 scores are:\nFalse: {}\nTrue: {}'.format(*[decimal_to_percent(score) for score in f1_score]))

The accuracy of a default logistic regression model is 69.7%

The F1 scores are:
False: 81.02
True: 25.0
