# Random Acts of Pizza Baseline

### Divyang Prateek, Brennan Borlaug, Cory Kind

###### Importing and structuring data

Start by importing relevant libraries for storing and analyzing data.

In [1]:
import pandas as pd
import json as js
import random
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import *
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn import preprocessing

Read in JSON file of training data.

In [2]:
#Read the training JSON as a string; the with-block closes the file handle once the text is loaded
with open("train.json") as train_file:
    data2 = train_file.read()
#Convert the JSON string to a list of dictionaries
jsondata2 = js.loads(data2)

The RAOP data contains a variety of predictors of different formats. This step puts variables into separate categories for text and numeric, and creates an array for the outcome we are trying to predict ("requester_received_pizza"). We decided it was easier to work with text and numeric variables separately at this stage.


NOTE that the following variables are not currently imported because they require extra processing. They will be addressed at a later point, but are not required for the baseline.

1) requester_subreddits_at_request (returns an array)

2) unix timestamp of request (date format)

In [3]:
#numeric variables
numeric_variables = ['number_of_downvotes_of_request_at_retrieval',
    'number_of_upvotes_of_request_at_retrieval',
    'post_was_edited',
    'request_number_of_comments_at_retrieval',
    'requester_account_age_in_days_at_request',
    'requester_account_age_in_days_at_request',
    'requester_account_age_in_days_at_retrieval',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_days_since_first_post_on_raop_at_retrieval',
    'requester_number_of_comments_at_request',
    'requester_number_of_comments_at_retrieval',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_comments_in_raop_at_retrieval',
    'requester_number_of_posts_at_request',
    'requester_number_of_posts_at_retrieval',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_posts_on_raop_at_retrieval',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_minus_downvotes_at_retrieval',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_retrieval',
    'unix_timestamp_of_request_utc']

#requester variables (from time of request)
requester_variables = ['requester_account_age_in_days_at_request',
                      'requester_days_since_first_post_on_raop_at_request',
                      'requester_number_of_comments_at_request',
                      'requester_number_of_comments_in_raop_at_request',
                      'requester_number_of_posts_at_request',
                      'requester_number_of_posts_on_raop_at_request',
                      'requester_number_of_subreddits_at_request',
                      'requester_upvotes_minus_downvotes_at_request',
                      'requester_upvotes_plus_downvotes_at_request']


#text variables
text_variables = ['giver_username_if_known',
    'request_id',
    'request_text',
    'request_text_edit_aware',
    'request_title',
    'requester_user_flair',
    'requester_username']

#Creating empty data frames to store the training data
numeric_elements = pd.DataFrame(np.nan, index = range(len(jsondata2)), columns = numeric_variables)
text_elements = pd.DataFrame(np.nan, index = range(len(jsondata2)), columns = text_variables)
requester_elements = pd.DataFrame(np.nan, index = range(len(jsondata2)), columns = requester_variables)
outcome = pd.DataFrame(np.nan, index = range(len(jsondata2)), columns = ['requester_received_pizza'])

#Print the number of text and numeric predictors currently included
print "Number of numeric variables: ", len(numeric_elements.columns)
print "Number of text variables: ", len(text_elements.columns)

Number of numeric variables:  23
Number of text variables:  7


The next step is to fill these arrays from the JSON data. Although the loop approach is less efficient at large scale, we went this direction because the number of keys varies between cases in the data.

In [4]:
#Copy every recognised field of every request into the matching data frame.
#The membership tests are independent because a field can belong to more than one
#group (the requester variables are also listed among the numeric variables).
for row, request in enumerate(jsondata2):
    for field, value in request.items():
        if field in numeric_variables:
            numeric_elements.iloc[row, numeric_variables.index(field)] = value
        if field in requester_variables:
            requester_elements.iloc[row, requester_variables.index(field)] = value
        if field in text_variables:
            text_elements.iloc[row, text_variables.index(field)] = value
        if field == 'requester_received_pizza':
            outcome.iloc[row, 0] = value


This is a quick check on the size of these arrays - the number of columns should match the number of text and numeric predictors determined above.

In [8]:
#Output shapes of numeric, text, and outcome arrays
print "Numeric array:"
print numeric_elements.shape
print 

print "Requester array:"
print requester_elements.shape
print

print "Text array:"
print text_elements.shape
print

print "Outcome array:"
print outcome.shape
print

Numeric array:
(4040, 23)

Requester array:
(4040, 9)

Text array:
(4040, 7)

Outcome array:
(4040, 1)



Here we split out a dev set from the provided training data (80/20). There is no need to separate out a test set, since that is provided by Kaggle in a separate JSON file. To compare our results to other competitors in the Kaggle competition, we will need to use that test set.

In [9]:
random.seed(500)
data_size = len(jsondata2)
dev_indices = random.sample(range(data_size), data_size / 5)
train_indices = list(set(range(data_size)) - set(dev_indices))

#Define training & dev sets
train_requester_feats = requester_elements.ix[train_indices,]
train_outcomes = outcome.ix[train_indices,].astype(int).sum(axis = 1)
dev_requester_feats = requester_elements.ix[dev_indices,]
dev_outcomes = outcome.ix[dev_indices,].astype(int).sum(axis = 1)

print "Number of training cases: ", len(train_indices)
print "Number of dev cases: ", len(dev_indices)

Number of training cases:  3232
Number of dev cases:  808


###### Creating features for the baseline model

Now that the set-up is over, we can start using the text of the request to extract more interesting predictors. As a baseline, we're going to build a logistic regression model based on the word counts from the request text alone.

In [None]:
#Pull out the request text and outcomes for training and dev sets
#.loc is used for label-based selection (.ix is deprecated)
train_request_text = text_elements.loc[train_indices, "request_text"]
dev_request_text = text_elements.loc[dev_indices, "request_text"]

#Outcomes as 0/1 integers, in the same row order as the text above
train_outcome = outcome.loc[train_indices, 'requester_received_pizza'].astype(int)
dev_outcome = outcome.loc[dev_indices, 'requester_received_pizza'].astype(int)

In [None]:
#Create CountVectorizer object with no preprocessing, but include basic English stop words
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = "english")
train_data_features = vectorizer.fit_transform(train_request_text)
train_vocab = vectorizer.get_feature_names()

#Use train_vocab to extract the same features from the dev set
vectorizer_dev = CountVectorizer(analyzer = "word", tokenizer= None, preprocessor = None, stop_words = "english", vocabulary = train_vocab)
dev_data_features = vectorizer_dev.fit_transform(dev_request_text)

print "The length of the vocabulary using this basic model is: ", str(len(train_vocab))

###### Fitting a logistic regression model and printing confusion matrix and classification report

In [None]:
#Fit L2 Logistic Regression model
log_regression = LogisticRegression(penalty = "l2", C = 1)
log_regression.fit(train_data_features, train_outcome)
dev_predicted_labels = log_regression.predict(dev_data_features)

#Print confusion matrix and classification report
print "Confusion matrix on dev data for Logistic Regression model no processing, using only request_text: "
print metrics.confusion_matrix(dev_outcome, dev_predicted_labels, labels = [0,1])
print

print "Classification report: "
print metrics.classification_report(dev_outcome, dev_predicted_labels, labels = [0, 1])

We are pretty happy with this as a first attempt. There is a lot of room for growth, but even with our very basic model we're seeing decent initial results.

###### Directions for future analysis

As a quick check of our initial model, we pulled out the 20 unigrams with the largest weights. This can also help us think through what patterns to look for in the data.

In [None]:
#Extracting 20 largest weights from the logistic regression model and printing
weights = log_regression.coef_
top_weights = np.argpartition(weights[0,], -19)[-20:]

#Printing out features
print "Unigram Features with Largest Weights"
for j in top_weights:
    print str(train_vocab[j])

These words seem directionally correct based on our early reviews of the request_text variable and the kind of requests people make. "Rice" is often presented as an alternative to pizza (i.e., "I've been eating rice for a week - pizza would be a nice change"). A second note is that "op" is part of the Reddit lexicon. One hypothesis is that requesters who come across as insiders are more likely to get pizza. To test for that, we can include measures like # of subreddits and length of Reddit history. A couple of other ideas we have include:

- Time (seasonality, day of week, are people more likely to give at certain times of the month)
- Text (extracting predictors from request titles, including bigrams/trigrams, # of spelling errors, potentially sentiment analysis)
- Reddit behaviors (number of sub-reddits, length of time on Reddit, upvote/downvote differential)

### Baseline Model

Before we begin to develop a predictive model, we must establish a baseline to give ourselves a sense of what we are trying to achieve. In our application, the following models would serve as an effective baseline: 
    1. A model that always predicts the most frequent label (in this case, no pizza).
    2. A model that predicts outcomes with the probability of the mean response (e.g. 24.6% of requesters receive a pizza, therefore the model will predict positive outcomes 24.6% of the time).

In [None]:
y = 0
r = 0
outcomes = []
for request in jsondata2:
    if request['requester_received_pizza'] == True:
        y+=1
        r+=1
        outcomes.append(1)
    else:
        r+=1
        outcomes.append(0)
avg = float(y)/float(r)

#Baseline 1
base1 = [0]*len(jsondata2)
c = 0
n = 0
for i, j in zip(base1, outcomes):
    if i == j:
        c+=1
        n+=1
    else:
        n+=1
print 'Baseline 1 Accuracy:', round(float(c)/float(n),4)*100, '%'

#Baseline 2
base2 = np.random.binomial(1, avg, size=len(jsondata2))
c = 0
n = 0
for i, j in zip(base2, outcomes):
    if i == j:
        c+=1
        n+=1
    else:
        n+=1        
print 'Baseline 2 Accuracy:', round(float(c)/float(n),4)*100, '%'

### Logistic Regression

In [None]:
#Simple logistic regression model
clf = LogisticRegression(penalty = "l2")
clf.fit(train_requester_feats, train_outcomes)
preds = clf.predict(dev_requester_feats)
probs = clf.predict_proba(dev_requester_feats)

pred_probs=[]
for prob in probs:
    pred_probs.append(max(prob[0], prob[1]))

#print pd.DataFrame(zip(preds,pred_probs))

print 'Simple Regression Model Accuracy:', round(metrics.accuracy_score(dev_outcomes, preds),4)*100, '%','\n'
print 'Confusion Matrix:'
print metrics.confusion_matrix(dev_outcomes, preds), '\n'
print 'Classification Report:'
print metrics.classification_report(dev_outcomes, preds)

In [None]:
#Examine the coefficients
print pd.DataFrame(zip(requester_variables, np.transpose(clf.coef_)), 
                   columns=['features', 'coefs']).sort_values('coefs', ascending = False)

In [None]:
#Tune hyperparameter C
params = np.arange(0.05, 5, 0.05)
acc = []

for c in params:
    clf = LogisticRegression(penalty = "l2", C=c)
    clf.fit(train_requester_feats, train_outcomes)
    preds = clf.predict(dev_requester_feats)
    acc.append(metrics.accuracy_score(dev_outcomes, preds))

optimal_c = params[acc.index(max(acc))]
print 'Logistic Regression:'
print 'optimal c: ', optimal_c, '; accuracy: ', round(max(acc), 4)*100, '%'

Well that wasn't very enlightening. I'm pretty sure C defaults to 1.0...

Now I'll see if it helps to normalize the features (so they are on the same scale - Z-scores).

In [None]:
#Create new dataframe to store scaled training features
norm_train_feats = pd.DataFrame()

norm_train_feats['requester_account_age_in_days_at_request'] = preprocessing.scale(train_requester_feats['requester_account_age_in_days_at_request'])
norm_train_feats['requester_days_since_first_post_on_raop_at_request'] = preprocessing.scale(train_requester_feats['requester_days_since_first_post_on_raop_at_request'])
norm_train_feats['requester_number_of_comments_at_request'] = preprocessing.scale(train_requester_feats['requester_number_of_comments_at_request'])
norm_train_feats['requester_number_of_comments_in_raop_at_request'] = preprocessing.scale(train_requester_feats['requester_number_of_comments_in_raop_at_request'])                      
norm_train_feats['requester_number_of_posts_at_request'] = preprocessing.scale(train_requester_feats['requester_number_of_posts_at_request'])    
norm_train_feats['requester_number_of_posts_on_raop_at_request'] = preprocessing.scale(train_requester_feats['requester_number_of_posts_on_raop_at_request'])                      
norm_train_feats['requester_number_of_subreddits_at_request'] = preprocessing.scale(train_requester_feats['requester_number_of_subreddits_at_request'])    
norm_train_feats['requester_upvotes_minus_downvotes_at_request'] = preprocessing.scale(train_requester_feats['requester_upvotes_minus_downvotes_at_request'])
norm_train_feats['requester_upvotes_plus_downvotes_at_request'] = preprocessing.scale(train_requester_feats['requester_upvotes_plus_downvotes_at_request'])                     

#Create dataframe for scaled dev features
norm_dev_feats = pd.DataFrame()

norm_dev_feats['requester_account_age_in_days_at_request'] = preprocessing.scale(dev_requester_feats['requester_account_age_in_days_at_request'])
norm_dev_feats['requester_days_since_first_post_on_raop_at_request'] = preprocessing.scale(dev_requester_feats['requester_days_since_first_post_on_raop_at_request'])
norm_dev_feats['requester_number_of_comments_at_request'] = preprocessing.scale(dev_requester_feats['requester_number_of_comments_at_request'])
norm_dev_feats['requester_number_of_comments_in_raop_at_request'] = preprocessing.scale(dev_requester_feats['requester_number_of_comments_in_raop_at_request'])                      
norm_dev_feats['requester_number_of_posts_at_request'] = preprocessing.scale(dev_requester_feats['requester_number_of_posts_at_request'])    
norm_dev_feats['requester_number_of_posts_on_raop_at_request'] = preprocessing.scale(dev_requester_feats['requester_number_of_posts_on_raop_at_request'])                      
norm_dev_feats['requester_number_of_subreddits_at_request'] = preprocessing.scale(dev_requester_feats['requester_number_of_subreddits_at_request'])    
norm_dev_feats['requester_upvotes_minus_downvotes_at_request'] = preprocessing.scale(dev_requester_feats['requester_upvotes_minus_downvotes_at_request'])
norm_dev_feats['requester_upvotes_plus_downvotes_at_request'] = preprocessing.scale(dev_requester_feats['requester_upvotes_plus_downvotes_at_request'])                     

#Create logistic regression model with normalized features
norm_clf = LogisticRegression(penalty = "l2")
norm_clf.fit(norm_train_feats, train_outcomes)
norm_preds = norm_clf.predict(norm_dev_feats)

print 'Normalized Regression Model Accuracy:', round(metrics.accuracy_score(dev_outcomes, norm_preds),4)*100, '%'


Normalizing the features did not improve the model, but it puts the coefficients on a common scale, which makes it easier to see which features have the greatest predictive power:

In [None]:
print pd.DataFrame(zip(requester_variables, np.transpose(norm_clf.coef_)), 
                   columns=['features', 'coefs']).sort_values('coefs', ascending = False)

### Nearest Neighbors

In [None]:
knn = KNeighborsClassifier(algorithm='auto', n_neighbors=11) #tested 1,3,5,7,9,11,13,& 15 neighbors to select best parameter
knn.fit(train_requester_feats, train_outcomes)
knn_preds = knn.predict(dev_requester_feats)

norm_knn = KNeighborsClassifier()
norm_knn.fit(norm_train_feats, train_outcomes)
norm_knn_preds = norm_knn.predict(norm_dev_feats)

print 'Nearest Neighbors Model Accuracy:', round(metrics.accuracy_score(dev_outcomes, knn_preds),4)*100, '%'
print 'Nearest Neighbors (Norm. Feats) Model Accuracy:', round(metrics.accuracy_score(dev_outcomes, norm_knn_preds),4)*100, '%'


### Nearest Centroid

The Nearest Centroid classifier is a simple algorithm that represents each class by the centroid of its members. A new set of features is classified based on its distance from the centroids of the features in the training set.

In [None]:
nc = NearestCentroid()
nc.fit(train_requester_feats, train_outcomes)
nc_preds = nc.predict(dev_requester_feats)

norm_nc = NearestCentroid()
norm_nc.fit(norm_train_feats, train_outcomes)
norm_nc_preds = norm_nc.predict(norm_dev_feats)

print 'Nearest Centroid Model Accuracy:', round(metrics.accuracy_score(dev_outcomes, nc_preds),4)*100, '%'
print 'Nearest Centroid (Norm. Feats) Model Accuracy:', round(metrics.accuracy_score(dev_outcomes, norm_nc_preds),4)*100, '%'

### Random Forests

In [None]:
estimators=[]
accuracies=[]

for i in range(1,30):
    rf = RandomForestClassifier(n_estimators=i, random_state=99)
    rf.fit(train_requester_feats, train_outcomes)
    rf_preds=rf.predict(dev_requester_feats)
    acc = metrics.accuracy_score(dev_outcomes, rf_preds)
    estimators.append(i)
    accuracies.append(acc)

max_acc = max(accuracies)
est = estimators[accuracies.index(max_acc)]
print 'Random Forests Model Accuracy (', est, 'estimators ):', round(max_acc,4)*100, '%'

### Ensemble Learner
Lastly, I will test a model that takes the most frequent prediction from the logistic regression, nearest neighbors (n=11), & random forest (n_estimators=26) models outlined above as its prediction.

In [None]:
#Logistic Regression
reg = LogisticRegression(penalty="l2", C=1.0)
reg.fit(train_requester_feats, train_outcomes)
reg_preds = clf.predict(dev_requester_feats)

#Nearest Neighbors
knn = KNeighborsClassifier(algorithm='auto', n_neighbors=11)
knn.fit(train_requester_feats, train_outcomes)
knn_preds = knn.predict(dev_requester_feats)

#Random Forests
rf = RandomForestClassifier(n_estimators=26, random_state=99)
rf.fit(train_requester_feats, train_outcomes)
rf_preds=rf.predict(dev_requester_feats)

ensemble_preds=[]

for i,j,k in zip(reg_preds, knn_preds, rf_preds):
    pred=[]
    pred.append(i), pred.append(j), pred.append(k)
    ensemble_preds.append(max(set(pred)))
    
print 'Ensemble Learner Model Accuracy:', round(metrics.accuracy_score(dev_outcomes, ensemble_preds),4)*100, '%'


To summarize, here are the models, ranked in terms of accuracy, for requester features: 
  
1) Logistic regression (non-normalized feats) - 75.12%  
2) Random forests - 73.64%  
3) Nearest neighbors - 73.02%  
4) Ensemble learner - 72.77%  
5) Nearest centroid - 67.7%  
  
All 5 models were better than guessing based on the outcome distribution (63.37%) but were not better than guessing the most common outcome (75.4%).

In [15]:
#Extracting the numerical data and converting to datetime.
#.loc selects rows by index label; 'int64' gives whole-second timestamps.
#utcfromtimestamp interprets the timestamp in UTC, so the derived month/day do not depend
#on the time zone of the machine running the notebook.
#NOTE(review): the column name suggests the value is a UTC timestamp - confirm against the dataset description.
train_request_time = numeric_elements.loc[train_indices,"unix_timestamp_of_request_utc"].astype('int64')
train_request_dateTime = [datetime.datetime.utcfromtimestamp(stamp) for stamp in train_request_time]
dev_request_time = numeric_elements.loc[dev_indices,"unix_timestamp_of_request_utc"].astype('int64')
dev_request_dateTime = [datetime.datetime.utcfromtimestamp(stamp) for stamp in dev_request_time]

In [18]:
train_month = np.asarray([time.month for time in train_request_dateTime]).reshape((len(train_request_time),1))
train_month_label = np.asarray(train_outcomes)

dev_month = np.asarray([time.month for time in dev_request_dateTime]).reshape((len(dev_request_time),1))
dev_month_label = np.asarray(dev_outcomes)

knn_clf  = KNeighborsClassifier()
knn_clf = knn_clf.fit(train_month,train_month_label)
print 'KNN Score :',knn_clf.score(dev_month,dev_month_label)

lr_clf = LogisticRegression(C=1)
lr_clf = lr_clf.fit(train_month,train_month_label)
print 'Logistic Regression Score :',lr_clf.score(dev_month,dev_month_label)


KNN Score : 0.706683168317
Logistic Regression Score : 0.738861386139


In [20]:
train_holiday_season = np.asarray([time.month >= 10 for time in train_request_dateTime]).reshape((len(train_request_time),1))
train_holiday_label = np.asarray(train_outcomes)

dev_holiday_season = np.asarray([time.month >= 10 for time in dev_request_dateTime]).reshape((len(dev_request_time),1))
dev_holiday_label = np.asarray(dev_outcomes)

knn_clf  = KNeighborsClassifier()
knn_clf = knn_clf.fit(train_holiday_season,train_holiday_label)
print 'KNN Score :',knn_clf.score(dev_holiday_season,dev_holiday_label)

lr_clf = LogisticRegression(C=1)
lr_clf = lr_clf.fit(train_holiday_season,train_holiday_label)
print 'Logistic Regression Score :',lr_clf.score(dev_holiday_season,dev_holiday_label)



KNN Score : 0.738861386139
Logistic Regression Score : 0.738861386139


In [22]:
train_payday_effect = np.asarray([(time.day > 26 or time.day<2) for time in train_request_dateTime]).reshape((len(train_request_time),1))
train_payday_label = np.asarray(train_outcomes)

dev_payday_effect = np.asarray([(time.day > 26 or time.day<2) for time in dev_request_dateTime]).reshape((len(dev_request_time),1))
dev_payday_label = np.asarray(dev_outcomes)

knn_clf_pe  = KNeighborsClassifier()
knn_clf_pe = knn_clf_pe.fit(train_payday_effect,train_payday_label)
print 'KNN Score :',knn_clf.score(dev_payday_effect,dev_payday_label)

lr_clf = LogisticRegression(C=1)
lr_clf = lr_clf.fit(train_payday_effect,train_payday_label)
print 'Logistic Regression Score :',lr_clf.score(dev_payday_effect,dev_payday_label)

print train_payday_effect.shape
    

KNN Score : 0.738861386139
Logistic Regression Score : 0.738861386139
(3232, 1)


In [23]:
def evaluate_features(train_data,train_labels, dev_data,dev_labels):
    #Nearest Neighbour Classifier\
    knn_scores=[]
    knn_estimators=[]
    for k in range(1,16):
        knn = KNeighborsClassifier(algorithm='auto', n_neighbors=k) #tested 1,3,5,7,9,11,13,& 15 neighbors to select best parameter
        knn.fit(train_data, train_labels)
        knn_preds = knn.predict(dev_data)
        knn_scores.append(round(metrics.accuracy_score(dev_outcomes, knn_preds),4)*100)
        knn_estimators.append(k)
    
    print 'Nearest Neighbors Model Accuracy:', max(knn_scores), '%', ' with n_neighbours:',knn_estimators[knn_scores.index(max(knn_scores))]
    
    #Nearest Centroid Classifier
    nc = NearestCentroid()
    nc.fit(train_data, train_labels)
    nc_preds = nc.predict(dev_data)
    
    print 'Nearest Centroid Model Accuracy:', round(metrics.accuracy_score(dev_labels, nc_preds),4)*100, '%'
    
    #Logistic Regression
    reg = LogisticRegression(penalty="l2", C=1.0)
    reg.fit(train_data, train_labels)
    reg_preds = reg.predict(dev_data)
    
    print 'Logistic Regression Model Accuracy:', round(metrics.accuracy_score(dev_labels, reg_preds),4)*100, '%'

    estimators=[]
    accuracies=[]

    for i in range(1,30):
        rf = RandomForestClassifier(n_estimators=i, random_state=99)
        rf.fit(train_requester_feats, train_outcomes)
        rf_preds=rf.predict(dev_requester_feats)
        acc = metrics.accuracy_score(dev_outcomes, rf_preds)
        estimators.append(i)
        accuracies.append(acc)

    max_acc = max(accuracies)
    est = estimators[accuracies.index(max_acc)]
    print 'Random Forests Model Accuracy (', est, 'estimators ):', round(max_acc,4)*100, '%'
    
    

In [25]:
#Evaluate the requester features, add the request month as one more feature, then evaluate again.
#NOTE(review): the recorded shape output already shows 10 columns before the assignment
#below runs, which suggests this cell was executed more than once - confirm with Restart & Run All.
print train_requester_feats.shape
evaluate_features(train_requester_feats,train_outcomes,dev_requester_feats,dev_outcomes)
#The new column is named 'unix_timestamp_of_request_utc' but holds train_month / dev_month
#(calendar month numbers), not a timestamp.
#The frames are modified in place, so any cell run after this one sees the extra column.
train_requester_feats['unix_timestamp_of_request_utc'] = train_month.astype('float64')
dev_requester_feats['unix_timestamp_of_request_utc'] = dev_month.astype('float64')
evaluate_features(train_requester_feats,train_outcomes,dev_requester_feats,dev_outcomes)

(3232, 10)
Nearest Neighbors Model Accuracy: 73.51 %  with n_neighbours: 10
Nearest Centroid Model Accuracy: 67.7 %
Logistic Regression Model Accuracy: 74.75 %
Random Forests Model Accuracy ( 28 estimators ): 72.9 %
Nearest Neighbors Model Accuracy: 73.51 %  with n_neighbours: 10
Nearest Centroid Model Accuracy: 67.7 %
Logistic Regression Model Accuracy: 74.75 %
Random Forests Model Accuracy ( 28 estimators ): 72.9 %
