In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import pandas as pd
import time

# Trying Different Models

In [2]:
# Read the two CSV exports of the data set into DataFrames:
# one with normalized values and one with the raw (non-normalized) values.
normalized_dataset = pd.read_csv('normalized data.csv')
non_normalized_dataset = pd.read_csv('non-normalized data.csv')

In [3]:
# Split into labels and data.
# The label column is selected and then dropped from a copy rather than
# pop()'d: pop() removes the column from normalized_dataset in place, so
# running this cell a second time raised a KeyError. Leaving the source frame
# untouched makes the cell safe to re-run.
normalized_labels = normalized_dataset['Request Status'].values
normalized_data = normalized_dataset.drop(columns='Request Status').values

In [4]:
# Prepare the KFold object.
# shuffle=True assigns rows to folds at random, so the seed is pinned here;
# without random_state the folds (and therefore every reported accuracy)
# change each time the notebook is re-run.
RANDOM_SEED = 42
kf = KFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

In [5]:
def create_result_dict(name, model):
    """
      Build an empty results record for a single model.

      The record stores the model's name and the model object, plus, for each
      measured quantity, an empty list of per-run samples and an average slot
      that starts out as None.

      Args:
        name: name of the model being evaluated
        model: the model being evaluated (stored as given)

      Returns:
        A new dict; every call gets its own fresh sample lists.
    """
    record = {'name': name, 'model': model}
    # Each measured quantity gets a list of samples followed by its average,
    # inserted in that order so the record prints in a stable layout.
    measured = (
        ('training_times', 'average_training_time'),
        ('testing_times', 'average_testing_time'),
        ('accuracies', 'average_accuracy'),
    )
    for samples_key, average_key in measured:
        record[samples_key] = []
        record[average_key] = None
    return record

In [6]:
# One result record per classifier to compare. The estimator classes
# themselves are stored (not instances), all with default parameters.
models = [
    create_result_dict(display_name, estimator_class)
    for display_name, estimator_class in (
        ('Random Forest', RandomForestClassifier),
        ('Perceptron', Perceptron),
        ('Logistic Regression', LogisticRegression),
        ('MultiLayer Perceptron', MLPClassifier),
    )
]

In [7]:
# Train each model over each fold in the kfold and gather their performance.

# Start from empty sample lists so that re-running this cell replaces the
# measurements instead of appending a second set of folds to the first.
for model in models:
    model['training_times'].clear()
    model['testing_times'].clear()
    model['accuracies'].clear()

for train_index, test_index in kf.split(normalized_data):
    # Slice the fold once, outside the model loop: the slices are the same for
    # every model, and doing it here keeps the array-copy cost out of the
    # timed regions below.
    train_data = normalized_data[train_index]
    train_labels = normalized_labels[train_index]
    test_data = normalized_data[test_index]
    test_labels = normalized_labels[test_index]

    # For each model, create a new one and gather information about its performance
    for model in models:
        # Instantiate the stored class so every fold starts from an untrained estimator.
        fresh_model = model['model']()

        # Train the model.
        # time.perf_counter() is used instead of time.time(): it is a
        # monotonic, high-resolution clock intended for measuring short
        # intervals, whereas time.time() is wall-clock time with coarser
        # resolution on some platforms.
        initial_train_time = time.perf_counter()
        fresh_model.fit(train_data, train_labels)
        model['training_times'].append(time.perf_counter() - initial_train_time)

        # Test the model
        initial_test_time = time.perf_counter()
        predicted_labels = fresh_model.predict(test_data)
        model['testing_times'].append(time.perf_counter() - initial_test_time)

        # Accuracy is computed outside the timed region.
        model['accuracies'].append(accuracy_score(test_labels, predicted_labels))

models

[{'name': 'Random Forest',
  'model': sklearn.ensemble._forest.RandomForestClassifier,
  'training_times': [5.921321392059326,
   5.7857208251953125,
   5.743280410766602,
   5.737278938293457,
   5.691267251968384,
   5.7722859382629395,
   5.802292585372925,
   5.746282577514648,
   5.810157775878906,
   5.841301679611206],
  'average_training_time': None,
  'testing_times': [0.08101701736450195,
   0.0800175666809082,
   0.07701683044433594,
   0.07901716232299805,
   0.08001852035522461,
   0.0780179500579834,
   0.0800180435180664,
   0.08301854133605957,
   0.07901763916015625,
   0.07901740074157715],
  'average_testing_time': None,
  'accuracies': [0.8921952492821718,
   0.8885408509527538,
   0.8898459932132603,
   0.8835813103628295,
   0.8885408509527538,
   0.8841033672670321,
   0.8825065274151436,
   0.8892950391644908,
   0.889556135770235,
   0.8900783289817232],
  'average_accuracy': None},
 {'name': 'Perceptron',
  'model': sklearn.linear_model._perceptron.Perceptron,

In [10]:
# Fill in the average slot of every result record from its list of samples
# (arithmetic mean: sum divided by count), then print the completed record.
for model in models:
    for samples_key, average_key in (
        ('training_times', 'average_training_time'),
        ('testing_times', 'average_testing_time'),
        ('accuracies', 'average_accuracy'),
    ):
        samples = model[samples_key]
        model[average_key] = sum(samples)/len(samples)
    print(model)

{'name': 'Random Forest', 'model': <class 'sklearn.ensemble._forest.RandomForestClassifier'>, 'training_times': [5.921321392059326, 5.7857208251953125, 5.743280410766602, 5.737278938293457, 5.691267251968384, 5.7722859382629395, 5.802292585372925, 5.746282577514648, 5.810157775878906, 5.841301679611206], 'average_training_time': 5.785118937492371, 'testing_times': [0.08101701736450195, 0.0800175666809082, 0.07701683044433594, 0.07901716232299805, 0.08001852035522461, 0.0780179500579834, 0.0800180435180664, 0.08301854133605957, 0.07901763916015625, 0.07901740074157715], 'average_testing_time': 0.07961766719818116, 'accuracies': [0.8921952492821718, 0.8885408509527538, 0.8898459932132603, 0.8835813103628295, 0.8885408509527538, 0.8841033672670321, 0.8825065274151436, 0.8892950391644908, 0.889556135770235, 0.8900783289817232], 'average_accuracy': 0.8878243653362394}
{'name': 'Perceptron', 'model': <class 'sklearn.linear_model._perceptron.Perceptron'>, 'training_times': [0.0140037536621093

Calculate averages with cross-validation, and measure the time it takes to train and run each model, as this is a time-sensitive task. A prediction system that is 100% accurate but takes a year to predict is worthless when you have 10 minutes until the rider shows up for the ride.

Consider three things about each model: train time, run time, and accuracy. Why? Presumably this dataset expands year after year, and they could either want to (A) build a new model on last year's data or (B) train/update a model over all current data. If this update is done every month, then a slow train time could mean the software is down for hours, which would make it not worthwhile. Accuracy is obvious: an inaccurate model is useless. Run time: see above.

Note: the times for random forest are a bit skewed. Every other model requires scaling and normalizing, but random forest wouldn't need such things and thus could run faster. However, they all still need the data to be changed and cleaned down to only what is required.