This notebook trains a random forest model and also several different decision tree models.

In [2]:
import numpy as np
import pandas as pd
import zipfile
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier 

In [None]:
# Unzip data files into the "csv" subdirectory 
# (unless this has already been done since running the Data Set Up notebook)

# **IMPORTANT**  This will overwrite existing files in the "csv" folder in your local repo
# with the most recent data files from the data.zip file

# Each archive is opened in a `with` block so that it is closed even if
# extractall() raises part-way through (e.g. corrupt archive, full disk).
# extractall() is called without a path, so files land relative to the
# current working directory.

# Unzip 80% training data
with zipfile.ZipFile("data_subset.zip", "r") as unzip_training_data:
    unzip_training_data.extractall()

# Unzip development and training data
# NOTE(review): the archive is named testing.zip -- confirm which files it holds
with zipfile.ZipFile("testing.zip", "r") as unzip_test_data:
    unzip_test_data.extractall()

# Unzip full set of training data for creating predictions to submit to Kaggle
with zipfile.ZipFile("data.zip", "r") as unzip_all_data:
    unzip_all_data.extractall()

In [3]:
# Development-stage inputs, read from the csv folder into numpy arrays.
# Feature matrices use loadtxt's default dtype; label files are read as strings.

# -- feature matrices
train_data = np.loadtxt(
    'csv/train_data.csv',
    delimiter=",",
)
dev_data = np.loadtxt(
    'csv/dev_data.csv',
    delimiter=",",
)

# -- label vectors
train_labels = np.loadtxt(
    'csv/train_labels.csv',
    delimiter=",",
    dtype=str,
)
dev_labels = np.loadtxt(
    'csv/dev_labels.csv',
    delimiter=",",
    dtype=str,
)

In [3]:
# Submission-stage inputs (used for creating predictions to submit to Kaggle),
# read from the csv folder into numpy arrays.
# Feature matrices use loadtxt's default dtype; the label file is read as strings.

# -- feature matrices
train_data_all = np.loadtxt(
    'csv/train_data_all.csv',
    delimiter=",",
)
test_data_all = np.loadtxt(
    'csv/test_data_all.csv',
    delimiter=",",
)

# -- label vector (there is no label file for the test data)
train_labels_all = np.loadtxt(
    'csv/train_labels_all.csv',
    delimiter=",",
    dtype=str,
)

In [4]:
# Report the shape of every development-stage array, one line per array,
# so the shapes can be compared before and after csv conversion.
for label, array in (
    ("train_data", train_data),
    ("train_labels", train_labels),
    ("dev_data", dev_data),
    ("dev_labels", dev_labels),
):
    print(label, "shape is", array.shape)

train_data shape is (702439, 58)
train_labels shape is (702439,)
dev_data shape is (175610, 58)
dev_labels shape is (175610,)


In [5]:
# Report the shape of every submission-stage array, one line per array.
for label, array in (
    ("train_data_all", train_data_all),
    ("train_labels_all", train_labels_all),
    ("test_data_all", test_data_all),
):
    print(label, "shape is", array.shape)

train_data_all shape is (878049, 58)
train_labels_all shape is (878049,)
test_data_all shape is (884262, 58)


In [None]:
# IF there are additional changes to make to the data for this model
# that would be easier to do in pandas, uncomment and run this code. 
# This model works the same whether the data is in numpy or pandas, so presumably other models do too

#train_data = pd.DataFrame(train_data)
#train_labels = pd.DataFrame(train_labels)
#dev_data = pd.DataFrame(dev_data)
#dev_labels = pd.DataFrame(dev_labels)
#train_data_all = pd.DataFrame(train_data_all)
#train_labels_all = pd.DataFrame(train_labels_all)
#test_data_all = pd.DataFrame(test_data_all)

## Random Forest

In [7]:
# Set up functions for training random forest and finding optimal hyperparameters

def TrainRF(data, labels, test_data, depth, n=10, random_state=None, n_jobs=1):
    """Train a random forest and predict class probabilities for test_data.

    Parameters
    ----------
    data, labels : training features and their class labels, passed to fit().
    test_data : features that predict_proba() is run on.
    depth : max_depth given to the forest (can dramatically influence performance).
    n : number of random decision trees to create (n_estimators).  Defaults to 10.
    random_state : optional seed forwarded to RandomForestClassifier.  The default
        of None keeps the original unseeded behaviour, where scores vary from run
        to run; pass an int to make a run repeatable.
    n_jobs : number of parallel jobs forwarded to RandomForestClassifier.
        Defaults to 1, the value that used to be hard-coded here.

    Returns
    -------
    (RF, pp) : the fitted model and the predicted probabilities for test_data.
    """
    RF = RandomForestClassifier(n_estimators=n,
                                max_depth=depth,
                                n_jobs=n_jobs,
                                random_state=random_state)
    RF.fit(data, labels)
    pp = RF.predict_proba(test_data)
    return RF, pp

def find_max_depth(data, labels, dev_data, dev_labels, depth_values):
    """Find optimal value of max_depth in a random forest model.

    For every value in depth_values, a forest is trained with TrainRF (using its
    default number of trees) and the log loss of its predictions on dev_data is
    printed.  Nothing is returned.

    Note that this cannot be used on test data from Kaggle
    because we do not have labels for that data.  This function is intended to only be used
    in the development stage with the development data.
    """
    for d in depth_values:
        RF, pp = TrainRF(data, labels, dev_data, d)
        # The columns of pp follow RF.classes_.  Passing them explicitly keeps
        # log_loss from inferring the class set from dev_labels, which fails when
        # the development split happens to lack a class seen in training.
        logloss = metrics.log_loss(dev_labels, pp, labels=RF.classes_)

        # Print log loss for each value
        print("For max_depth =", d, "the Log Loss score is", round(logloss, 6))

def find_n(data, labels, dev_data, dev_labels, depth, n_values):
    """Find optimal value of n in a random forest model.

    For every value in n_values, a forest with that many trees and the given
    max depth is trained with TrainRF and the log loss of its predictions on
    dev_data is printed.  Nothing is returned.

    Note that this cannot be used on test data from Kaggle
    because we do not have labels for that data.  This function is intended to only be used
    in the development stage with the development data.
    """
    for n in n_values:
        RF, pp = TrainRF(data, labels, dev_data, depth, n)
        # The columns of pp follow RF.classes_.  Passing them explicitly keeps
        # log_loss from inferring the class set from dev_labels, which fails when
        # the development split happens to lack a class seen in training.
        logloss = metrics.log_loss(dev_labels, pp, labels=RF.classes_)

        # Print log loss for each value
        print("For n =", n, "the Log Loss score is", round(logloss, 6))

In [44]:
# Try out some different values of max_depth
# (coarse sweep: one forest is trained per value and its dev-set log loss is printed)
#
# NOTE: the Boosted Decision Tree section further down defines another
# find_max_depth with a different signature.  If that cell has already been run
# in this kernel, re-run the Random Forest function cell first so that the
# five-argument version is the one in scope.
values = [1, 3, 5, 7, 9, 15, 17, 19, 21]
find_max_depth(train_data, train_labels, dev_data, dev_labels, values)

For max_depth = 1 the Log Loss score is 2.651556
For max_depth = 3 the Log Loss score is 2.61752
For max_depth = 5 the Log Loss score is 2.578207
For max_depth = 7 the Log Loss score is 2.546061
For max_depth = 9 the Log Loss score is 2.511507
For max_depth = 15 the Log Loss score is 2.437328
For max_depth = 17 the Log Loss score is 2.447785
For max_depth = 19 the Log Loss score is 2.465811
For max_depth = 21 the Log Loss score is 2.55271




In [45]:
# Try out some more precise values of max_depth
# (fine sweep over consecutive depths from 14 to 18)
# NOTE: this needs the random-forest version of find_max_depth (five arguments);
# the boosted-tree section later rebinds that name.
values = [14, 15, 16, 17, 18]
find_max_depth(train_data, train_labels, dev_data, dev_labels, values)

For max_depth = 14 the Log Loss score is 2.450646
For max_depth = 15 the Log Loss score is 2.44434
For max_depth = 16 the Log Loss score is 2.439742
For max_depth = 17 the Log Loss score is 2.446666
For max_depth = 18 the Log Loss score is 2.449507




Result re: max depth:

Log loss varies a bit each time the model is trained.  
The best log loss is with max_depth set to 15, 16, or 17, so I chose 15 to keep the max_depth a bit smaller to improve speed of training the models.

In [52]:
# max_depth chosen from the sweeps above; the find_n cells below read this variable
depth = 15

In [63]:
# Find the optimal value of n using the optimal max depth and 
# the 80% training data and the development data
# (find_n trains one forest per entry of n_values and prints its dev-set log loss)

# NOTE: With the default max_depth of None and n_jobs = -1 (using all cores), higher values of n crashed my laptop
# Apparently this is common with random forests
# The fix I used is to limit the max_depth and limit n_jobs to 1 core (n_jobs=1 is the default)  --Laura

n_values = [10, 50, 100, 200, 300, 500, 750, 1000, 1500]
find_n(train_data, train_labels, dev_data, dev_labels, depth, n_values)

For n = 10 the Log Loss score is 2.438711
For n = 50 the Log Loss score is 2.412623
For n = 100 the Log Loss score is 2.410666
For n = 200 the Log Loss score is 2.406928
For n = 300 the Log Loss score is 2.406008
For n = 500 the Log Loss score is 2.405178
For n = 750 the Log Loss score is 2.405141
For n = 1000 the Log Loss score is 2.404154
For n = 1500 the Log Loss score is 2.403657




In [60]:
# Repeat higher values of n
# (one value per cell; this one re-trains with n = 500 so the score can be
# compared with the n = 500 line of the sweep above)
n_values = [500]
find_n(train_data, train_labels, dev_data, dev_labels, depth, n_values)

For n = 500 the Log Loss score is 2.403892




In [61]:
# Repeat the check with n = 1000, using the same max depth
n_values = [1000]
find_n(train_data, train_labels, dev_data, dev_labels, depth, n_values)

For n = 1000 the Log Loss score is 2.404119




In [62]:
# Try n = 2000, a value larger than any in the earlier sweep
n_values = [2000]
find_n(train_data, train_labels, dev_data, dev_labels, depth, n_values)

For n = 2000 the Log Loss score is 2.403935




Result re: n values:

Use n=50 or n=100 for testing hyperparameters for quick processing.  
Use at least n=500 for best log loss score.  
Results vary each time the model is run (because of randomness), so no single n value will always return the best score.  
Higher values are usually better, but anything over about n=500 or n=1000 will be close to the best it can get.

#### Random Forest results
Using max_depth of 15 and the highest n value practical (i.e., one that can be run in a reasonable amount of time) returns a better log loss score thus far than either KNN or logistic regression. 

In [8]:
# Train model with single n value and single max_depth value on full set of training data
# After this cell, pp holds the predicted probabilities for test_data_all,
# which is what the Kaggle submission cell at the end of the notebook reads.
# NOTE: the boosted-tree section also assigns the notebook-level name pp (with
# predictions for dev_data), so this cell must be the last one to set pp before
# a submission file is created.
depth = 15
n = 1000
RF, pp = TrainRF(train_data_all, train_labels_all, test_data_all, depth, n)

## Boosted Decision Tree

In [36]:
# Set up functions for boosted trees and finding optimal hyperparameters

def BoostedTree(data, labels, test_data, learn=1.0, n=10, depth=1):
    """Fit an AdaBoost ensemble of decision trees and score test_data.

    data, labels -- training features and class labels handed to fit()
    test_data    -- features for which class probabilities are predicted
    learn        -- learning rate for the booster
    n            -- number of estimators for the booster
    depth        -- max_depth of the decision tree given to the booster

    Returns the fitted booster together with its predicted probabilities.
    """
    # The tree is handed over positionally, exactly as a single leading argument.
    weak_learner = DecisionTreeClassifier(max_depth=depth)
    booster = AdaBoostClassifier(weak_learner, learning_rate=learn, n_estimators=n)

    booster.fit(data, labels)
    probabilities = booster.predict_proba(test_data)
    return booster, probabilities

def find_learning_rate(data, labels, dev_data, dev_labels, learn_values):
    """Find optimal learning rate in an AdaBoost model on a decision tree.

    For every value in learn_values, a model is trained with BoostedTree (using
    its default number of estimators and tree depth) and the log loss of its
    predictions on dev_data is printed.  Nothing is returned.

    Note that this cannot be used on test data from Kaggle
    because we do not have labels for that data.  This function is intended to only be used
    in the development stage with the development data.
    """
    for learn in learn_values:
        Boost, pp = BoostedTree(data, labels, dev_data, learn)
        # The columns of pp follow Boost.classes_.  Passing them explicitly keeps
        # log_loss from inferring the class set from dev_labels, which fails when
        # the development split happens to lack a class seen in training.
        logloss = metrics.log_loss(dev_labels, pp, labels=Boost.classes_)

        # Print log loss for each learning rate
        print("For learning rate =", learn, "the Log Loss score is", round(logloss, 6))
        
def find_n_learn(data, labels, dev_data, dev_labels, learn_values, n_values):
    """There is a tradeoff in boosting between learning rate and n_estimators.
    This function looks for the optimal combination of learning rate and n_estimators.

    Every (learning rate, n) pair from learn_values x n_values is trained with
    BoostedTree (using its default tree depth) and the log loss of its
    predictions on dev_data is printed.  Nothing is returned.
    """
    for learn in learn_values:
        for n in n_values:
            Boost, pp = BoostedTree(data, labels, dev_data, learn, n)
            # The columns of pp follow Boost.classes_.  Passing them explicitly
            # keeps log_loss from inferring the class set from dev_labels, which
            # fails when the development split lacks a class seen in training.
            logloss = metrics.log_loss(dev_labels, pp, labels=Boost.classes_)
            # Print log loss for each combined value of learning rate and n_estimators:
            print("For learning rate =", learn, "and n value", n, "the Log Loss score is", round(logloss, 6))
        
def find_max_depth(data, labels, dev_data, dev_labels, learn, n, depth_values):
    """Find optimal value of max_depth in the boosted model
    given optimal combination of learning rate and n_estimators

    For every value in depth_values, a model is trained with BoostedTree using
    the given learning rate and n, and the log loss of its predictions on
    dev_data is printed.  Nothing is returned.

    NOTE: this has the same name as the random-forest helper defined in the
    Random Forest section but takes two extra arguments (learn, n).  Once this
    cell runs, the name find_max_depth refers to this version.
    """
    for d in depth_values:
        BT, pp = BoostedTree(data, labels, dev_data, learn, n, d)
        # The columns of pp follow BT.classes_.  Passing them explicitly keeps
        # log_loss from inferring the class set from dev_labels, which fails when
        # the development split happens to lack a class seen in training.
        logloss = metrics.log_loss(dev_labels, pp, labels=BT.classes_)
        # Print log loss for each value
        print("For max_depth =", d, "the Log Loss score is", round(logloss, 6))


In [24]:
# First train a boosted decision tree model with default values
# (BoostedTree defaults: learn=1.0, n=10, depth=1)
# NOTE: this rebinds the notebook-level name pp to predictions for dev_data.
# The Kaggle submission cell at the end reads pp, so re-run the cell that
# predicts on test_data_all before building a submission.
Boost, pp = BoostedTree(train_data, train_labels, dev_data)
logloss = metrics.log_loss(dev_labels, pp)
print(logloss)

3.39454132531


In [23]:
# Find the optimal learning rate for default value of n=10
# (tree depth also stays at the BoostedTree default of 1)
learn_values = [.0001, .001, .01, .05, .1, .5, 1.0]
find_learning_rate(train_data, train_labels, dev_data, dev_labels, learn_values)

For learning rate = 0.0001 the Log Loss score is 2.647279
For learning rate = 0.001 the Log Loss score is 2.647272
For learning rate = 0.01 the Log Loss score is 2.644091
For learning rate = 0.05 the Log Loss score is 2.658273
For learning rate = 0.1 the Log Loss score is 2.712374
For learning rate = 0.5 the Log Loss score is 3.183055
For learning rate = 1 the Log Loss score is 3.394541


In [40]:
# There is a trade-off between learning rate and values of n, while max_depth=1,
# so iterate through learning rates with values of n higher than the default n=10
# (every learning rate is paired with every n: 5 x 3 = 15 models are trained)
n_values = [50, 100, 250]
learn_values = [.0001, .001, .01, .05, .1]
find_n_learn(train_data, train_labels, dev_data, dev_labels, learn_values, n_values)

For learning rate = 0.0001 and n value 50 the Log Loss score is 2.647274
For learning rate = 0.0001 and n value 100 the Log Loss score is 2.647273
For learning rate = 0.0001 and n value 250 the Log Loss score is 2.647309
For learning rate = 0.001 and n value 50 the Log Loss score is 2.647485
For learning rate = 0.001 and n value 100 the Log Loss score is 2.643299
For learning rate = 0.001 and n value 250 the Log Loss score is 2.644949
For learning rate = 0.01 and n value 50 the Log Loss score is 2.662126
For learning rate = 0.01 and n value 100 the Log Loss score is 2.721616
For learning rate = 0.01 and n value 250 the Log Loss score is 2.941801
For learning rate = 0.05 and n value 50 the Log Loss score is 2.939409
For learning rate = 0.05 and n value 100 the Log Loss score is 3.189014
For learning rate = 0.05 and n value 250 the Log Loss score is 3.434238
For learning rate = 0.1 and n value 50 the Log Loss score is 3.187824
For learning rate = 0.1 and n value 100 the Log Loss score is

In [39]:
# See if changes to max_depth improve the model with a basic, specific learning rate and n
# NOTE: this calls the boosted-tree version of find_max_depth (seven arguments),
# so the Boosted Decision Tree function cell must have been run before this one.
learn = .01
n = 10
depth_values = [3, 5, 6, 7, 8, 9]
find_max_depth(train_data, train_labels, dev_data, dev_labels, learn, n, depth_values)

For max_depth = 3 the Log Loss score is 2.589809
For max_depth = 5 the Log Loss score is 2.548867
For max_depth = 6 the Log Loss score is 2.527923
For max_depth = 7 the Log Loss score is 2.509453
For max_depth = 8 the Log Loss score is 2.504026
For max_depth = 9 the Log Loss score is 2.521608


In [41]:
# See if changes to max_depth improve the model with an optimal, specific learning rate and n
# NOTE: this calls the boosted-tree version of find_max_depth (seven arguments),
# so the Boosted Decision Tree function cell must have been run before this one.
learn = 0.001
n = 100
depth_values = [5, 6, 7, 8, 9, 10]
find_max_depth(train_data, train_labels, dev_data, dev_labels, learn, n, depth_values)

For max_depth = 5 the Log Loss score is 2.548683
For max_depth = 6 the Log Loss score is 2.527825
For max_depth = 7 the Log Loss score is 2.50858
For max_depth = 8 the Log Loss score is 2.502424
For max_depth = 9 the Log Loss score is 2.518338
For max_depth = 10 the Log Loss score is 2.567408


##### Boosted Decision Tree results:

Optimal combination, using data set as of 11/19 (weather added, binarized, normalized), resulting in log loss of 2.502 on dev data is:  
learning rate = 0.001  
n estimators = 100  
max depth =  8  
Higher values of n do not necessarily improve, and sometimes decrease, performance in boosted trees.

Lower values of n=10 and learning rate of 0.01 with max depth of 5 or 6 return similar log loss values with shorter processing time.

So on this dataset, a boosted decision tree does a bit better than logistic regression (2.54) but does not out-perform the random forest (2.40).


## Bagged Trees

In [None]:
# Set up functions for training bagged trees and finding optimal hyperparameters

# TBD, requires writing some code outside of sk learn



## Create zipped csv file of probabilities to submit to Kaggle
This code uses the predicted probabilities from the most recent model trained in this notebook, saved as the variable pp

In [9]:
# Set up predictions for submission to Kaggle 

# Column names for the submission file, one per crime category.
# NOTE(review): these are assigned to the columns of pp by position, so they are
# assumed to be in the same order as the trained model's classes_ -- verify if
# the label set ever changes.
headers = ["ARSON","ASSAULT","BAD CHECKS","BRIBERY","BURGLARY","DISORDERLY CONDUCT","DRIVING UNDER THE INFLUENCE",
           "DRUG/NARCOTIC","DRUNKENNESS","EMBEZZLEMENT","EXTORTION","FAMILY OFFENSES","FORGERY/COUNTERFEITING",
           "FRAUD","GAMBLING","KIDNAPPING","LARCENY/THEFT","LIQUOR LAWS","LOITERING","MISSING PERSON","NON-CRIMINAL",
           "OTHER OFFENSES","PORNOGRAPHY/OBSCENE MAT","PROSTITUTION","RECOVERED VEHICLE","ROBBERY","RUNAWAY",
           "SECONDARY CODES","SEX OFFENSES FORCIBLE","SEX OFFENSES NON FORCIBLE","STOLEN PROPERTY","SUICIDE",
           "SUSPICIOUS OCC","TREA","TRESPASS","VANDALISM","VEHICLE THEFT","WARRANTS","WEAPON LAWS"]

# pp is a notebook-level name that more than one cell assigns: the full-data
# random forest cell (predictions for test_data_all) and the default boosted-tree
# cell (predictions for dev_data).  Fail early, with an actionable message, if
# the pp currently in scope is not the one for the Kaggle test set.
if len(pp) != len(test_data_all):
    raise ValueError(
        "pp has %d rows but test_data_all has %d; re-run the cell that trains on "
        "train_data_all and predicts on test_data_all before building the submission"
        % (len(pp), len(test_data_all))
    )

# One row per test example, labelled 0..len(test_data_all)-1.  Passing the range
# directly avoids building a Python list with one element per row.
data = pd.DataFrame(data=pp, 
                    index=range(len(test_data_all)), 
                    columns=headers)
data.columns.name ="Id"
print(data)

Id         ARSON   ASSAULT  BAD CHECKS   BRIBERY  BURGLARY  \
0       0.000873  0.041185    0.000029  0.000063  0.013059   
1       0.000873  0.041185    0.000029  0.000063  0.013059   
2       0.000873  0.041185    0.000029  0.000063  0.013059   
3       0.000873  0.041185    0.000029  0.000063  0.013059   
4       0.000873  0.041185    0.000029  0.000063  0.013059   
5       0.000873  0.041185    0.000029  0.000063  0.013059   
6       0.000873  0.041185    0.000029  0.000063  0.013059   
7       0.000873  0.041185    0.000029  0.000063  0.013059   
8       0.000873  0.041185    0.000029  0.000063  0.013059   
9       0.000873  0.041185    0.000029  0.000063  0.013059   
10      0.000873  0.041185    0.000029  0.000063  0.013059   
11      0.000873  0.041185    0.000029  0.000063  0.013059   
12      0.000873  0.041185    0.000029  0.000063  0.013059   
13      0.000873  0.041185    0.000029  0.000063  0.013059   
14      0.000873  0.041185    0.000029  0.000063  0.013059   
15      

Create zipped csv file for Kaggle
#### Update the filename first in all lines of the following code
Add something unique after our names to avoid overwriting other submission files

In [10]:
# Write the probability table to csv; index_label puts "Id" in the header of the row-index column.
# Keep this filename in sync with the csv name used in the zip cell below.
data.to_csv('Williams_Gascoigne_Vignola_RandomForest1.csv', index_label = "Id")

In [17]:
# Put the submission csv into a new zip archive, compressed with DEFLATE.
# The with-block closes the archive even if write() raises, so a failed run
# does not leave an open file handle behind.
with zipfile.ZipFile("Williams_Gascoigne_Vignola_RandomForest1.zip", "w") as zip_probs:
    zip_probs.write("Williams_Gascoigne_Vignola_RandomForest1.csv", compress_type=zipfile.ZIP_DEFLATED)

### Results from previous datasets and/or model parameters

Random forest model on dataset as of 11/18 (weather added, binarized, normalized) with max_depth set to 15 and n_estimators set to 1000 returned log loss on development data of 2.404.