This notebook trains a random forest model and several different decision tree models.

In [1]:
import numpy as np
import pandas as pd
import zipfile
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier 

In [None]:
# Unzip data files into the "csv" subdirectory
# (unless this has already been done since running the Data Set Up notebook)

# **IMPORTANT**  This will overwrite existing files in the "csv" folder in your local repo
# with the most recent data files from the data.zip file

# Each archive is opened in a `with` block so that its file handle is closed
# even if extraction fails part-way through.

# Unzip 80% training data
with zipfile.ZipFile("data_subset.zip", "r") as unzip_training_data:
    unzip_training_data.extractall()

# Unzip development and training data
with zipfile.ZipFile("testing.zip", "r") as unzip_test_data:
    unzip_test_data.extractall()

# Unzip full set of training data for creating predictions to submit to Kaggle
with zipfile.ZipFile("data.zip", "r") as unzip_all_data:
    unzip_all_data.extractall()

In [2]:
# Read the training and development csv files into numpy arrays for testing on development data.
# Feature files use loadtxt's default numeric parsing; label files are read as strings.
train_data = np.loadtxt("csv/train_data.csv", delimiter=",")
train_labels = np.loadtxt("csv/train_labels.csv", delimiter=",", dtype=str)
dev_data = np.loadtxt("csv/dev_data.csv", delimiter=",")
dev_labels = np.loadtxt("csv/dev_labels.csv", delimiter=",", dtype=str)

In [None]:
# Read the full csv files into numpy arrays for creating predictions to submit to Kaggle.
# Feature files use loadtxt's default numeric parsing; the label file is read as strings.
train_data_all = np.loadtxt("csv/train_data_all.csv", delimiter=",")
train_labels_all = np.loadtxt("csv/train_labels_all.csv", delimiter=",", dtype=str)
test_data_all = np.loadtxt("csv/test_data_all.csv", delimiter=",")

In [4]:
# Report the shape of each loaded array, to compare before and after csv conversion
for _message, _array in (
    ("train_data shape is", train_data),
    ("train_labels shape is", train_labels),
    ("dev_data shape is", dev_data),
    ("dev_labels shape is", dev_labels),
):
    print(_message, _array.shape)

train_data shape is (702439, 58)
train_labels shape is (702439,)
dev_data shape is (175610, 58)
dev_labels shape is (175610,)


In [None]:
# Report the shape of each array loaded from the full data set
for _message, _array in (
    ("train_data_all shape is", train_data_all),
    ("train_labels_all shape is", train_labels_all),
    ("test_data_all shape is", test_data_all),
):
    print(_message, _array.shape)

In [None]:
# IF there are additional changes to make to the data for this model
# that would be easier to do in pandas, uncomment and run this code. 
# This model works the same whether the data is in numpy or pandas, so presumably other models do too

#train_data = pd.DataFrame(train_data)
#train_labels = pd.DataFrame(train_labels)
#dev_data = pd.DataFrame(dev_data)
#dev_labels = pd.DataFrame(dev_labels)
#train_data_all = pd.DataFrame(train_data_all)
#train_labels_all = pd.DataFrame(train_labels_all)
#test_data_all = pd.DataFrame(test_data_all)

## Random forest

In [9]:
# Set up functions for training random forest and finding optimal number of estimators

def TrainRF(data, labels, test_data, n=10, random_state=None):
    """Train a random forest and predict class probabilities for test_data.

    data and labels are the training features and training labels passed to fit().
    test_data is the feature matrix for which class probabilities are predicted.
    n is the number of random decision trees to create (n_estimators).
    random_state is an optional seed forwarded to RandomForestClassifier.  The
    default of None keeps the original unseeded behaviour; pass an integer to
    make repeated runs give the same forest.

    Returns the fitted model and the predicted probabilities for test_data.
    """
    RF = RandomForestClassifier(n_estimators=n, random_state=random_state)
    RF.fit(data, labels)
    pp = RF.predict_proba(test_data)
    return RF, pp

def find_n(data, labels, dev_data, dev_labels, n_values):
    """Compare random forest sizes by printing F1 and log loss for each n.

    For every n in n_values a forest is trained on data/labels with TrainRF,
    scored on dev_data, and one line with the weighted F1 score and the
    log loss is printed.  Nothing is returned.

    Note that this cannot be used on test data from Kaggle
    because we do not have labels for that data.  This function is intended to only be used
    in the development stage with the development data.
    """
    for n in n_values:
        RF, pp = TrainRF(data, labels, dev_data, n)

        # The forest's predict() is the argmax of predict_proba(), so reuse pp
        # rather than sending dev_data through every tree a second time.
        predictions = RF.classes_.take(np.argmax(pp, axis=1), axis=0)
        f1 = metrics.f1_score(dev_labels, predictions, average="weighted")

        # The columns of pp are ordered like RF.classes_.  Passing that ordering
        # explicitly keeps the log loss defined when a class seen in training
        # does not occur in dev_labels.
        logloss = metrics.log_loss(dev_labels, pp, labels=RF.classes_)

        # Print F1 score and log loss for each value of n
        print("For n =", n, "the F1 score is", round(f1, 6), "and the Log Loss score is", round(logloss, 6))
    print("\n")

In [10]:
# Train model with a default value of n=10
RF, pp = TrainRF(train_data, train_labels, dev_data)
# The columns of pp are ordered like RF.classes_; passing that ordering keeps the
# log loss defined even if a class seen in training is missing from dev_labels.
logloss = metrics.log_loss(dev_labels, pp, labels=RF.classes_)
print(logloss)

13.7449184017


In [None]:
# Compare forest sizes: train on the 80% training data, score on the development data
n_values = [10, 100]
find_n(
    data=train_data,
    labels=train_labels,
    dev_data=dev_data,
    dev_labels=dev_labels,
    n_values=n_values,
)

  'precision', 'predicted', average, warn_for)


For n = 10 the F1 score is 0.260758 and the Log Loss score is 13.860644
For n = 100 the F1 score is 0.278999 and the Log Loss score is 5.154798


In [None]:
# Compare forest sizes: train on the 80% training data, score on the development data
# The larger values of n are run in their own cell
n_values = [500, 1000]
find_n(
    data=train_data,
    labels=train_labels,
    dev_data=dev_data,
    dev_labels=dev_labels,
    n_values=n_values,
)

## Bagged trees

In [None]:
# Set up functions for training bagged trees and finding optimal number of estimators

def TrainRF(data, labels, test_data, n=10, random_state=None):
    """Train a random forest and predict class probabilities for test_data.

    NOTE(review): this cell sits under the "Bagged trees" heading but re-defines
    TrainRF, replacing the definition from the random forest section, and it
    still builds a RandomForestClassifier.  Confirm whether a bagging
    estimator was intended here.

    data and labels are the training features and training labels passed to fit().
    test_data is the feature matrix for which class probabilities are predicted.
    n is the number of random decision trees to create (n_estimators).
    random_state is an optional seed forwarded to RandomForestClassifier.  The
    default of None keeps the original unseeded behaviour; pass an integer to
    make repeated runs give the same forest.

    Returns the fitted model and the predicted probabilities for test_data.
    """
    RF = RandomForestClassifier(n_estimators=n, random_state=random_state)
    RF.fit(data, labels)
    pp = RF.predict_proba(test_data)
    return RF, pp

def find_n(data, labels, dev_data, dev_labels, n_values):
    """Compare model sizes by printing F1 and log loss for each n.

    NOTE(review): this cell sits under the "Bagged trees" heading but re-defines
    find_n, replacing the definition from the random forest section.  Confirm
    whether a separate bagged-trees variant was intended.

    For every n in n_values a model is trained on data/labels with TrainRF,
    scored on dev_data, and one line with the weighted F1 score and the
    log loss is printed.  Nothing is returned.

    Note that this cannot be used on test data from Kaggle
    because we do not have labels for that data.  This function is intended to only be used
    in the development stage with the development data.
    """
    for n in n_values:
        RF, pp = TrainRF(data, labels, dev_data, n)

        # The forest's predict() is the argmax of predict_proba(), so reuse pp
        # rather than sending dev_data through every tree a second time.
        predictions = RF.classes_.take(np.argmax(pp, axis=1), axis=0)
        f1 = metrics.f1_score(dev_labels, predictions, average="weighted")

        # The columns of pp are ordered like RF.classes_.  Passing that ordering
        # explicitly keeps the log loss defined when a class seen in training
        # does not occur in dev_labels.
        logloss = metrics.log_loss(dev_labels, pp, labels=RF.classes_)

        # Print F1 score and log loss for each value of n
        print("For n =", n, "the F1 score is", round(f1, 6), "and the Log Loss score is", round(logloss, 6))
    print("\n")