In [11]:
from load_and_reduce import load_and_reduce
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import time 
from sklearn.model_selection import cross_val_score, GridSearchCV

In [2]:
def run_and_time_rf(X_train,X_test,y_train,y_test, n_estimators = 100, max_depth = None, min_samples_split = 2, n_jobs = None):
    
    print("starting train")
    start_train = time.time()

    forest = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, min_samples_split = min_samples_split, n_jobs = n_jobs)
    
    #forest = RandomForestRegressor()
    forest.fit(X_train,y_train)

    end_train = time.time()

    train_time = (end_train - start_train)

    print("starting predictions")
    start_pred = time.time()

    forest_y_pred = forest.predict(X_test)

    end_pred = time.time()

    prediction_time = (end_pred - start_pred)

    forest_y_pred_train = forest.predict(X_train)
    
    print("R^2 on training data: %.3f" %(r2_score(y_train,forest_y_pred_train)))
    print("R^2 on test data: %.3f" %(r2_score(y_test,forest_y_pred)))
    print("MSE: %.3f"% mean_squared_error(y_test,forest_y_pred))
    print("train time (s) =", train_time)
    print("prediction time (s) =", prediction_time)
    
    

In [9]:
#from https://medium.datadriveninvestor.com/random-forest-regression-9871bc9a25eb

def rfr_model(X, y):
    # Perform Grid-Search
    gsc = GridSearchCV(
    estimator=RandomForestRegressor(),
        param_grid={
            
            'n_estimators': (50, 200,1000),
            'min_samples_split': (2, 30, 100),
            'max_depth': (None, 5, 15, 30),
        },
        cv=5, scoring='neg_mean_squared_error', verbose=1)
    
    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_
  

    return best_params

# Load and Process Data

In [3]:

#load data
intake_path = "Austin_Animal_Center_Intakes.csv"
outcome_path = "Austin_Animal_Center_Outcomes.csv"

start_load = time.time()

X, y = load_and_reduce(intake_path, outcome_path, coding = "onehot", scale = True, DimRed = "None")

end_load = time.time()

time_load = end_load - start_load

print("Data loading and processing time =", time_load)

Begin Feature Engineering

Starting Cardinality of Breed 2419
Starting Cardinality of Color 562

Cardinality of Breed After Removing Mix and / 246
Cardinality of Color After Removing / 58

Cardinality of Breed After Boiling Down Rare Breeds 105
Cardinality of Color After Boiling Down Rare Colors 41

Total Dimensions of X before Encoding (103049, 12)

Encoding Columns ['Intake Type', 'Intake Condition', 'Animal Type', 'Outcome Type', 'Sex upon Outcome', 'Sex upon Intake', 'Breed', 'Color']
Droping Columns []

Intake Type Cardinality -  5

Intake Condition Cardinality -  10

Animal Type Cardinality -  2

Outcome Type Cardinality -  9

Sex upon Outcome Cardinality -  5

Sex upon Intake Cardinality -  5

Breed Cardinality -  105

Color Cardinality -  41

Scaling ['Age (Days)']
Data loading and processing time = 14.420130491256714


## Train/Test Split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
 X, y, test_size=0.33, random_state=42)

# Do Grid Search

In [10]:
rfr_model(X,y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 1013.6min finished


{'max_depth': 15, 'min_samples_split': 100, 'n_estimators': 1000}

# Run Models

## Random Forest With Parameters From Grid Search

In [12]:
run_and_time_rf(X_train, X_test, y_train, y_test, n_estimators = 1000, max_depth = 15, min_samples_split = 100)

starting train
starting predictions
R^2 on training data: 0.299
R^2 on test data: 0.217
MSE: 1502.673
train time (s) = 775.9548621177673
prediction time (s) = 2.907144784927368


In [None]:
run_and_time_rf(X_train, X_test, y_train, y_test, n_estimators = 100)

starting train
