# Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

# Import Data

In [2]:
# Load the full modelling table; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='./datasets/working_data/model_data_full.csv')

# Select Target

In [3]:
# Restrict to rows whose time_in_shelter exceeds 7 (units not shown here,
# presumably days; TODO confirm), then separate the target from the predictors.
long_stay = data['time_in_shelter'] > 7
stays = data[long_stay]

y = stays['at_risk']
X = stays.drop(columns=['at_risk'])

# Train/Test Models

Select features

In [4]:
# Candidate numeric predictors. Original experiment notes: the full list was
# tagged "82.5", and the same list without the two weight columns was tagged
# "no change" (what was being measured is not recorded here).
numeric_cols = [
    'age_in',
    'obey',
    'height_low_inches',
    'height_high_inches',
    'weight_low_lbs',
    'weight_high_lbs',
]

In [5]:
# Columns passed to pd.get_dummies as a DataFrame slice when the feature matrix is built.
cat_features = ["age"]

In [6]:
# One-hot encode each active feature group on its own, then join the encoded
# frames side by side.
# Feature groups tried and currently switched off: X[numeric_cols], breed_2,
# primary_color (prefix 'color') and intake_type (prefix 'intake').
age_dummies = pd.get_dummies(X[cat_features])
breed_dummies = pd.get_dummies(X['breed_1'])  # no prefix: columns are named after the raw breed_1 values
condition_dummies = pd.get_dummies(X['condition'], prefix='condition')

X_dum = pd.concat([age_dummies, breed_dummies, condition_dummies], axis=1)

Split data

In [7]:
# Stratify on the target so both splits keep the same class proportions; the
# fixed seed makes the split repeatable. The split ratio is left at the
# library default.
X_train, X_test, y_train, y_test = train_test_split(
    X_dum,
    y,
    stratify=y,
    random_state=123,
)

Baseline

In [8]:
# Class proportions of the target; the majority-class share is the baseline accuracy to beat.
y.value_counts(normalize=True)

1    0.540995
0    0.459005
Name: at_risk, dtype: float64

Train model

In [9]:
# Random forest of 300 trees. min_samples_split=10 keeps nodes with fewer than
# 10 samples from being split further, and the fixed seed makes the fit repeatable.
rfc = RandomForestClassifier(
    n_estimators=300,
    max_depth=200,
    min_samples_split=10,
    random_state=123,
)
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=200, min_samples_split=10, n_estimators=300,
                       random_state=123)

We found some signal!!

In [10]:
# Mean accuracy on the training split first, then on the held-out test split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(rfc.score(split_X, split_y))

0.6570870214967237
0.6324137931034483


Found some success with the random forest classifier: test accuracy is roughly 9 percentage points above the 54.1% baseline. Using a tree-based model makes sense given the breakdown of our data, as it is mostly sparse, one-hot-encoded categorical features. Regularized trees can more effectively isolate the features with signal than a linear model, even one with L1 regularization.

Next, we will build a stripped-down version of the model for production purposes and try to improve model performance with more advanced tree-based models.

[Run Next](https://github.com/gwoodstock/project4/blob/main/8_modela_rfc_stripped.ipynb): Stripped Random Forest Classifier.