# Import Libraries

In [99]:
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Feature Selection

In [100]:
# Column holding the raw length of stay; the binary label is derived from it later.
target = [
    'time_in_shelter',
]

# Descriptive columns kept in the modelling table.
# NOTE(review): 'primary_color' is loaded and exported, but the one-hot
# encoding step further down only uses age / breed / condition.
features = [
    'age',
    'breed_1',
    'condition',
    'primary_color',
]

# Columns needed by the row filters ('condition' is also listed as a feature).
filters = [
    'intake_type',
    'condition',
    'outcome',
]

# Import Data

In [101]:
# Read only the columns that are used downstream (features, raw target and
# the columns the row filters rely on) from the cleaned shelter dataset.
columns_to_load = features + target + filters
data = pd.read_csv(
    './datasets/cleaned_data/shelters_with_stats.csv',
    usecols=columns_to_load,
)

In [102]:
# Preview the first two rows to confirm which columns were loaded and what
# the raw 'time_in_shelter' values look like before any conversion.
data.head(2)

Unnamed: 0,intake_type,condition,outcome,age,primary_color,breed_1,time_in_shelter
0,surrender,normal,transfer,Senior,tricolor,dachshund,4 days 23:17:00
1,public_assist,injured,return_owner,Senior,brown,shetland sheepdog,0 days 02:52:00


# Filters

In [103]:
# Shorter column aliases used by every later cell.
column_aliases = {
    'condition': 'cond',
    'breed_1': 'breed',
    'time_in_shelter': 'time',
    'primary_color': 'color',
}
data = data.rename(columns=column_aliases)

In [104]:
# Parse the duration strings (e.g. '4 days 23:17:00') and keep only the
# whole-day component, so 'time' becomes a day count.
# NOTE(review): this cell is not idempotent; re-running it after 'time' is
# already numeric would re-parse the day counts as durations. Confirm the
# notebook is always run top to bottom on a fresh kernel.
stay_duration = pd.to_timedelta(data['time'])
data['time'] = stay_duration.dt.days

In [105]:
# Row filters, each expressed as its own boolean mask and combined with AND.
min_days, max_days = 7, 200
excluded_outcomes = ['return_owner', 'missing', 'disposal', 'transfer']

# Stay length must satisfy min_days <= time < max_days.
stay_in_range = (data['time'] >= min_days) & (data['time'] < max_days)
# Drop rows whose intake type is 'euth_request'.
not_euth_request = data['intake_type'] != 'euth_request'
# Drop rows whose outcome is one of the excluded values.
outcome_kept = ~data['outcome'].isin(excluded_outcomes)
# Drop rows whose condition is 'neonatal'.
not_neonatal = data['cond'] != 'neonatal'

data = data[stay_in_range & not_euth_request & outcome_kept & not_neonatal]

In [106]:
# Merge sparse condition labels into broader groups:
# 'pregnant' -> 'nursing'; 'medical' and 'sick' -> 'injured'.
condition_merges = {
    'pregnant': 'nursing',
    'medical': 'injured',
    'sick': 'injured',
}
data['cond'] = data['cond'].replace(condition_merges)

# Binary label: 1 when the stay is longer than 20 days, otherwise 0.
stayed_over_20_days = data['time'] > 20
data['target'] = np.where(stayed_over_20_days, 1, 0)

# The filter-only columns and the raw day count are no longer needed; after
# removing them, discard every row that still has a missing value.
# NOTE(review): this also drops rows missing only 'color', which the model's
# encoder does not use. Confirm that is intended.
data = data.drop(columns=['intake_type', 'outcome', 'time']).dropna()

# Export Data

In [107]:
# Persist the filtered, labelled table (without the DataFrame index).
model_data_path = './datasets/cleaned_data/model_data.csv'
data.to_csv(model_data_path, index=False)

# Model

In [108]:
# Split the table into the predictor columns (X) and the binary label (y).
X = data.drop(columns=['target'])
y = data['target']

In [109]:
# One-hot encode the categorical predictors used by the model.
#
# handle_unknown='ignore' matters because this fitted encoder is pickled
# further down for reuse: with the default ('error'), transforming a row that
# contains a category not seen during fitting (e.g. a new breed) would raise.
# With 'ignore', such a category is encoded as all zeros for that feature.
# The training matrix is unchanged, since the encoder is fitted and applied
# on the same rows here.
#
# NOTE: X is rebound from a DataFrame to the encoded matrix, so this cell
# cannot be re-run on its own without first re-running the cell that builds X.
encoded_columns = ['age', 'breed', 'cond']
ohe = OneHotEncoder(handle_unknown='ignore').fit(X[encoded_columns])
X = ohe.transform(X[encoded_columns])

In [110]:
# Hold out a test set. The split is stratified on y so both parts keep the
# same class balance, and seeded so it is reproducible. No test_size is
# passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
    stratify=y,
)

In [111]:
# Random forest hyperparameters, gathered in one place for easy tuning.
forest_params = {
    'n_estimators': 300,
    'max_depth': 200,
    'random_state': 123,
    'min_samples_split': 10,
}
rfc = RandomForestClassifier(**forest_params)

# Fit on the training split; left as the last expression so the notebook
# displays the fitted estimator.
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=200, min_samples_split=10, n_estimators=300,
                       random_state=123)

# Score

In [112]:
# Accuracy on the training split first, then on the held-out test split,
# printed one per line.
train_accuracy = rfc.score(X_train, y_train)
test_accuracy = rfc.score(X_test, y_test)
print(train_accuracy, test_accuracy, sep='\n')

0.6531504065040651
0.6387195121951219


In [113]:
# Serialize the fitted classifier and the fitted encoder so they can be
# loaded together later; the encoder must accompany the model because the
# model was trained on its output.
artifacts = {
    "./models/model_rfc.pkl": rfc,
    "./models/encode_data.pkl": ohe,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as file:
        pickle.dump(artifact, file)