In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.metrics import mean_squared_error

In [2]:
shelters = pd.read_csv('./datasets/cleaned_data/shelters_with_stats.csv')

In [3]:
shelters.head(2)

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,...,breed_2,pure,obey,reps_lower,reps_upper,height_low_inches,height_high_inches,weight_low_lbs,weight_high_lbs,time_in_shelter
0,A047759,dachshund,Tricolor,1080864000.0,1.0,1396454000.0,10.0,0.0,Austin (TX),surrender,...,dachshund,1.0,0.5,26.0,40.0,7.0,10.0,16.0,32.0,429420.0
1,A134067,shetland sheepdog,Brown/White,876960000.0,1.0,1384593000.0,16.0,0.0,12034 Research Blvd in Austin (TX),public_assist,...,shetland sheepdog,1.0,0.95,1.0,4.0,17.941176,20.908497,42.934641,57.522876,10320.0


In [4]:
shelters.columns

Index(['id', 'breed', 'color', 'dob', 'sex', 'date_in', 'age_in', 'intact_in',
       'location', 'intake_type', 'condition', 'date_out', 'age_out',
       'intact_out', 'outcome', 'age', 'primary_color', 'secondary_color',
       'breed_1', 'breed_2', 'pure', 'obey', 'reps_lower', 'reps_upper',
       'height_low_inches', 'height_high_inches', 'weight_low_lbs',
       'weight_high_lbs', 'time_in_shelter'],
      dtype='object')

In [5]:
shelters['intake_type'].value_counts()

stray            67928
surrender        25501
public_assist    10538
abandoned          339
euth_request       188
Name: intake_type, dtype: int64

# Remove these two lines after they've been added to an earlier file?

In [6]:
# filter out deceased and missing outcomes
shelters = shelters[(shelters['outcome'] != 'deceased') & (shelters['outcome'] != 'missing')]

In [7]:
# create new column for adoption
# 1 = adopted, 0 = owner_return or transfer
shelters['adopted'] = shelters['outcome'].map({'adopted':1, 'owner_return':0, 'transfer':0})

# Logistic Regression

Target is 'adopted'

1=adopted, 0=not adopted

In [8]:
X = shelters[
    [
        "pure",
        "obey",
        "reps_lower",
        "reps_upper",
        "height_low_inches",
        "height_high_inches",
        "weight_low_lbs",
        "weight_high_lbs",
        "age_in",
        # 'age_out',
        "sex",
        "intake_type",
        "intact_in",
    ]
]

y = shelters["adopted"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101, stratify=y)


In [9]:
y.value_counts(normalize=True)

0    0.518431
1    0.481569
Name: adopted, dtype: float64

In [10]:
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore', sparse=False), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    verbose_feature_names_out=False
)

pipe = make_pipeline(ct, StandardScaler(), PolynomialFeatures(), LogisticRegression(max_iter=10_000))

In [11]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'columntransformer', 'standardscaler', 'polynomialfeatures', 'logisticregression', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransformer__transformer_weights', 'columntransformer__transformers', 'columntransformer__verbose', 'columntransformer__verbose_feature_names_out', 'columntransformer__onehotencoder', 'columntransformer__onehotencoder__categories', 'columntransformer__onehotencoder__drop', 'columntransformer__onehotencoder__dtype', 'columntransformer__onehotencoder__handle_unknown', 'columntransformer__onehotencoder__sparse', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'polynomialfeatures__degree', 'polynomialfeatures__include_bias', 'polynomialfeatures__interaction_only', 'polynomialfeatures__order', 'logisticregression__C', 'logisticregression__class_weight', 'logisticregression__dual', 'logisticregression__fit_intercept', 'logisticregre

In [12]:
params = {
    # 'logisticregression__C': [.1, 1],
    # 'polynomialfeatures__degree': [2, 3],
    # 'polynomialfeatures__interaction_only': [True, False]
}

gs_logreg = GridSearchCV(pipe, params)
gs_logreg.fit(X_train, y_train)
gs_logreg.best_params_

{}

In [13]:
preds = gs_logreg.predict(X_test)

print('Train Accuracy: ', gs_logreg.score(X_train, y_train))
print(' Test Accuracy: ', gs_logreg.score(X_test, y_test))

Train Accuracy:  0.6709062626632332
 Test Accuracy:  0.6692678718481628


#### Features used:

"pure",
"obey",
"reps_lower",
"reps_upper",
"height_low_inches",
"height_high_inches",
"weight_low_lbs",
"weight_high_lbs",
"age_in",
"sex",
"intake_type",
"intact_in"

#### Results:

These features + default params resulted in the score below for LogisticRegression.

Train Accuracy: 0.6709062626632332

Test Accuracy: 0.6692678718481628

Baseline: 0.518431


In [14]:
coefs = gs_logreg.best_estimator_.named_steps['logisticregression'].coef_.squeeze()
cols = gs_logreg.best_estimator_.named_steps['columntransformer'].get_feature_names_out()
# cols = X_test.columns
pd.DataFrame(zip(cols, coefs)).sort_values(1)

Unnamed: 0,0,1
10,height_high_inches,-0.537661
14,sex,-0.468589
9,height_low_inches,-0.104804
4,intake_type_surrender,-0.095037
15,intact_in,-0.09477
0,intake_type_abandoned,-0.055364
12,weight_high_lbs,-0.035166
6,obey,-0.034195
3,intake_type_stray,-0.02136
13,age_in,-0.017978


# Linear Regression

Target is 'time_in_shelter'

In [16]:
X = shelters[
    [
        "pure",
        "obey",
        "reps_lower",
        "reps_upper",
        "height_low_inches",
        "height_high_inches",
        "weight_low_lbs",
        "weight_high_lbs",
        'age_in',
        'age_out',
        "sex"
    ]
]

y = shelters['time_in_shelter']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101)

In [17]:
y.mean()/86400

116.51874286506364

In [18]:
# ct = make_column_transformer(
#     (OneHotEncoder(handle_unknown='ignore', sparse=False), make_column_selector(dtype_include=object)),
#     remainder='passthrough',
#     verbose_feature_names_out=False
# )

pipe = make_pipeline(StandardScaler(), PolynomialFeatures(), LinearRegression())

In [19]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'polynomialfeatures', 'linearregression', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'polynomialfeatures__degree', 'polynomialfeatures__include_bias', 'polynomialfeatures__interaction_only', 'polynomialfeatures__order', 'linearregression__copy_X', 'linearregression__fit_intercept', 'linearregression__n_jobs', 'linearregression__normalize', 'linearregression__positive'])

In [20]:
params = {
    # 'polynomialfeatures__degree': [2, 3],
    # 'polynomialfeatures__interaction_only': [True, False]
}

gs_linreg = GridSearchCV(pipe, params)
gs_linreg.fit(X_train, y_train)
gs_linreg.best_params_

{}

In [21]:
preds = gs_linreg.predict(X_test)

print('Train R2: ', gs_linreg.score(X_train, y_train))
print(' Test R2: ', gs_linreg.score(X_test, y_test))
print('    RMSE: ', mean_squared_error(y_test, preds, squared=False)/86400)

Train R2:  0.758828467755256
 Test R2:  0.7621540321434512
    RMSE:  138.24797968723152


In [22]:
coefs = gs_linreg.best_estimator_.named_steps['linearregression'].coef_
cols = X_test.columns
pd.DataFrame(zip(cols, coefs)).sort_values(1)

Unnamed: 0,0,1
4,height_low_inches,-12942600.0
2,reps_lower,-7782757.0
8,age_in,-1631688.0
6,weight_low_lbs,-1360169.0
9,age_out,166400.0
7,weight_high_lbs,1905104.0
5,height_high_inches,2065270.0
10,sex,2482176.0
3,reps_upper,4030090.0
0,pure,8033234000.0
