# Import Libraries

In [2]:
# Import our libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import from sklearn.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# PCA
from sklearn.decomposition import PCA


# Helper Functions

In [3]:
def quick_look(x):
    print(data[x].value_counts())
    print(f'\nMissing Values: {data[x].isna().sum()}')

# Import Data

In [4]:
data = pd.read_csv('./datasets/cleaned_data/shelters_with_stats.csv')

In [5]:
data.head()

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,...,breed_2,pure,obey,reps_lower,reps_upper,height_low_inches,height_high_inches,weight_low_lbs,weight_high_lbs,time_in_shelter
0,A047759,dachshund,Tricolor,1080864000.0,1.0,1396454000.0,10.0,0.0,Austin (TX),surrender,...,dachshund,1.0,0.5,26.0,40.0,7.0,10.0,16.0,32.0,429420.0
1,A134067,shetland sheepdog,Brown/White,876960000.0,1.0,1384593000.0,16.0,0.0,12034 Research Blvd in Austin (TX),public_assist,...,shetland sheepdog,1.0,0.95,1.0,4.0,17.941176,20.908497,42.934641,57.522876,10320.0
2,A141142,labrador retriever/pit bull,Black/White,896659200.0,0.0,1384613000.0,15.0,0.0,Austin (TX),stray,...,pit bull,0.0,0.825,8.5,14.5,19.0,21.5,47.5,65.0,75240.0
3,A163459,miniature schnauzer,Black/Gray,940291200.0,0.0,1415978000.0,15.0,1.0,Ih 35 And 41St St in Austin (TX),stray,...,miniature schnauzer,1.0,0.85,5.0,15.0,17.941176,20.908497,42.934641,57.522876,15420.0
4,A165752,lhasa apso,Brown/White,934934400.0,1.0,1410780000.0,15.0,0.0,Gatlin Gun Rd And Brodie in Austin (TX),stray,...,lhasa apso,1.0,0.3,41.0,80.0,17.941176,20.908497,42.934641,57.522876,18420.0


In [6]:
data.columns

Index(['id', 'breed', 'color', 'dob', 'sex', 'date_in', 'age_in', 'intact_in',
       'location', 'intake_type', 'condition', 'date_out', 'age_out',
       'intact_out', 'outcome', 'age', 'primary_color', 'secondary_color',
       'breed_1', 'breed_2', 'pure', 'obey', 'reps_lower', 'reps_upper',
       'height_low_inches', 'height_high_inches', 'weight_low_lbs',
       'weight_high_lbs', 'time_in_shelter'],
      dtype='object')

In [7]:
data.drop(columns=['id', 'color', 'breed', 'dob', 'date_in', 'intact_in', 'location', 'date_out', 'age_out', 
                   'outcome', 'age_out', 'age', 'secondary_color', 'breed_2'], inplace=True)


In [8]:
data.head()

Unnamed: 0,sex,age_in,intake_type,condition,intact_out,primary_color,breed_1,pure,obey,reps_lower,reps_upper,height_low_inches,height_high_inches,weight_low_lbs,weight_high_lbs,time_in_shelter
0,1.0,10.0,surrender,wnl,0.0,tricolor,dachshund,1.0,0.5,26.0,40.0,7.0,10.0,16.0,32.0,429420.0
1,1.0,16.0,public_assist,med_attn,0.0,brown,shetland sheepdog,1.0,0.95,1.0,4.0,17.941176,20.908497,42.934641,57.522876,10320.0
2,0.0,15.0,stray,aged,0.0,black,labrador retriever,0.0,0.825,8.5,14.5,19.0,21.5,47.5,65.0,75240.0
3,0.0,15.0,stray,wnl,1.0,black,miniature schnauzer,1.0,0.85,5.0,15.0,17.941176,20.908497,42.934641,57.522876,15420.0
4,1.0,15.0,stray,wnl,0.0,brown,lhasa apso,1.0,0.3,41.0,80.0,17.941176,20.908497,42.934641,57.522876,18420.0


In [9]:
quick_look('sex')

1.0    58049
0.0    46445
Name: sex, dtype: int64

Missing Values: 0


In [10]:
y = data['time_in_shelter']
X = data.drop(columns=['time_in_shelter'])

In [11]:
y.head()

0    429420.0
1     10320.0
2     75240.0
3     15420.0
4     18420.0
Name: time_in_shelter, dtype: float64

In [12]:
X.head()

Unnamed: 0,sex,age_in,intake_type,condition,intact_out,primary_color,breed_1,pure,obey,reps_lower,reps_upper,height_low_inches,height_high_inches,weight_low_lbs,weight_high_lbs
0,1.0,10.0,surrender,wnl,0.0,tricolor,dachshund,1.0,0.5,26.0,40.0,7.0,10.0,16.0,32.0
1,1.0,16.0,public_assist,med_attn,0.0,brown,shetland sheepdog,1.0,0.95,1.0,4.0,17.941176,20.908497,42.934641,57.522876
2,0.0,15.0,stray,aged,0.0,black,labrador retriever,0.0,0.825,8.5,14.5,19.0,21.5,47.5,65.0
3,0.0,15.0,stray,wnl,1.0,black,miniature schnauzer,1.0,0.85,5.0,15.0,17.941176,20.908497,42.934641,57.522876
4,1.0,15.0,stray,wnl,0.0,brown,lhasa apso,1.0,0.3,41.0,80.0,17.941176,20.908497,42.934641,57.522876


Baseline in days

In [13]:
(y/86400).mean()

115.26713810521883

In [14]:
quick_look('intake_type')

stray            67928
surrender        25501
public_assist    10538
abandoned          339
euth_request       188
Name: intake_type, dtype: int64

Missing Values: 0


In [15]:
quick_look('condition')

wnl             95990
med_attn         6378
preg_nursing     1388
aged              470
other             155
behavioral         60
underage           53
Name: condition, dtype: int64

Missing Values: 0


In [16]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104494 entries, 0 to 104493
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   sex                 104494 non-null  float64
 1   age_in              104494 non-null  float64
 2   intake_type         104494 non-null  object 
 3   condition           104494 non-null  object 
 4   intact_out          104494 non-null  float64
 5   primary_color       104494 non-null  object 
 6   breed_1             104494 non-null  object 
 7   pure                104494 non-null  float64
 8   obey                104494 non-null  float64
 9   reps_lower          104494 non-null  float64
 10  reps_upper          104494 non-null  float64
 11  height_low_inches   104494 non-null  float64
 12  height_high_inches  104494 non-null  float64
 13  weight_low_lbs      104494 non-null  float64
 14  weight_high_lbs     104494 non-null  float64
dtypes: float64(11), object(4)
memory u

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234)

In [18]:
pd.get_dummies(X['breed_1'])

Unnamed: 0,affenpinscher,afghan hound,airedale terrier,akita,alaskan malamute,american eskimo,american foxhound,american pit bull terrier,american staffordshire terrier,anatolian sheepdog,...,toy fox terrier,toy poodle,vizsla,weimaraner,welsh springer spaniel,welsh terrier,west highland white terrier,whippet,wirehaired pointing griffon,yorkshire terrier
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104489,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
104490,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
104491,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
104492,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
pd.concat([
    X, 
    pd.get_dummies(X['breed_1']), 
    pd.get_dummies(X['primary_color'], prefix='color'), 
    pd.get_dummies(X['condition'], prefix='condition'), 
    pd.get_dummies(X['intake_type'], prefix='intake')
    ], axis=1)

Unnamed: 0,sex,age_in,intake_type,condition,intact_out,primary_color,breed_1,pure,obey,reps_lower,...,condition_med_attn,condition_other,condition_preg_nursing,condition_underage,condition_wnl,intake_abandoned,intake_euth_request,intake_public_assist,intake_stray,intake_surrender
0,1.0,10.0,surrender,wnl,0.0,tricolor,dachshund,1.0,0.500,26.0,...,0,0,0,0,1,0,0,0,0,1
1,1.0,16.0,public_assist,med_attn,0.0,brown,shetland sheepdog,1.0,0.950,1.0,...,1,0,0,0,0,0,0,1,0,0
2,0.0,15.0,stray,aged,0.0,black,labrador retriever,0.0,0.825,8.5,...,0,0,0,0,0,0,0,0,1,0
3,0.0,15.0,stray,wnl,1.0,black,miniature schnauzer,1.0,0.850,5.0,...,0,0,0,0,1,0,0,0,1,0
4,1.0,15.0,stray,wnl,0.0,brown,lhasa apso,1.0,0.300,41.0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104489,1.0,1.0,surrender,wnl,0.0,tricolor,basenji,1.0,0.100,81.0,...,0,0,0,0,1,0,0,0,0,1
104490,1.0,2.0,public_assist,wnl,0.0,black,german shepherd,1.0,0.950,1.0,...,0,0,0,0,1,0,0,1,0,0
104491,1.0,2.0,public_assist,wnl,0.0,black,german shepherd,1.0,0.950,1.0,...,0,0,0,0,1,0,0,1,0,0
104492,1.0,2.0,surrender,wnl,0.0,black,german shepherd,1.0,0.950,1.0,...,0,0,0,0,1,0,0,0,0,1


In [85]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

ValueError: could not convert string to float: 'surrender'