In [1]:
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from scipy import stats

In [2]:
# Import the data
df_adoption_data = pd.read_csv('pet_adoption_data.csv')
df_adoption_data.head()

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
0,500,Bird,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0,0
1,501,Rabbit,Rabbit,73,White,Large,16.086727,0,0,8,235,0,0
2,502,Dog,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0,0
3,503,Bird,Parakeet,97,White,Small,3.339423,0,0,61,217,1,0
4,504,Rabbit,Rabbit,123,Gray,Large,20.4981,0,0,28,14,1,0


In [3]:
#drop PetID
df_adoption_data_reconfigure = df_adoption_data.drop(['PetID'], axis = 1)


df_adoption_data_reconfigure

Unnamed: 0,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
0,Bird,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0,0
1,Rabbit,Rabbit,73,White,Large,16.086727,0,0,8,235,0,0
2,Dog,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0,0
3,Bird,Parakeet,97,White,Small,3.339423,0,0,61,217,1,0
4,Rabbit,Rabbit,123,Gray,Large,20.498100,0,0,28,14,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2002,Dog,Poodle,72,Orange,Small,27.039045,1,0,66,26,1,1
2003,Rabbit,Rabbit,124,Brown,Small,4.726954,1,1,59,150,0,0
2004,Rabbit,Rabbit,113,Orange,Small,1.758592,1,0,68,302,0,0
2005,Dog,Labrador,12,Gray,Large,20.961592,1,0,59,478,0,0


In [4]:
df_adoption_data_reconfigure.columns

Index(['PetType', 'Breed', 'AgeMonths', 'Color', 'Size', 'WeightKg',
       'Vaccinated', 'HealthCondition', 'TimeInShelterDays', 'AdoptionFee',
       'PreviousOwner', 'AdoptionLikelihood'],
      dtype='object')

In [5]:
# get the features (everything except the "WeightKg" column
X = df_adoption_data_reconfigure.copy().drop(columns=["WeightKg", "Vaccinated", "HealthCondition", "PreviousOwner"])
X.head()

Unnamed: 0,PetType,Breed,AgeMonths,Color,Size,TimeInShelterDays,AdoptionFee,AdoptionLikelihood
0,Bird,Parakeet,131,Orange,Large,27,140,0
1,Rabbit,Rabbit,73,White,Large,8,235,0
2,Dog,Golden Retriever,136,Orange,Medium,85,385,0
3,Bird,Parakeet,97,White,Small,61,217,0
4,Rabbit,Rabbit,123,Gray,Large,28,14,0


In [6]:
#get the target column
y = df_adoption_data_reconfigure["WeightKg"]
y.head()

0     5.039768
1    16.086727
2     2.076286
3     3.339423
4    20.498100
Name: WeightKg, dtype: float64

In [7]:
# Use the Sklearn `train_test_split()` function to split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
#review the x-train dataframe
X_train.head()

Unnamed: 0,PetType,Breed,AgeMonths,Color,Size,TimeInShelterDays,AdoptionFee,AdoptionLikelihood
408,Dog,Golden Retriever,46,White,Small,10,195,0
1619,Bird,Parakeet,119,Gray,Medium,65,423,1
1454,Bird,Parakeet,121,Black,Large,86,375,0
741,Bird,Parakeet,109,Gray,Medium,30,462,0
1682,Dog,Poodle,75,White,Medium,75,464,1


In [9]:
# Display y_train
y_train.head()

408     27.562114
1619    11.762077
1454    26.054690
741      7.589023
1682     7.173933
Name: WeightKg, dtype: float64

In [10]:
df_adoption_data_reconfigure = pd.read_csv('pet_adoption_data.csv')
df_adoption_data_reconfigure.corr(numeric_only=True)

Unnamed: 0,PetID,AgeMonths,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
PetID,1.0,-0.008464,0.034315,0.008808,0.010486,0.017683,0.010557,0.001339,0.049582
AgeMonths,-0.008464,1.0,-0.029189,-0.001801,0.0172,0.036837,-0.038129,0.034943,-0.202209
WeightKg,0.034315,-0.029189,1.0,0.022526,0.000708,-0.00098,-0.002367,-0.049155,0.017601
Vaccinated,0.008808,-0.001801,0.022526,1.0,0.024079,-0.001123,0.021662,-0.01146,0.301311
HealthCondition,0.010486,0.0172,0.000708,0.024079,1.0,-0.012664,-0.002537,0.013755,-0.244061
TimeInShelterDays,0.017683,0.036837,-0.00098,-0.001123,-0.012664,1.0,-0.007104,0.007958,0.008867
AdoptionFee,0.010557,-0.038129,-0.002367,0.021662,-0.002537,-0.007104,1.0,0.009755,-0.005486
PreviousOwner,0.001339,0.034943,-0.049155,-0.01146,0.013755,0.007958,0.009755,1.0,-0.023065
AdoptionLikelihood,0.049582,-0.202209,0.017601,0.301311,-0.244061,0.008867,-0.005486,-0.023065,1.0


In [11]:
df_adoption_data_reconfigure.HealthCondition.corr(df_adoption_data.TimeInShelterDays)

-0.012663548841906182

In [12]:
corr = stats.pearsonr(df_adoption_data_reconfigure.HealthCondition, df_adoption_data_reconfigure.WeightKg)
corr

PearsonRResult(statistic=0.0007084988286332552, pvalue=0.974694818111588)

In [14]:
corr_df_adoption_data_reconfigure = pd.DataFrame(columns=['r', 'p'])

for col in df_adoption_data_reconfigure:
    print(col)
    if pd.api.types.is_numeric_dtype(df_adoption_data_reconfigure[col]) and col != 'WeightKg':
        r, p =stats.pearsonr(df_adoption_data_reconfigure.HealthCondition, df_adoption_data_reconfigure[col])
        corr_df_adoption_data_reconfigure[col] = [round(r, 2), round(p, 2)]
corr_df_adoption_data_reconfigure

PetID
PetType
Breed
AgeMonths
Color
Size
WeightKg
Vaccinated
HealthCondition
TimeInShelterDays
AdoptionFee
PreviousOwner
AdoptionLikelihood


Unnamed: 0,r,p,PetID,AgeMonths,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
0,,,0.01,0.02,0.02,1.0,-0.01,-0.0,0.01,-0.24
1,,,0.64,0.44,0.28,0.0,0.57,0.91,0.54,0.0


In [15]:
# total of days spend in the shelter per breed
# using value_counts
Total_time_spend = df_adoption_data_reconfigure['TimeInShelterDays'].value_counts()
Total_time_spend


TimeInShelterDays
15    40
52    36
21    32
6     30
79    30
      ..
32    14
39    14
16    12
80    12
11     9
Name: count, Length: 89, dtype: int64

In [16]:
#Total days spend by all the pets
number_of_days_in_Shelter = df_adoption_data_reconfigure['TimeInShelterDays']
print(sum(number_of_days_in_Shelter))

88256


In [17]:
#Average days spend in the shelter before adoption
Average_day_in_Shelter = len(number_of_days_in_Shelter)
Average_day_in_Shelter

2007

In [18]:
avr = number_of_days_in_Shelter / Average_day_in_Shelter
avr

0       0.013453
1       0.003986
2       0.042352
3       0.030394
4       0.013951
          ...   
2002    0.032885
2003    0.029397
2004    0.033881
2005    0.029397
2006    0.004983
Name: TimeInShelterDays, Length: 2007, dtype: float64

In [19]:
df_adoption_data_reconfigure['AdoptionLikelihood']

0       0
1       0
2       0
3       0
4       0
       ..
2002    1
2003    0
2004    0
2005    0
2006    0
Name: AdoptionLikelihood, Length: 2007, dtype: int64

In [20]:
from sklearn.preprocessing import OneHotEncoder

In [21]:
#verify the type of data from the dataset
df_adoption_data_reconfigure.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2007 entries, 0 to 2006
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   PetID               2007 non-null   int64  
 1   PetType             2007 non-null   object 
 2   Breed               2007 non-null   object 
 3   AgeMonths           2007 non-null   int64  
 4   Color               2007 non-null   object 
 5   Size                2007 non-null   object 
 6   WeightKg            2007 non-null   float64
 7   Vaccinated          2007 non-null   int64  
 8   HealthCondition     2007 non-null   int64  
 9   TimeInShelterDays   2007 non-null   int64  
 10  AdoptionFee         2007 non-null   int64  
 11  PreviousOwner       2007 non-null   int64  
 12  AdoptionLikelihood  2007 non-null   int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 204.0+ KB


In [22]:
#categorize all columns from the dataset for better reading
categorical_columns = df_adoption_data_reconfigure.select_dtypes(include=['float','int64','object']).columns.tolist()
categorical_columns

['PetID',
 'PetType',
 'Breed',
 'AgeMonths',
 'Color',
 'Size',
 'WeightKg',
 'Vaccinated',
 'HealthCondition',
 'TimeInShelterDays',
 'AdoptionFee',
 'PreviousOwner',
 'AdoptionLikelihood']

In [23]:
encoder = OneHotEncoder(sparse_output=False)

In [24]:
encoded_df = encoder.fit_transform(df_adoption_data_reconfigure[categorical_columns])

In [25]:
one_hot_df = pd.DataFrame(encoded_df, columns=encoder.get_feature_names_out(categorical_columns))
one_hot_df

Unnamed: 0,PetID_500,PetID_501,PetID_502,PetID_503,PetID_504,PetID_505,PetID_506,PetID_507,PetID_508,PetID_509,...,AdoptionFee_493,AdoptionFee_494,AdoptionFee_496,AdoptionFee_497,AdoptionFee_498,AdoptionFee_499,PreviousOwner_0,PreviousOwner_1,AdoptionLikelihood_0,AdoptionLikelihood_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [26]:
one_hot_df.columns

Index(['PetID_500', 'PetID_501', 'PetID_502', 'PetID_503', 'PetID_504',
       'PetID_505', 'PetID_506', 'PetID_507', 'PetID_508', 'PetID_509',
       ...
       'AdoptionFee_493', 'AdoptionFee_494', 'AdoptionFee_496',
       'AdoptionFee_497', 'AdoptionFee_498', 'AdoptionFee_499',
       'PreviousOwner_0', 'PreviousOwner_1', 'AdoptionLikelihood_0',
       'AdoptionLikelihood_1'],
      dtype='object', length=4802)

In [27]:
encoded_df.shape

(2007, 4802)

In [28]:
#get the features
features = ["PetType_Bird", "PetType_Cat", "PetType_Dog", "PetType_Rabbit", "Breed_Golden Retriever", "Breed_Labrador", "Breed_Parakeet", "Breed_Persian", "Breed_Poodle",
            "Breed_Rabbit", "AdoptionLikelihood_0", "AdoptionLikelihood_1"]
X = one_hot_df[features]
X.head()

Unnamed: 0,PetType_Bird,PetType_Cat,PetType_Dog,PetType_Rabbit,Breed_Golden Retriever,Breed_Labrador,Breed_Parakeet,Breed_Persian,Breed_Poodle,Breed_Rabbit,AdoptionLikelihood_0,AdoptionLikelihood_1
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [29]:
# now split the data into training sets again
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [30]:
# Preview X_train
X_train.head()

Unnamed: 0,PetType_Bird,PetType_Cat,PetType_Dog,PetType_Rabbit,Breed_Golden Retriever,Breed_Labrador,Breed_Parakeet,Breed_Persian,Breed_Poodle,Breed_Rabbit,AdoptionLikelihood_0,AdoptionLikelihood_1
1106,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
105,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
126,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1375,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1134,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [32]:
#features = ['Vaccinated', 'AdoptionFee_493','AdoptionFee_494','AdoptionFee_496', 'AdoptionFee_497', 'AdoptionFee_498','AdoptionFee_499', "PreviousOwner_0", 'PreviousOwner_1',  'AdoptionLikelihood_0', 'AdoptionLikelihood_1']
#y = one_hot_df[features]
#y.head()