In [41]:
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from scipy import stats

In [42]:
# Import the data
df_adoption_data = pd.read_csv('pet_adoption_data.csv')
df_adoption_data.head()

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
0,500,Bird,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0,0
1,501,Rabbit,Rabbit,73,White,Large,16.086727,0,0,8,235,0,0
2,502,Dog,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0,0
3,503,Bird,Parakeet,97,White,Small,3.339423,0,0,61,217,1,0
4,504,Rabbit,Rabbit,123,Gray,Large,20.4981,0,0,28,14,1,0


In [43]:
#drop PetID
df_adoption_data_reconfigure = df_adoption_data.drop(['PetID', 'AdoptionFee', 'TimeInShelterDays', 'WeightKg'], axis = 1)


df_adoption_data_reconfigure

Unnamed: 0,PetType,Breed,AgeMonths,Color,Size,Vaccinated,HealthCondition,PreviousOwner,AdoptionLikelihood
0,Bird,Parakeet,131,Orange,Large,1,0,0,0
1,Rabbit,Rabbit,73,White,Large,0,0,0,0
2,Dog,Golden Retriever,136,Orange,Medium,0,0,0,0
3,Bird,Parakeet,97,White,Small,0,0,1,0
4,Rabbit,Rabbit,123,Gray,Large,0,0,1,0
...,...,...,...,...,...,...,...,...,...
2002,Dog,Poodle,72,Orange,Small,1,0,1,1
2003,Rabbit,Rabbit,124,Brown,Small,1,1,0,0
2004,Rabbit,Rabbit,113,Orange,Small,1,0,0,0
2005,Dog,Labrador,12,Gray,Large,1,0,0,0


In [44]:
df_adoption_data_reconfigure.columns

Index(['PetType', 'Breed', 'AgeMonths', 'Color', 'Size', 'Vaccinated',
       'HealthCondition', 'PreviousOwner', 'AdoptionLikelihood'],
      dtype='object')

In [45]:
# get the features (everything except the "WeightKg" column
X = df_adoption_data_reconfigure.copy().drop(columns=["Vaccinated", "HealthCondition", "PreviousOwner"])
X.head()

Unnamed: 0,PetType,Breed,AgeMonths,Color,Size,AdoptionLikelihood
0,Bird,Parakeet,131,Orange,Large,0
1,Rabbit,Rabbit,73,White,Large,0
2,Dog,Golden Retriever,136,Orange,Medium,0
3,Bird,Parakeet,97,White,Small,0
4,Rabbit,Rabbit,123,Gray,Large,0


In [46]:
from sklearn.preprocessing import OneHotEncoder

In [47]:
#verify the type of data from the dataset
df_adoption_data_reconfigure.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2007 entries, 0 to 2006
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   PetType             2007 non-null   object
 1   Breed               2007 non-null   object
 2   AgeMonths           2007 non-null   int64 
 3   Color               2007 non-null   object
 4   Size                2007 non-null   object
 5   Vaccinated          2007 non-null   int64 
 6   HealthCondition     2007 non-null   int64 
 7   PreviousOwner       2007 non-null   int64 
 8   AdoptionLikelihood  2007 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 141.2+ KB


In [48]:
#categorize all columns from the dataset for better reading
categorical_columns = df_adoption_data_reconfigure.select_dtypes(include=['float','int64','object']).columns.tolist()
categorical_columns

['PetType',
 'Breed',
 'AgeMonths',
 'Color',
 'Size',
 'Vaccinated',
 'HealthCondition',
 'PreviousOwner',
 'AdoptionLikelihood']

In [49]:
encoder = OneHotEncoder(sparse_output=False)

In [50]:
encoded_df = encoder.fit_transform(df_adoption_data_reconfigure[categorical_columns])

In [51]:
encoded_df = pd.DataFrame(encoded_df, columns=encoder.get_feature_names_out(categorical_columns))
encoded_df

Unnamed: 0,PetType_Bird,PetType_Cat,PetType_Dog,PetType_Rabbit,Breed_Golden Retriever,Breed_Labrador,Breed_Parakeet,Breed_Persian,Breed_Poodle,Breed_Rabbit,...,Size_Medium,Size_Small,Vaccinated_0,Vaccinated_1,HealthCondition_0,HealthCondition_1,PreviousOwner_0,PreviousOwner_1,AdoptionLikelihood_0,AdoptionLikelihood_1
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
2003,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
2004,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
2005,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [53]:
#Concatenate the one_hot_df dataframe with original dataframe
df_encoded = pd.concat([df_adoption_data_reconfigure, encoded_df], axis=1)

#Drop the original categorical columns
df_encoded = df_encoded.drop(categorical_columns, axis=1)

df_encoded.columns

Index(['PetType_Bird', 'PetType_Cat', 'PetType_Dog', 'PetType_Rabbit',
       'Breed_Golden Retriever', 'Breed_Labrador', 'Breed_Parakeet',
       'Breed_Persian', 'Breed_Poodle', 'Breed_Rabbit',
       ...
       'Size_Medium', 'Size_Small', 'Vaccinated_0', 'Vaccinated_1',
       'HealthCondition_0', 'HealthCondition_1', 'PreviousOwner_0',
       'PreviousOwner_1', 'AdoptionLikelihood_0', 'AdoptionLikelihood_1'],
      dtype='object', length=206)

In [55]:
encoded_df.columns

Index(['PetType_Bird', 'PetType_Cat', 'PetType_Dog', 'PetType_Rabbit',
       'Breed_Golden Retriever', 'Breed_Labrador', 'Breed_Parakeet',
       'Breed_Persian', 'Breed_Poodle', 'Breed_Rabbit',
       ...
       'Size_Medium', 'Size_Small', 'Vaccinated_0', 'Vaccinated_1',
       'HealthCondition_0', 'HealthCondition_1', 'PreviousOwner_0',
       'PreviousOwner_1', 'AdoptionLikelihood_0', 'AdoptionLikelihood_1'],
      dtype='object', length=206)

In [57]:
#df_encoded = df_encoded.drop('AdoptionFee_493', axis=1)
#df_encoded.head()

encoded_df = encoded_df.drop(['PreviousOwner_0', 'PreviousOwner_1'], axis=1)
encoded_df.head()

Unnamed: 0,PetType_Bird,PetType_Cat,PetType_Dog,PetType_Rabbit,Breed_Golden Retriever,Breed_Labrador,Breed_Parakeet,Breed_Persian,Breed_Poodle,Breed_Rabbit,...,Color_White,Size_Large,Size_Medium,Size_Small,Vaccinated_0,Vaccinated_1,HealthCondition_0,HealthCondition_1,AdoptionLikelihood_0,AdoptionLikelihood_1
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [58]:
df_encoded.shape

(2007, 206)

In [60]:
features = ["PetType_Bird", "PetType_Cat", "PetType_Dog", "PetType_Rabbit", "Breed_Golden Retriever", "Breed_Labrador", "Breed_Parakeet", "Breed_Persian", "Breed_Poodle",
            "Breed_Rabbit", "AdoptionLikelihood_0", "AdoptionLikelihood_1"]
X = encoded_df[features]
X.head()

Unnamed: 0,PetType_Bird,PetType_Cat,PetType_Dog,PetType_Rabbit,Breed_Golden Retriever,Breed_Labrador,Breed_Parakeet,Breed_Persian,Breed_Poodle,Breed_Rabbit,AdoptionLikelihood_0,AdoptionLikelihood_1
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [61]:
#git the target variables: since they are only 2 possibilities, remove one.
y = encoded_df['AdoptionLikelihood_0']
y.head()

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: AdoptionLikelihood_0, dtype: float64

In [33]:
#splitting into training and testing sets
from sklearn.model_selection import train_test_split

In [62]:
# now split the data into training sets again
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [63]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1204, 12)
(803, 12)
(1204,)
(803,)


In [64]:
# Import `LogisticRegression` from sklearn
from sklearn.linear_model import LogisticRegression

In [65]:
model = LogisticRegression()

In [66]:
model = LogisticRegression()

In [67]:
prediction = model.predict(X_test)
prediction[:5]

array([1., 1., 1., 1., 1.])

In [68]:
y_test[:5]

1960    1.0
526     1.0
393     1.0
1402    1.0
433     1.0
Name: AdoptionLikelihood_0, dtype: float64

In [3]:
#get the accuracy
from sklearn.metrics import accuracy_score

