# Zoo Data Cleaning 

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from pathlib import Path

# Location of the raw zoo dataset (a relative path, resolved against the working directory).
zoo_data_load = Path("Resources") / "zoo.csv"

# Parse the CSV into a DataFrame and preview its first five rows.
zoo_data = pd.read_csv(zoo_data_load)
zoo_data.head(n=5)

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [2]:
# List every row whose animal_name already appeared earlier in the data
# (the first occurrence of each name is not flagged).
duplicates = zoo_data.loc[zoo_data['animal_name'].duplicated(keep='first')]
print(duplicates)


   animal_name  hair  feathers  eggs  milk  airborne  aquatic  predator  \
26        frog     0         0     1     0         0        1         1   

    toothed  backbone  breathes  venomous  fins  legs  tail  domestic  \
26        1         1         1         1     0     4     0         0   

    catsize  class_type  
26        0           5  


In [3]:
# Drop duplicates: keep only the first row for each animal_name.
# The result is rebound to zoo_data instead of using inplace=True, so this step is an
# explicit assignment like the other transformation cells in this notebook.
zoo_data = zoo_data.drop_duplicates(subset='animal_name')
zoo_data.head(30)


Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
5,buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
6,calf,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,1
7,carp,0,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0,4
8,catfish,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
9,cavy,1,0,0,1,0,0,0,1,1,1,0,0,4,0,1,0,1


In [4]:
# Remove the vampire from the dataset.
# .copy() makes the filtered result an independent DataFrame, so the assignments made in
# later cells (the .loc rename and the dtype conversion) never write into a selection of
# the previous frame, which is the situation pandas flags with SettingWithCopyWarning.
zoo_data = zoo_data[zoo_data['animal_name'] != 'vampire'].copy()


In [5]:
# Relabel the entry named "girl" as "human"; all other names are left untouched.
zoo_data.loc[zoo_data['animal_name'].eq('girl'), 'animal_name'] = 'human'
zoo_data.head(n=30)

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
5,buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
6,calf,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,1
7,carp,0,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0,4
8,catfish,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
9,cavy,1,0,0,1,0,0,0,1,1,1,0,0,4,0,1,0,1


In [6]:
# Check the datatypes of the dataset: info() prints each column's dtype and non-null count.
zoo_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99 entries, 0 to 100
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   animal_name  99 non-null     object
 1   hair         99 non-null     int64 
 2   feathers     99 non-null     int64 
 3   eggs         99 non-null     int64 
 4   milk         99 non-null     int64 
 5   airborne     99 non-null     int64 
 6   aquatic      99 non-null     int64 
 7   predator     99 non-null     int64 
 8   toothed      99 non-null     int64 
 9   backbone     99 non-null     int64 
 10  breathes     99 non-null     int64 
 11  venomous     99 non-null     int64 
 12  fins         99 non-null     int64 
 13  legs         99 non-null     int64 
 14  tail         99 non-null     int64 
 15  domestic     99 non-null     int64 
 16  catsize      99 non-null     int64 
 17  class_type   99 non-null     int64 
dtypes: int64(17), object(1)
memory usage: 14.7+ KB


In [7]:
# Columns that hold 0/1 flags; "legs" and "class_type" are not listed and stay integer.
boolean_columns = [
    'hair', 'feathers', 'eggs', 'milk', 'airborne',
    'aquatic', 'predator', 'toothed', 'backbone', 'breathes',
    'venomous', 'fins', 'tail', 'domestic', 'catsize',
]
# Cast every listed flag column to bool with a single astype call.
zoo_data = zoo_data.astype({column: bool for column in boolean_columns})


In [8]:
# Re-run info() to confirm the flag columns now report dtype bool.
zoo_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99 entries, 0 to 100
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   animal_name  99 non-null     object
 1   hair         99 non-null     bool  
 2   feathers     99 non-null     bool  
 3   eggs         99 non-null     bool  
 4   milk         99 non-null     bool  
 5   airborne     99 non-null     bool  
 6   aquatic      99 non-null     bool  
 7   predator     99 non-null     bool  
 8   toothed      99 non-null     bool  
 9   backbone     99 non-null     bool  
 10  breathes     99 non-null     bool  
 11  venomous     99 non-null     bool  
 12  fins         99 non-null     bool  
 13  legs         99 non-null     int64 
 14  tail         99 non-null     bool  
 15  domestic     99 non-null     bool  
 16  catsize      99 non-null     bool  
 17  class_type   99 non-null     int64 
dtypes: bool(15), int64(2), object(1)
memory usage: 4.5+ KB


In [9]:
# Reset the index of the DataFrame.
# The row removals above left gaps in the labels (99 rows labelled 0 to 100);
# drop=True discards the old labels instead of keeping them as a new column.
zoo_data = zoo_data.reset_index(drop=True)
zoo_data.head(30)

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,True,False,False,True,False,False,True,True,True,True,False,False,4,False,False,True,1
1,antelope,True,False,False,True,False,False,False,True,True,True,False,False,4,True,False,True,1
2,bass,False,False,True,False,False,True,True,True,True,False,False,True,0,True,False,False,4
3,bear,True,False,False,True,False,False,True,True,True,True,False,False,4,False,False,True,1
4,boar,True,False,False,True,False,False,True,True,True,True,False,False,4,True,False,True,1
5,buffalo,True,False,False,True,False,False,False,True,True,True,False,False,4,True,False,True,1
6,calf,True,False,False,True,False,False,False,True,True,True,False,False,4,True,True,True,1
7,carp,False,False,True,False,False,True,False,True,True,False,False,True,0,True,True,False,4
8,catfish,False,False,True,False,False,True,True,True,True,False,False,True,0,True,False,False,4
9,cavy,True,False,False,True,False,False,False,True,True,True,False,False,4,False,True,False,1


In [10]:
# Drop the columns "catsize" and "domestic" from the dataset, then
# rename the column "breathes" to "air_breather".
# Both steps return new frames and are chained into one reassignment rather than
# mutating zoo_data twice with inplace=True.
zoo_data = (
    zoo_data
    .drop(columns=['catsize', 'domestic'])
    .rename(columns={'breathes': 'air_breather'})
)
zoo_data.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,air_breather,venomous,fins,legs,tail,class_type
0,aardvark,True,False,False,True,False,False,True,True,True,True,False,False,4,False,1
1,antelope,True,False,False,True,False,False,False,True,True,True,False,False,4,True,1
2,bass,False,False,True,False,False,True,True,True,True,False,False,True,0,True,4
3,bear,True,False,False,True,False,False,True,True,True,True,False,False,4,False,1
4,boar,True,False,False,True,False,False,True,True,True,True,False,False,4,True,1


In [11]:
# Create a new column called "water_breather", True when class_type is 4 or 5.
# NOTE(review): in this notebook bass/carp/catfish are class 4 and frog/salamander are
# class 5, presumably fish and amphibians; confirm against the dataset's class legend.
# isin() already returns a boolean Series, so no np.where(..., True, False) wrapper is needed.
zoo_data['water_breather'] = zoo_data['class_type'].isin([4, 5])
zoo_data.head()



Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,air_breather,venomous,fins,legs,tail,class_type,water_breather
0,aardvark,True,False,False,True,False,False,True,True,True,True,False,False,4,False,1,False
1,antelope,True,False,False,True,False,False,False,True,True,True,False,False,4,True,1,False
2,bass,False,False,True,False,False,True,True,True,True,False,False,True,0,True,4,True
3,bear,True,False,False,True,False,False,True,True,True,True,False,False,4,False,1,False
4,boar,True,False,False,True,False,False,True,True,True,True,False,False,4,True,1,False


In [12]:
# Create a new column order
reorder_columns = [
    'animal_name', 'class_type',
    'hair', 'feathers', 'eggs', 'milk',
    'airborne', 'aquatic', 'predator', 'toothed',
    'backbone', 'air_breather', 'water_breather', 'venomous',
    'fins', 'tail', 'legs',
]

# Reorder the columns by label selection rather than reindex(): reindex would silently
# add an all-NaN column for any misspelled or missing name, whereas selecting by label
# raises KeyError.
zoo_data = zoo_data[reorder_columns]

# Show the updated DataFrame
zoo_data.head()


Unnamed: 0,animal_name,class_type,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,air_breather,water_breather,venomous,fins,tail,legs
0,aardvark,1,True,False,False,True,False,False,True,True,True,True,False,False,False,False,4
1,antelope,1,True,False,False,True,False,False,False,True,True,True,False,False,False,True,4
2,bass,4,False,False,True,False,False,True,True,True,True,False,True,False,True,True,0
3,bear,1,True,False,False,True,False,False,True,True,True,True,False,False,False,False,4
4,boar,1,True,False,False,True,False,False,True,True,True,True,False,False,False,True,4


In [13]:
# Append one extra record (a salamander) to the end of the table.
new_row = {
    'animal_name': 'salamander',
    'class_type': 5,
    'hair': False,
    'feathers': False,
    'eggs': True,
    'milk': False,
    'airborne': False,
    'aquatic': True,
    'predator': True,
    'toothed': True,
    'backbone': True,
    'air_breather': True,
    'water_breather': True,
    'venomous': False,
    'fins': False,
    'tail': True,
    'legs': 4,
}
# Wrap the record in a one-row frame; ignore_index renumbers the combined rows 0..n-1.
salamander_frame = pd.DataFrame([new_row])
zoo_data = pd.concat([zoo_data, salamander_frame], ignore_index=True)
zoo_data.tail()


Unnamed: 0,animal_name,class_type,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,air_breather,water_breather,venomous,fins,tail,legs
95,wasp,6,True,False,True,False,True,False,False,False,False,True,False,True,False,False,6
96,wolf,1,True,False,False,True,False,False,True,True,True,True,False,False,False,True,4
97,worm,7,False,False,True,False,False,False,False,False,False,True,False,False,False,False,0
98,wren,2,False,True,True,False,True,False,False,False,True,True,False,False,False,True,2
99,salamander,5,False,False,True,False,False,True,True,True,True,True,True,False,False,True,4


In [14]:
# Save the cleaned data to a file called "zoo_cleaned.csv".
# Create the output folder first: to_csv does not create missing directories and
# raises OSError when "Clean_Data" is absent (e.g. on a fresh checkout).
output_path = Path('Clean_Data/zoo_cleaned.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
zoo_data.to_csv(output_path, index=False)
