# Zoo Extended List Data Cleaning

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from pathlib import Path

# Location of the raw extended zoo list, relative to the working directory.
zoo_data_load = Path("Resources") / "zoo_extended_list.csv"

# Load the raw CSV into a DataFrame and preview its first five rows.
zoo_data = pd.read_csv(zoo_data_load)
zoo_data.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [2]:
# Normalise animal names to lower case so that later name comparisons
# (e.g. the duplicate check) ignore differences in capitalisation.
lowercase_names = zoo_data['animal_name'].str.lower()
zoo_data['animal_name'] = lowercase_names


In [3]:
# List the repeated animal names. `duplicated` leaves the first occurrence of
# each name unflagged, so only the second and later occurrences appear here.
duplicates = zoo_data[zoo_data.duplicated(subset='animal_name')]
# End the cell with the frame itself rather than print() so the notebook
# renders it as a table instead of wrapped, truncated plain text.
duplicates


    animal_name  hair  feathers  eggs  milk  airborne  aquatic  predator  \
26         frog     0         0     1     0         0        1         1   
123     vulture     0         1     1     0         1        0         1   
145     dolphin     0         0     0     1         0        1         1   
146    porpoise     0         0     0     1         0        1         1   
152     buffalo     1         0     0     1         0        0         0   
..          ...   ...       ...   ...   ...       ...      ...       ...   
470         yak     1         0     0     1         0        0         1   
471  fennec_fox     1         0     0     1         0        0         1   
472       tiger     1         0     0     1         0        0         1   
473   red_panda     1         0     0     1         0        0         1   
474      alpaca     1         0     0     1         0        0         0   

     toothed  backbone  breathes  venomous  fins  legs  tail  domestic  \
26         1 

In [4]:
# Keep only the first row for each animal name, then preview the result.
zoo_data = zoo_data.drop_duplicates(subset='animal_name')
zoo_data.head(30)

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
5,buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
6,calf,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,1
7,carp,0,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0,4
8,catfish,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
9,cavy,1,0,0,1,0,0,0,1,1,1,0,0,4,0,1,0,1


In [5]:
# Filter out every row whose animal_name is exactly 'vampire'.
is_vampire = zoo_data['animal_name'] == 'vampire'
zoo_data = zoo_data[~is_vampire]


In [6]:
# Relabel the 'girl' entry as 'human', then preview the table.
is_girl = zoo_data['animal_name'] == 'girl'
zoo_data.loc[is_girl, 'animal_name'] = 'human'
zoo_data.head(30)

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
5,buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
6,calf,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,1
7,carp,0,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0,4
8,catfish,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
9,cavy,1,0,0,1,0,0,0,1,1,1,0,0,4,0,1,0,1


In [7]:
# Summarise the frame: index range, per-column non-null counts, and dtypes.
zoo_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 276 entries, 0 to 466
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   animal_name  276 non-null    object
 1   hair         276 non-null    int64 
 2   feathers     276 non-null    int64 
 3   eggs         276 non-null    int64 
 4   milk         276 non-null    int64 
 5   airborne     276 non-null    int64 
 6   aquatic      276 non-null    int64 
 7   predator     276 non-null    int64 
 8   toothed      276 non-null    int64 
 9   backbone     276 non-null    int64 
 10  breathes     276 non-null    int64 
 11  venomous     276 non-null    int64 
 12  fins         276 non-null    int64 
 13  legs         276 non-null    int64 
 14  tail         276 non-null    int64 
 15  domestic     276 non-null    int64 
 16  catsize      276 non-null    int64 
 17  class_type   276 non-null    int64 
dtypes: int64(17), object(1)
memory usage: 41.0+ KB


In [8]:
# Count the non-null entries in each column.
zoo_data.count()

animal_name    276
hair           276
feathers       276
eggs           276
milk           276
airborne       276
aquatic        276
predator       276
toothed        276
backbone       276
breathes       276
venomous       276
fins           276
legs           276
tail           276
domestic       276
catsize        276
class_type     276
dtype: int64

In [9]:
# Cast the flag columns (0/1 in the rows previewed earlier) to bool.
# `legs` and `class_type` are not listed, so they stay as integers.
boolean_columns = [
    'hair', 'feathers', 'eggs', 'milk', 'airborne',
    'aquatic', 'predator', 'toothed', 'backbone', 'breathes',
    'venomous', 'fins', 'tail', 'domestic', 'catsize',
]
zoo_data = zoo_data.astype(dict.fromkeys(boolean_columns, bool))


In [10]:
# Re-check the dtypes to confirm the bool conversion took effect.
zoo_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 276 entries, 0 to 466
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   animal_name  276 non-null    object
 1   hair         276 non-null    bool  
 2   feathers     276 non-null    bool  
 3   eggs         276 non-null    bool  
 4   milk         276 non-null    bool  
 5   airborne     276 non-null    bool  
 6   aquatic      276 non-null    bool  
 7   predator     276 non-null    bool  
 8   toothed      276 non-null    bool  
 9   backbone     276 non-null    bool  
 10  breathes     276 non-null    bool  
 11  venomous     276 non-null    bool  
 12  fins         276 non-null    bool  
 13  legs         276 non-null    int64 
 14  tail         276 non-null    bool  
 15  domestic     276 non-null    bool  
 16  catsize      276 non-null    bool  
 17  class_type   276 non-null    int64 
dtypes: bool(15), int64(2), object(1)
memory usage: 12.7+ KB


In [11]:
# Remove the "catsize" and "domestic" columns and relabel "breathes" as
# "air_breather" in one chained expression, then preview the result.
zoo_data = (
    zoo_data
    .drop(columns=['catsize', 'domestic'])
    .rename(columns={'breathes': 'air_breather'})
)
zoo_data.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,air_breather,venomous,fins,legs,tail,class_type
0,aardvark,True,False,False,True,False,False,True,True,True,True,False,False,4,False,1
1,antelope,True,False,False,True,False,False,False,True,True,True,False,False,4,True,1
2,bass,False,False,True,False,False,True,True,True,True,False,False,True,0,True,4
3,bear,True,False,False,True,False,False,True,True,True,True,False,False,4,False,1
4,boar,True,False,False,True,False,False,True,True,True,True,False,False,4,True,1


In [12]:
# Add a "water_breather" flag: True where class_type is 4 or 5, False otherwise.
# NOTE(review): in the rows previewed in this notebook, bass is class 4 and
# axolotl is class 5 - confirm these two codes cover every class intended.
water_breathing_classes = [4, 5]
zoo_data['water_breather'] = zoo_data['class_type'].isin(water_breathing_classes)
zoo_data.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,air_breather,venomous,fins,legs,tail,class_type,water_breather
0,aardvark,True,False,False,True,False,False,True,True,True,True,False,False,4,False,1,False
1,antelope,True,False,False,True,False,False,False,True,True,True,False,False,4,True,1,False
2,bass,False,False,True,False,False,True,True,True,True,False,False,True,0,True,4,True
3,bear,True,False,False,True,False,False,True,True,True,True,False,False,4,False,1,False
4,boar,True,False,False,True,False,False,True,True,True,True,False,False,4,True,1,False


In [13]:
# Sort the rows alphabetically by animal name and preview the result.
zoo_data = zoo_data.sort_values(by=['animal_name'])
zoo_data.head()


Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,air_breather,venomous,fins,legs,tail,class_type,water_breather
0,aardvark,True,False,False,True,False,False,True,True,True,True,False,False,4,False,1,False
101,aardwolf,True,False,False,True,False,False,True,True,True,True,False,False,4,True,1,False
134,african_elephant,True,False,False,True,False,False,False,True,True,True,False,False,4,True,1,False
235,alligator,True,False,True,False,False,True,True,True,True,True,False,True,4,True,3,False
102,alpaca,True,False,False,True,False,False,False,True,True,True,False,False,4,True,1,False


In [14]:
# Desired column order: name and class first, then the trait flags,
# with the numeric leg count last.
reorder_columns = [
    'animal_name', 'class_type', 'hair', 'feathers', 'eggs',
    'milk', 'airborne', 'aquatic', 'predator', 'toothed',
    'backbone', 'air_breather', 'water_breather', 'venomous',
    'fins', 'tail', 'legs',
]

# Select the columns by label instead of reindex(): a misspelled or missing
# name then raises KeyError rather than silently adding an all-NaN column.
zoo_data = zoo_data[reorder_columns]

# Show the first rows of the reordered DataFrame.
zoo_data.head()

Unnamed: 0,animal_name,class_type,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,air_breather,water_breather,venomous,fins,tail,legs
0,aardvark,1,True,False,False,True,False,False,True,True,True,True,False,False,False,False,4
101,aardwolf,1,True,False,False,True,False,False,True,True,True,True,False,False,False,True,4
134,african_elephant,1,True,False,False,True,False,False,False,True,True,True,False,False,False,True,4
235,alligator,3,True,False,True,False,False,True,True,True,True,True,False,False,True,True,4
102,alpaca,1,True,False,False,True,False,False,False,True,True,True,False,False,False,True,4


In [15]:
# Replace the out-of-order index left by the sort with a fresh 0..n-1 index;
# drop=True discards the old labels instead of keeping them as a column.
zoo_data = zoo_data.reset_index(drop=True)
zoo_data.head(30)

Unnamed: 0,animal_name,class_type,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,air_breather,water_breather,venomous,fins,tail,legs
0,aardvark,1,True,False,False,True,False,False,True,True,True,True,False,False,False,False,4
1,aardwolf,1,True,False,False,True,False,False,True,True,True,True,False,False,False,True,4
2,african_elephant,1,True,False,False,True,False,False,False,True,True,True,False,False,False,True,4
3,alligator,3,True,False,True,False,False,True,True,True,True,True,False,False,True,True,4
4,alpaca,1,True,False,False,True,False,False,False,True,True,True,False,False,False,True,4
5,antelope,1,True,False,False,True,False,False,False,True,True,True,False,False,False,True,4
6,armadillo,1,True,False,True,False,False,False,True,True,True,True,False,False,False,True,4
7,asian_elephant,1,True,False,False,True,False,False,False,True,True,True,False,False,False,True,4
8,axolotl,5,False,False,True,False,False,True,True,True,True,True,True,False,False,False,4
9,aye-aye,1,True,False,True,False,False,False,True,True,True,True,False,False,False,True,4


In [16]:
# Export the cleaned data to a new CSV file.
# Create the output folder first so the export does not fail with an OSError
# when "Clean_Data" does not exist yet (e.g. on a fresh checkout).
clean_data_path = Path('Clean_Data/zoo_extended_list_cleaned.csv')
clean_data_path.parent.mkdir(parents=True, exist_ok=True)
# index=False leaves the row index out of the written file.
zoo_data.to_csv(clean_data_path, index=False)