#########################################

Import Libraries

#########################################


In [85]:
import pandas as pd
import seaborn as sns
import numpy as np

#########################################

Import Data

#########################################


In [86]:
# Load seaborn's 'penguins' example dataset and preview its first five rows.
df = sns.load_dataset(name='penguins')
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [87]:
# Tally the rows recorded for each island; value_counts lists the most
# frequent island first.
print("Count of distinct values in column 'island':\n")
df.loc[:, 'island'].value_counts()

Count of distinct values in column 'island':



Biscoe       168
Dream        124
Torgersen     52
Name: island, dtype: int64

#########################################

Drop Useless Information

#########################################

In [88]:
# Remove the bill_depth_mm column (treated as unneeded in this walkthrough)
# and preview the first five rows of the narrower frame.
df = df.drop(columns=['bill_depth_mm'])
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,195.0,3250.0,Female
3,Adelie,Torgersen,,,,
4,Adelie,Torgersen,36.7,193.0,3450.0,Female


#########################################

Rebuild Missing Data

#########################################

In [89]:
# Two-part missing-value report: first a per-column flag telling whether any
# value is missing, then the per-column count of missing values.
# A single print with sep='\n' emits exactly what one print call per item would.
print(
    "Columns cointaining 'na' values:\n",
    df.isna().any(),
    '\n----------------------------------------------------\n',
    "Count of 'na' values per column:\n",
    df.isna().sum(),
    sep='\n',
)


Columns cointaining 'na' values:

species              False
island               False
bill_length_mm        True
flipper_length_mm     True
body_mass_g           True
sex                   True
dtype: bool

----------------------------------------------------

Count of 'na' values per column:

species               0
island                0
bill_length_mm        2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64


In [90]:
# Fill the missing bill lengths with the column mean (Series.mean skips NaN
# by default), then re-count the missing values still left in each column.
mean_bill_length = df['bill_length_mm'].mean()
df['bill_length_mm'] = df['bill_length_mm'].fillna(mean_bill_length)
print("Count of 'na' values per column:\n", df.isna().sum(), sep='\n')

Count of 'na' values per column:

species               0
island                0
bill_length_mm        0
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64


#########################################

Removing Duplicate Data

#########################################

In [91]:
# duplicated() marks every row that repeats an earlier, fully identical row,
# so the first occurrence of each group is not part of this selection.
duplicate_rows = df[df.duplicated()]
print(f"Detected {len(duplicate_rows)} duplicated row(s):\n ")
duplicate_rows

Detected 1 duplicated row(s):
 


Unnamed: 0,species,island,bill_length_mm,flipper_length_mm,body_mass_g,sex
206,Chinstrap,Dream,42.5,187.0,3350.0,Female


In [92]:
# Keep the first occurrence of each fully identical row and discard the
# repeats; the surviving rows keep their original index labels.
df = df.drop_duplicates(keep='first')
# Sanity check: no row should be flagged as a duplicate any more.
print("Duplicated row(s) after cleaning: " + str(df.duplicated().sum()))

Duplicated row(s) after cleaning: 0


#########################################

Data Enrichment

#########################################

In [93]:
# Relabel every island through an explicit old-name -> new-name mapping,
# then recount the rows per relabelled island.
df['island'] = df['island'].replace(
    {'Biscoe': 'Sicilia', 'Dream': 'Sardegna', 'Torgersen': 'Corsica'}
)
df['island'].value_counts()

Sicilia     168
Sardegna    123
Corsica      52
Name: island, dtype: int64

In [94]:
# Rename the 'island' column to 'italian_island'; all other columns are untouched.
df = df.rename({"island": "italian_island"}, axis="columns")

In [95]:
# One-hot encode 'italian_island': the column is replaced by one indicator
# column per distinct value, named 'italian_island_<value>'.
# The dtype is pinned explicitly because pandas >= 2.0 creates bool
# (True/False) indicator columns by default, whereas earlier releases created
# uint8 0/1 columns; pinning keeps the 0/1 encoding on every pandas version.
df = pd.get_dummies(df, columns=['italian_island'], dtype='uint8')
df.head()

Unnamed: 0,species,bill_length_mm,flipper_length_mm,body_mass_g,sex,italian_island_Corsica,italian_island_Sardegna,italian_island_Sicilia
0,Adelie,39.1,181.0,3750.0,Male,1,0,0
1,Adelie,39.5,186.0,3800.0,Female,1,0,0
2,Adelie,40.3,195.0,3250.0,Female,1,0,0
3,Adelie,43.92193,,,,1,0,0
4,Adelie,36.7,193.0,3450.0,Female,1,0,0
