# Penguins dataset, picked at random from Kaggle

In [1]:
import pandas as pd

In [2]:
#load the penguins CSV (the path is relative to the notebook's working directory)
data = pd.read_csv("penguins.csv")

In [3]:
#peek at the first row only, to see what the columns look like
data.iloc[:1]

Unnamed: 0,id,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007


In [4]:
#names of the columns (this shows the Index of labels, not a count)
data.columns

Index(['id', 'species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex', 'year'],
      dtype='object')

In [6]:
#data type of each column; the 'object' columns are the ones that get encoded below
data.dtypes

id                     int64
species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
year                   int64
dtype: object

In [7]:
#drop the 'id' column because the DataFrame index already identifies each row
#(reassigning with errors='ignore' keeps this cell safe to re-run; an in-place drop raises KeyError the second time)
data = data.drop(columns=['id'], errors='ignore')

In [8]:
#first five rows, to confirm the 'id' column is gone
data.iloc[:5]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


# We'll start with the 1st column and check each column for categorical or numerical features
# 1) Species


In [10]:
#distinct species labels present in the data
data.loc[:, 'species'].unique()

array(['Adelie', 'Gentoo', 'Chinstrap'], dtype=object)

In [11]:
#there are only 3 categories for species so we encode them as integer codes
species_codes = {'Adelie':0,'Gentoo':1,'Chinstrap':2}
#only encode while the column is still text: mapping the already-encoded integers
#again (e.g. when the cell is re-run) would turn every value into NaN
if not pd.api.types.is_numeric_dtype(data['species']):
    data['species'] = data['species'].map(species_codes)

In [16]:
#count the missing entries in 'species' (expected: none)
data['species'].isna().sum()

0

# 2nd column

In [13]:
#distinct island labels present in the data
data.loc[:, 'island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [14]:
#encode the 3 island names as integer codes
island_codes = {'Torgersen':0,'Biscoe':1,'Dream':2}
#only encode while the column is still text: mapping the already-encoded integers
#again (e.g. when the cell is re-run) would turn every value into NaN
if not pd.api.types.is_numeric_dtype(data['island']):
    data['island'] = data['island'].map(island_codes)

In [15]:
#first five rows, to check the encoded 'species' and 'island' columns
data.iloc[:5]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,0,0,39.1,18.7,181.0,3750.0,male,2007
1,0,0,39.5,17.4,186.0,3800.0,female,2007
2,0,0,40.3,18.0,195.0,3250.0,female,2007
3,0,0,,,,,,2007
4,0,0,36.7,19.3,193.0,3450.0,female,2007


In [17]:
#count the missing entries in 'island' (expected: none)
data['island'].isna().sum()

0

# 3rd column

In [19]:
#count the missing bill lengths; only 2 NaN / null values are there, so we fill them with the mode
data['bill_length_mm'].isna().sum()

2

In [21]:
#preview the value used for imputation: the first entry of the column's mode() result
data['bill_length_mm'].mode()[0]

41.1

In [22]:
#fill the missing bill lengths with the column's mode
#NOTE(review): the mode is an unusual choice for a continuous measurement - consider the median
data = data.fillna({'bill_length_mm': data['bill_length_mm'].mode()[0]})

In [23]:
#re-check: there should be no missing bill lengths left
data['bill_length_mm'].isna().sum()

0

# 4th column

In [25]:
#count the missing bill depths
data['bill_depth_mm'].isna().sum()

2

In [26]:
#here too there are two missing values; we handle them with the mode


In [27]:
#preview the value used for imputation: the first entry of the column's mode() result
data['bill_depth_mm'].mode()[0]

17.0

In [28]:
#fill the missing bill depths with the column's mode
data = data.fillna({'bill_depth_mm': data['bill_depth_mm'].mode()[0]})

In [29]:
#re-check: there should be no missing bill depths left
data['bill_depth_mm'].isna().sum()

0

In [30]:
#done with handling missing values for bill_depth_mm (the remaining columns are handled below)

# 5th column

In [31]:
#count the missing flipper lengths; the same technique (mode fill) applies here as well
data['flipper_length_mm'].isna().sum()

2

In [32]:
#preview the value used for imputation: the first entry of the column's mode() result
data['flipper_length_mm'].mode()[0]

190.0

In [33]:
#fill the missing flipper lengths with the column's mode
data = data.fillna({'flipper_length_mm': data['flipper_length_mm'].mode()[0]})

# 6th column

In [34]:
#count the missing body masses
data['body_mass_g'].isna().sum()

2

In [35]:
#fill the missing body masses with the column's mode
data = data.fillna({'body_mass_g': data['body_mass_g'].mode()[0]})

# 7th column

In [39]:
#fill the missing 'sex' entries with the most frequent label
data = data.fillna({'sex': data['sex'].mode()[0]})

In [40]:
#distinct 'sex' labels left after filling the missing entries
data.loc[:, 'sex'].unique()

array(['male', 'female'], dtype=object)

In [41]:
#encode 'sex' as a binary flag (male = 1, female = 0)
sex_codes = {'male':1,'female':0}
#only encode while the column is still text: mapping the already-encoded integers
#again (e.g. when the cell is re-run) would turn every value into NaN
if not pd.api.types.is_numeric_dtype(data['sex']):
    data['sex'] = data['sex'].map(sex_codes)

# 8th column

In [42]:
#distinct survey years present in the data
data.loc[:, 'year'].unique()

array([2007, 2008, 2009], dtype=int64)

In [43]:
#encode the year as 0/1/2; replace() leaves values that are not keys untouched,
#so re-running this cell is harmless (map() would turn the already-encoded codes into NaN)
data['year'] = data['year'].replace({2007 : 0,2008:1,2009:2})

In [44]:
#first five rows of the fully encoded / imputed frame
data.iloc[:5]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,0,0,39.1,18.7,181.0,3750.0,1,0
1,0,0,39.5,17.4,186.0,3800.0,0,0
2,0,0,40.3,18.0,195.0,3250.0,0,0
3,0,0,41.1,17.0,190.0,3800.0,1,0
4,0,0,36.7,19.3,193.0,3450.0,0,0


In [45]:
#summary of the column dtypes and non-null counts after cleaning
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    int64  
 1   island             344 non-null    int64  
 2   bill_length_mm     344 non-null    float64
 3   bill_depth_mm      344 non-null    float64
 4   flipper_length_mm  344 non-null    float64
 5   body_mass_g        344 non-null    float64
 6   sex                344 non-null    int64  
 7   year               344 non-null    int64  
dtypes: float64(4), int64(4)
memory usage: 21.6 KB


In [46]:
#descriptive statistics for every column
#(species, island, sex and year now hold integer codes, so their mean/std describe the codes, not a measurement)
data.describe()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
count,344.0,344.0,344.0,344.0,344.0,344.0,344.0,344.0
mean,0.755814,1.209302,43.905523,17.150291,200.851744,4199.418605,0.520349,1.02907
std,0.762626,0.68497,5.447882,1.969061,14.045266,800.197923,0.500313,0.818356
min,0.0,0.0,32.1,13.1,172.0,2700.0,0.0,0.0
25%,0.0,1.0,39.275,15.6,190.0,3550.0,0.0,0.0
50%,1.0,1.0,44.25,17.3,197.0,4025.0,1.0,1.0
75%,1.0,2.0,48.5,18.7,213.0,4750.0,1.0,2.0
max,2.0,2.0,59.6,21.5,231.0,6300.0,1.0,2.0
