<h1> Palmer Penguin Data Analytics</h1>
<h4>Analyst: John Paul Cortes</h4>
<hr>


In [1]:
import numpy as np
import pandas as pd
import warnings
from scipy.stats import f_oneway
warnings.filterwarnings('ignore')

<h2>Importing Data Sets and Converting Data Sets to Dataframes</h2>
<hr>
    <ul><b>Requirements</ul>
        <li>Name of the Dataframe</li>
        <li>Source</li>

In [2]:
penguins = pd.read_csv('DATASETS\\penguins.csv')

In [3]:
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


<h2> Describing the Data</h2>
<hr>

In [4]:
penguins.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
count,342.0,342.0,342.0,342.0,344.0
mean,43.92193,17.15117,200.915205,4201.754386,2008.02907
std,5.459584,1.974793,14.061714,801.954536,0.818356
min,32.1,13.1,172.0,2700.0,2007.0
25%,39.225,15.6,190.0,3550.0,2007.0
50%,44.45,17.3,197.0,4050.0,2008.0
75%,48.5,18.7,213.0,4750.0,2009.0
max,59.6,21.5,231.0,6300.0,2009.0


<h2>Transposing the Data</h2>
<hr>

In [5]:
penguins.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bill_length_mm,342.0,43.92193,5.459584,32.1,39.225,44.45,48.5,59.6
bill_depth_mm,342.0,17.15117,1.974793,13.1,15.6,17.3,18.7,21.5
flipper_length_mm,342.0,200.915205,14.061714,172.0,190.0,197.0,213.0,231.0
body_mass_g,342.0,4201.754386,801.954536,2700.0,3550.0,4050.0,4750.0,6300.0
year,344.0,2008.02907,0.818356,2007.0,2007.0,2008.0,2009.0,2009.0


<h2>Checking the Null Values</h2>
<hr>

In [6]:
penguins.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

<h2>Filtering the Data</h2>
<hr>

In [7]:
chinstrap_df = penguins[penguins['species'] == "Chinstrap"]
chinstrap_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
276,Chinstrap,Dream,46.5,17.9,192.0,3500.0,female,2007
277,Chinstrap,Dream,50.0,19.5,196.0,3900.0,male,2007
278,Chinstrap,Dream,51.3,19.2,193.0,3650.0,male,2007
279,Chinstrap,Dream,45.4,18.7,188.0,3525.0,female,2007
280,Chinstrap,Dream,52.7,19.8,197.0,3725.0,male,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [8]:
chinstrap_heavy = chinstrap_df[chinstrap_df['body_mass_g'] >= 4000]
chinstrap_heavy

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
284,Chinstrap,Dream,46.0,18.9,195.0,4150.0,female,2007
289,Chinstrap,Dream,52.0,18.1,201.0,4050.0,male,2007
291,Chinstrap,Dream,50.5,19.6,201.0,4050.0,male,2007
295,Chinstrap,Dream,49.2,18.2,195.0,4400.0,male,2007
301,Chinstrap,Dream,52.0,19.0,197.0,4150.0,male,2007
305,Chinstrap,Dream,52.8,20.0,205.0,4550.0,male,2008
307,Chinstrap,Dream,54.2,20.8,201.0,4300.0,male,2008
309,Chinstrap,Dream,51.0,18.8,203.0,4100.0,male,2008
313,Chinstrap,Dream,52.0,20.7,210.0,4800.0,male,2008
315,Chinstrap,Dream,53.5,19.9,205.0,4500.0,male,2008


In [9]:
chinstrap_heavy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, 284 to 342
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            16 non-null     object 
 1   island             16 non-null     object 
 2   bill_length_mm     16 non-null     float64
 3   bill_depth_mm      16 non-null     float64
 4   flipper_length_mm  16 non-null     float64
 5   body_mass_g        16 non-null     float64
 6   sex                16 non-null     object 
 7   year               16 non-null     int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 1.1+ KB


<h2>Checking the Null Values and Filling it up</h2>
<hr>

In [10]:
penguins.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [11]:
penguins['bill_length_mm'] = penguins['bill_length_mm'].fillna(penguins['bill_length_mm'].mean())
penguins.isnull().sum()

species               0
island                0
bill_length_mm        0
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [12]:
penguins['bill_depth_mm'] = penguins['bill_depth_mm'].fillna(penguins['bill_depth_mm'].mean())
penguins.isnull().sum()

species               0
island                0
bill_length_mm        0
bill_depth_mm         0
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [13]:
penguins['flipper_length_mm'] = penguins['flipper_length_mm'].fillna(penguins['flipper_length_mm'].mean())
penguins.isnull().sum()

species               0
island                0
bill_length_mm        0
bill_depth_mm         0
flipper_length_mm     0
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [14]:
penguins['body_mass_g'] = penguins['body_mass_g'].fillna(penguins['body_mass_g'].mean())
penguins.isnull().sum()

species               0
island                0
bill_length_mm        0
bill_depth_mm         0
flipper_length_mm     0
body_mass_g           0
sex                  11
year                  0
dtype: int64

In [15]:
penguins['sex'] = penguins['sex'].fillna(penguins['sex'].mode()[0])
penguins.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64

<h2>How to Change the Data Types Column</h2>
<hr>

In [16]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     344 non-null    float64
 3   bill_depth_mm      344 non-null    float64
 4   flipper_length_mm  344 non-null    float64
 5   body_mass_g        344 non-null    float64
 6   sex                344 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB


In [17]:
penguins['species'] = penguins['species'].astype('category')
penguins['island'] = penguins['island'].astype('category')
penguins['sex'] = penguins['sex'].astype('category')
penguins['year'] = penguins['year'].astype('category')

<h2>Result or Final Column</h2>
<hr>

In [18]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   species            344 non-null    category
 1   island             344 non-null    category
 2   bill_length_mm     344 non-null    float64 
 3   bill_depth_mm      344 non-null    float64 
 4   flipper_length_mm  344 non-null    float64 
 5   body_mass_g        344 non-null    float64 
 6   sex                344 non-null    category
 7   year               344 non-null    category
dtypes: category(4), float64(4)
memory usage: 12.7 KB


<h2>Creating New Column</h2>

In [19]:
penguins['body_mass_kg'] = round(penguins['body_mass_g']/1000,1)

In [20]:
penguins.describe

<bound method NDFrame.describe of        species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0       Adelie  Torgersen        39.10000       18.70000         181.000000   
1       Adelie  Torgersen        39.50000       17.40000         186.000000   
2       Adelie  Torgersen        40.30000       18.00000         195.000000   
3       Adelie  Torgersen        43.92193       17.15117         200.915205   
4       Adelie  Torgersen        36.70000       19.30000         193.000000   
..         ...        ...             ...            ...                ...   
339  Chinstrap      Dream        55.80000       19.80000         207.000000   
340  Chinstrap      Dream        43.50000       18.10000         202.000000   
341  Chinstrap      Dream        49.60000       18.20000         193.000000   
342  Chinstrap      Dream        50.80000       19.00000         210.000000   
343  Chinstrap      Dream        50.20000       18.70000         198.000000   

     body_mass_g 

<h2>Exploratory Data Analytics (EDA)</h2>
<hr>

<h4> 1. What's the average body mass of each species?</h4>

In [21]:
chinstrap_df = penguins[penguins['species'] == 'Chinstrap'] 
chinstrap_df['body_mass_g'].mean()


np.float64(3733.0882352941176)

In [22]:
gentoo_df = penguins[penguins['species'] == 'Gentoo'] 
gentoo_df['body_mass_g'].mean()


np.float64(5068.965761177136)

In [23]:
adelie_df = penguins[penguins['species'] == 'Adelie'] 
adelie_df['body_mass_g'].mean()

np.float64(3703.95891043398)

<h4>Insight</h4>
<ul><b>The average mass of each species are;</ul>
<li>Chinstrap: 3733.0882352941176</li>
<li>Gentoo: 5068.965761177136 </li>
<li>Adelie: 3703.95891043398</li>
<hr>

<h4>2. What's the average flipper length of all species? </h4>

penguins['flipper_length_mm'].mean()

<h4>Insight</h4>
<p>The average flipper length of all penguin species is 200.91520467836258</p>
<hr>

<h4>3. Which species has the longest bill? </h4>

In [24]:
adelie_df['bill_length_mm'].mean()

np.float64(38.82514427516159)

In [25]:
chinstrap_df['bill_length_mm'].mean()

np.float64(48.83382352941177)

In [26]:
gentoo_df['bill_length_mm'].mean()

np.float64(47.475983305036785)

<h4>Insight</h4>
<p>The species with the longest bill is Chinstrap with the length of 48.83382352941177</p>
<hr>

<h4>4. How many were the recorded penguins from each year from 2007 to 2009? </h4>

In [27]:
penguins['year'].value_counts()

year
2009    120
2008    114
2007    110
Name: count, dtype: int64

<h4>Insights</h4>
<p>The numbers of penguins that were recorded in 2007 were 110, in 2008 were 114, and lastly in 2009 there were 120 penguins.  </p>
<hr>

<h4>5. How many male and female are there in each of the island? </h4>

In [28]:
peng_d = penguins[penguins['island'] == "Dream"]
peng_d['sex'].value_counts()

sex
male      63
female    61
Name: count, dtype: int64

In [29]:
peng_t = penguins[penguins['island'] == "Torgersen"]
peng_t['sex'].value_counts()

sex
male      28
female    24
Name: count, dtype: int64

In [30]:
peng_b = penguins[penguins['island'] == "Biscoe"]
peng_b['sex'].value_counts()

sex
male      88
female    80
Name: count, dtype: int64

<h4>Insights</h4>
<p>The numbers of male and female from each of the island;
Torgersen: male      28
           female    24 
Biscoe:    male      88
           female    80
Dream:     male      63
           female    61</p>
<hr>

<h4>6. What's the average bill depth of each of species and who has the deepest bill? </h4>

In [31]:
chinstrap_df = penguins[penguins['species'] == 'Chinstrap'] 
chinstrap_df['bill_depth_mm'].mean()


np.float64(18.42058823529412)

In [32]:
gentoo_df = penguins[penguins['species'] == 'Gentoo'] 
gentoo_df['bill_depth_mm'].mean()

np.float64(14.999606206376153)

In [33]:
adelie_df = penguins[penguins['species'] == 'Adelie'] 
adelie_df['bill_depth_mm'].mean()

np.float64(18.338494536780548)

<h4

<h4> Insights</h4>
<ul><b>The average bill depth of each species;</ul>
<li>Chinstrap: 18.42058823529412</li>
<li>Gentoo: 14.999606206376153</li>
<li>Adelie: 18.338494536780548</li>
<p>Based from this analysis the specie that has the deepest bill is Chinstrap.</p>
<hr>

<h4> 7. How many Chinstrap, Gentoo, and Adelie are there in each year?</h4>

In [34]:
chinstrap_df['year'].value_counts()

year
2007    26
2009    24
2008    18
Name: count, dtype: int64

In [35]:
gentoo_df['year'].value_counts()

year
2008    46
2009    44
2007    34
Name: count, dtype: int64

In [36]:
adelie_df['year'].value_counts()

year
2009    52
2007    50
2008    50
Name: count, dtype: int64

<h4>Insights</h4>
<ul>The numbers of species in each year;</ul>
<p>Chinstrap</p>
<li>2007    26</li>
<li>2009    24</li>
<li>2008    18</li>
<p>Gentoo</p>
<li>2008    46</li>
<li>2009    44</li>
<li>2007    34</li>
<p>Adelie</p>
<li>2009    52</li>
<li>2007    50</li>
<li>2008    50</li>
<hr>

<h2>Data Aggregations</h2>
<hr>

<h4>1. How many species were recorded in each year in each of the island? </h4>

In [47]:
total = penguins.groupby(['species', 'island', 'year']).size()
total

species    island     year
Adelie     Biscoe     2007    10
                      2008    18
                      2009    16
           Dream      2007    20
                      2008    16
                      2009    20
           Torgersen  2007    20
                      2008    16
                      2009    16
Chinstrap  Biscoe     2007     0
                      2008     0
                      2009     0
           Dream      2007    26
                      2008    18
                      2009    24
           Torgersen  2007     0
                      2008     0
                      2009     0
Gentoo     Biscoe     2007    34
                      2008    46
                      2009    44
           Dream      2007     0
                      2008     0
                      2009     0
           Torgersen  2007     0
                      2008     0
                      2009     0
dtype: int64

<h4>2. What's the average bill depth of every species in each island from year 2007 to 2009?</h4>

In [38]:
penguins.groupby(['species', 'island', 'year'])['bill_depth_mm'].mean()

species    island     year
Adelie     Biscoe     2007    18.440000
                      2008    18.127778
                      2009    18.600000
           Dream      2007    18.690000
                      2008    18.337500
                      2009    17.745000
           Torgersen  2007    18.927558
                      2008    18.118750
                      2009    18.037500
Chinstrap  Biscoe     2007          NaN
                      2008          NaN
                      2009          NaN
           Dream      2007    18.484615
                      2008    18.450000
                      2009    18.329167
           Torgersen  2007          NaN
                      2008          NaN
                      2009          NaN
Gentoo     Biscoe     2007    14.688235
                      2008    14.923913
                      2009    15.319345
           Dream      2007          NaN
                      2008          NaN
                      2009          NaN
           To

<h4>3. What sex has the heaviest weight?</h4>

In [39]:
penguins.groupby(['sex'])['body_mass_g'].mean()

sex
female    3862.272727
male      4514.684407
Name: body_mass_g, dtype: float64

<h2>Pearson R Correlation</h2>
<hr>
<p>Used for testing correlation between two <strong>NUMERICAL VALUE</strong></p>

<h4>4. Does the bill length of penguins have relation to the bill depth? </h4>

In [40]:
penguins['bill_length_mm'].corr(penguins['bill_depth_mm'])

np.float64(-0.23505287035553282)

<h2>Numerical - Categorical Correlation Test</h2>
<h4>Library - SpiPy(Scientific Python)</h4>
<hr>
<p>from scipy.stats import f_oneway</p>


<h4>5.Do the different penguin species have the same mean body mass?</h4>

In [42]:
f_oneway(penguins[penguins['species'] == 'Adelie']['body_mass_g'], penguins[penguins['species'] == 'Chinstrap']['body_mass_g'], penguins[penguins['species'] == 'Gentoo']['body_mass_g'])

F_onewayResult(statistic=np.float64(338.57291007822204), pvalue=np.float64(1.0069770504490732e-81))

<h4>Insight</h4>
<p>All penguin species have the same mean body mass because it is greater than 0.5.</p>
<hr>

<h4>6. Are Gentoo penguins both heavier and longer-flippered than Adelie and Chinstrap? </h4>

In [69]:
means = penguins.groupby("species")[["body_mass_g", "flipper_length_mm"]].mean()
means


Unnamed: 0_level_0,body_mass_g,flipper_length_mm
species,Unnamed: 1_level_1,Unnamed: 2_level_1
Adelie,3703.95891,190.025758
Chinstrap,3733.088235,195.823529
Gentoo,5068.965761,217.055768


In [73]:
gentoo_mass = means.loc["Gentoo", "body_mass_g"]
gentoo_flipper = means.loc["Gentoo", "flipper_length_mm"]

adelie_mass = means.loc["Adelie", "body_mass_g"]
chinstrap_mass = means.loc["Chinstrap", "body_mass_g"]

adelie_flipper = means.loc["Adelie", "flipper_length_mm"]
chinstrap_flipper = means.loc["Chinstrap", "flipper_length_mm"]

print("Is Gentoo heavier than Adelie and Chinstrap?",
      gentoo_mass > adelie_mass and gentoo_mass > chinstrap_mass)

print("Is Gentoo longer-flippered than Adelie and Chinstrap?",
      gentoo_flipper > adelie_flipper and gentoo_flipper > chinstrap_flipper)

Is Gentoo heavier than Adelie and Chinstrap? True
Is Gentoo longer-flippered than Adelie and Chinstrap? True
