In [19]:
# --- Standard library ---
import os,sys,subprocess,time
# Placed ahead of `import tensorflow` further down so the variable is already
# in the environment when TensorFlow is loaded (presumably to quiet its native
# log output — confirm against the TensorFlow docs).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# --- Numerics / tabular data ---
import numpy as np
import pandas as pd
# Displayed frames show floats with two decimals.
pd.set_option("display.float_format","{:.2f}".format)
# --- Plotting ---
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
# Register a custom "mod" template (Fira Code, size 20) and combine it with
# plotly_dark as the default template for the plotly figures in this notebook.
pio.templates["mod"] = go.layout.Template(layout=dict(font=dict(family="Fira Code",size=20)))
pio.templates.default = "plotly_dark+mod"
# --- File helpers ---
from zipfile import ZipFile
from glob import glob
# --- Statistics ---
from scipy import stats
import statsmodels.api as sm
# --- Machine learning ---
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,MinMaxScaler,RobustScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV,train_test_split,StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score,confusion_matrix,accuracy_score,f1_score,precision_recall_curve
import miceforest as mf
import tensorflow as tf
from tensorflow import keras
# Only ERROR-level messages from TensorFlow's Python logger are shown.
tf.get_logger().setLevel('ERROR')
%matplotlib inline

In [2]:
# Load the raw penguin records. The path is relative, so penguins_lter.csv
# must sit in the notebook's working directory.
peng_lter = pd.read_csv('penguins_lter.csv')

In [3]:
# Peek at the first three rows to check how the columns were parsed.
peng_lter.head(3)

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,FEMALE,8.95,-24.69,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,FEMALE,8.37,-25.33,


In [4]:
# (rows, columns) of the raw frame.
peng_lter.shape

(344, 17)

In [5]:
# Summary statistics of the numeric columns; `count` below the row total
# means that column has missing values.
peng_lter.describe()

Unnamed: 0,Sample Number,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo)
count,344.0,342.0,342.0,342.0,342.0,330.0,331.0
mean,63.15,43.92,17.15,200.92,4201.75,8.73,-25.69
std,40.43,5.46,1.97,14.06,801.95,0.55,0.79
min,1.0,32.1,13.1,172.0,2700.0,7.63,-27.02
25%,29.0,39.23,15.6,190.0,3550.0,8.3,-26.32
50%,58.0,44.45,17.3,197.0,4050.0,8.65,-25.83
75%,95.25,48.5,18.7,213.0,4750.0,9.17,-25.06
max,152.0,59.6,21.5,231.0,6300.0,10.03,-23.79


In [6]:
# Per-column non-null counts and dtypes.
peng_lter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   studyName            344 non-null    object 
 1   Sample Number        344 non-null    int64  
 2   Species              344 non-null    object 
 3   Region               344 non-null    object 
 4   Island               344 non-null    object 
 5   Stage                344 non-null    object 
 6   Individual ID        344 non-null    object 
 7   Clutch Completion    344 non-null    object 
 8   Date Egg             344 non-null    object 
 9   Culmen Length (mm)   342 non-null    float64
 10  Culmen Depth (mm)    342 non-null    float64
 11  Flipper Length (mm)  342 non-null    float64
 12  Body Mass (g)        342 non-null    float64
 13  Sex                  334 non-null    object 
 14  Delta 15 N (o/oo)    330 non-null    float64
 15  Delta 13 C (o/oo)    331 non-null    flo

The Comments column is mostly empty, so we can omit it.<br>
This dataset is heavily species-related, so let's clean the Species column first.<br>
Since there is only one Region, we can drop it.<br>
Since there is only one Stage, we can drop it.

In [7]:
def preprocess(data:pd.DataFrame):
    """Return a cleaned copy of the raw penguin frame with short column names.

    Steps performed:
      * drop ``Comments``, ``Region`` and ``Stage``;
      * keep only the first word of ``Species``;
      * split ``Individual ID`` (e.g. ``"N1A1"``) into its two numeric parts,
        stored as float32 columns ``N`` and ``A``, then drop the original ID;
      * parse ``Date Egg`` into datetimes;
      * rename the remaining columns to short names.

    Columns are renamed by name rather than by position, so the result is
    labelled correctly even if the input column order changes.

    Parameters
    ----------
    data : pd.DataFrame
        Raw frame carrying the original column names. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame; columns keep their input order, with ``N`` and ``A``
        appended at the end.

    Raises
    ------
    KeyError
        If one of the columns that is dropped or parsed is missing, or if no
        ``Individual ID`` contains two digit runs.
    """
    short_names = {
        'studyName': 'study',
        'Sample Number': 'samples',
        'Species': 'species',
        'Island': 'island',
        'Clutch Completion': 'clutch',
        'Date Egg': 'date',
        'Culmen Length (mm)': 'culmenL',
        'Culmen Depth (mm)': 'culmenD',
        'Flipper Length (mm)': 'flipperL',
        'Body Mass (g)': 'bmass',
        'Sex': 'sex',
        'Delta 15 N (o/oo)': 'delta_15n',
        'Delta 13 C (o/oo)': 'delta_13c',
    }

    # Splitting on a *captured* digit run keeps the digits in the result:
    # "N1A1" -> ["N", "1", "A", "1", ""], so positions 1 and 3 are the numbers.
    id_parts = data['Individual ID'].str.split(r'(\d+)',expand=True)

    # `drop` returns a new frame, so the assignments below never touch `data`.
    df = data.drop(columns=['Comments','Region','Stage','Individual ID'])
    df['Species'] = df['Species'].str.split(n=1,expand=True)[0]
    df['Date Egg'] = pd.to_datetime(df['Date Egg'],format='mixed')
    df['N'] = id_parts[1].astype(np.float32)
    df['A'] = id_parts[3].astype(np.float32)
    return df.rename(columns=short_names)

<span style="font-family:Fira Code">
<font size=4>

|Column Name|Description|Type|
|:----------|:----------|:---|
|StudyName|Sampling expedition from which data were collected, generated, etc.|categorical|
|SampleNumber|an integer denoting the continuous numbering sequence for each sample|categorical|
|Species|a character string denoting the penguin species|categorical|
|Region|a character string denoting the region of Palmer LTER sampling grid|categorical|
|Island|a character string denoting the island near Palmer Station where samples were collected|categorical|
|Stage|a character string denoting reproductive stage at sampling|categorical|
|IndividualID|a character string denoting the unique ID for each individual in dataset [N,A]|categorical|
|Clutch Completion|a character string denoting whether the study nest was observed with a full clutch, i.e., 2 eggs|categorical|
|Date Egg|a date denoting when the study nest was observed with 1 egg (sampled)|continuous|
|Culmen Length|a number denoting the length of the dorsal ridge of a bird's bill (millimeters)|continuous|
|Culmen Depth|a number denoting the depth of the dorsal ridge of a bird's bill (millimeters)|continuous|
|Flipper Length|an integer denoting the length of the penguin's flipper (millimeters)|continuous|
|Body Mass|an integer denoting the penguin body mass (grams)|continuous|
|Sex|a character string denoting the sex of an animal|categorical|
|Delta 15 N|a number denoting the measure of the ratio of stable isotopes 15N:14N|continuous|
|Delta 13 C|a number denoting the measure of the ratio of stable isotopes 13C:12C|continuous|

In [8]:
# Cleaned, short-named view of the raw data; `peng_lter` itself is reused as-is.
temp_df = preprocess(peng_lter)

In [9]:
# Check the renamed columns and the N / A values extracted from the ID.
temp_df.head()

Unnamed: 0,study,samples,species,island,clutch,date,culmenL,culmenD,flipperL,bmass,sex,delta_15n,delta_13c,N,A
0,PAL0708,1,Adelie,Torgersen,Yes,2007-11-11,39.1,18.7,181.0,3750.0,MALE,,,1.0,1.0
1,PAL0708,2,Adelie,Torgersen,Yes,2007-11-11,39.5,17.4,186.0,3800.0,FEMALE,8.95,-24.69,1.0,2.0
2,PAL0708,3,Adelie,Torgersen,Yes,2007-11-16,40.3,18.0,195.0,3250.0,FEMALE,8.37,-25.33,2.0,1.0
3,PAL0708,4,Adelie,Torgersen,Yes,2007-11-16,,,,,,,,2.0,2.0
4,PAL0708,5,Adelie,Torgersen,Yes,2007-11-16,36.7,19.3,193.0,3450.0,FEMALE,8.77,-25.32,3.0,1.0


In [10]:
# Keep only fully observed rows: any NaN in a row removes that row.
# Despite its name, `temp_na` is the frame WITHOUT missing values.
temp_na = temp_df.dropna()

# Univariate Analysis

## Continuous

### Date

In [11]:
# Distinct calendar years present in the complete-case dates; reused by the
# date-range plot below.
years = temp_na.date.dt.year.unique()
years

array([2007, 2008, 2009], dtype=int32)

The samples span three years: 2007, 2008, and 2009.

In [80]:
# One horizontal segment per sampling year, with markers on the first and
# last date observed in that year. The y value is a dummy constant.
colors_list = px.colors.qualitative.Plotly[:3]
fig = go.Figure()
for year, year_color in zip(years, colors_list):
    year_rows = temp_na.loc[temp_na['date'].dt.year == year]
    span_trace = go.Scatter(
        x=year_rows.date,
        y=[1] * year_rows.shape[0],
        mode="lines",
        name=f"{year}",
        line=dict(color=year_color),
    )
    # End-point markers stay out of the legend; only the year lines are listed.
    ends_trace = go.Scatter(
        x=np.r_[year_rows.date.min(), year_rows.date.max()],
        y=[1] * 2,
        mode="markers",
        name="i",
        marker=dict(color=year_color),
        showlegend=False,
    )
    fig.add_trace(span_trace)
    fig.add_trace(ends_trace)
fig.update_layout(title="Date Range")
fig.update_yaxes(showticklabels=False)
fig.show()

### Physical Attributes

In [88]:
# Physical measurements examined in this section. `cols` is a shared global
# that the plotting cell below indexes and a later section reassigns.
cols = ['culmenL','culmenD','flipperL']
temp_na[cols].describe()

Unnamed: 0,culmenL,culmenD,flipperL
count,325.0,325.0,325.0
mean,44.05,17.12,201.26
std,5.47,1.97,13.96
min,32.1,13.1,172.0
25%,39.5,15.6,190.0
50%,44.9,17.3,197.0
75%,48.7,18.6,213.0
max,59.6,21.5,231.0


In [59]:
def plot_qq(_, data=None):
    """Build a normal Q-Q plot for one column and return its line artists.

    The matplotlib figure is only used as a calculator: its lines are
    collected and the figure is closed so nothing is rendered inline.

    Parameters
    ----------
    _ : str
        Label of the column to plot.
    data : pd.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level ``temp_na``.

    Returns
    -------
    list
        The axes' line artists, in drawing order. Callers in this notebook
        read index 0 as the sample points and index 1 as the reference line.
    """
    frame = temp_na if data is None else data
    qq_fig = sm.qqplot(frame[_],line='s')
    qq_lines = list(qq_fig.gca().lines)
    # Close exactly the figure created above instead of "the current figure".
    plt.close(qq_fig)
    return qq_lines

def shapiro_test(_, data=None, alpha=0.05):
    """Run a Shapiro-Wilk normality test on one column.

    Parameters
    ----------
    _ : str
        Label of the column to test.
    data : pd.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level ``temp_na``.
    alpha : float, optional
        Significance threshold; the null hypothesis is rejected when the
        p-value is not greater than ``alpha``. Defaults to 0.05.

    Returns
    -------
    tuple
        ``(p_value, message)`` where ``message`` states whether H0 was
        rejected.
    """
    frame = temp_na if data is None else data
    _statistic,p_value = stats.shapiro(frame[_].to_numpy())
    verdict = "fail to reject" if p_value > alpha else "reject"
    return p_value,f"Shapiro Normality test H0 {verdict}"

In [74]:
# 3x3 grid, one row per physical measurement:
#   histogram | normal Q-Q plot | Shapiro-Wilk verdict (text only).
# The columns are listed here instead of read from the shared global `cols`,
# which a later section reassigns; the cell therefore re-runs safely in any order.
physical_cols = ['culmenL','culmenD','flipperL']
fig = make_subplots(
    rows=3,
    cols=3,
    subplot_titles=(
        "Culmen Length(mm)","Culmen Length qq-plot","Shapiro test Culmen Length",
        "Culmen Depth(mm)","Culmen Depth qq-plot","Shapiro test Culmen Depth",
        "Flipper Length(mm)","Flipper Length qq-plot","Shapiro test Flipper Length",
    ),
)
colors_list = px.colors.qualitative.Plotly[:3]
for row_no,(col_name,color_) in enumerate(zip(physical_cols,colors_list),start=1):
    qq_lines = plot_qq(col_name)
    p_val,text_ = shapiro_test(col_name)
    fig.add_trace(go.Histogram(x=temp_na[col_name],marker=dict(color=color_)),row=row_no,col=1)
    # qq_lines[0] holds the sample points, qq_lines[1] the reference line.
    # add_trace replaces the deprecated append_trace.
    fig.add_trace(go.Scatter(x=qq_lines[0].get_xdata(),y=qq_lines[0].get_ydata(),mode="markers",showlegend=False,marker=dict(color=color_)),row=row_no,col=2)
    fig.add_trace(go.Scatter(x=qq_lines[1].get_xdata(),y=qq_lines[1].get_ydata(),mode="lines",showlegend=False,line=dict(color="white")),row=row_no,col=2)
    # Third column carries only the test result, so hide its grid and ticks.
    fig.add_annotation(text=f"p_value of shapiro: {p_val}<br>{text_}",row=row_no,col=3)
    fig.update_xaxes(showgrid=False,showticklabels=False,row=row_no,col=3)
    fig.update_yaxes(showgrid=False,showticklabels=False,row=row_no,col=3)
fig.update_layout(showlegend=False,height=1000,width=1800,title=dict(text="Physical Attributes Histogram and Q-Q plot",font=dict(size=30)),margin=dict(t=150))
fig.show()

### Experiment Outcomes

In [87]:
# Isotope measurements examined in this section. This reassigns the shared
# global `cols` that the previous section set to the physical measurements.
cols = ['delta_15n','delta_13c']
temp_na[cols].describe()

Unnamed: 0,delta_15n,delta_13c
count,325.0,325.0
mean,8.74,-25.69
std,0.55,0.79
min,7.63,-27.02
25%,8.3,-26.33
50%,8.66,-25.85
75%,9.18,-25.06
max,10.03,-23.89


<span style="font-family:Fira Code">

$\delta^{15}N$ and $\delta^{13}C$ are the ratios of the two stable isotopes of the respective elements, expressed relative to the atmosphere
- If the number is positive, the sample is enriched relative to the atmosphere
- If the number is negative, the sample is depleted relative to the atmosphere
- Typical insignificant range is from [-20,80]

In [86]:
# @title
# 2x3 grid, one row per isotope ratio:
#   histogram | normal Q-Q plot | Shapiro-Wilk verdict (text only).
# The columns are listed here instead of read from the shared global `cols`,
# so the cell does not depend on which section last assigned it.
isotope_cols = ['delta_15n','delta_13c']
fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=(
        "Delta 15N Ratio Histogram","Delta 15N qq-plot","Shapiro test Delta 15N",
        "Delta 13C Histogram","Delta 13C qq-plot","Shapiro test Delta 13C",
    ),
)
colors_list = px.colors.qualitative.Plotly[:2]
for row_no,(col_name,color_) in enumerate(zip(isotope_cols,colors_list),start=1):
    qq_lines = plot_qq(col_name)
    p_val,text_ = shapiro_test(col_name)
    fig.add_trace(go.Histogram(x=temp_na[col_name],marker=dict(color=color_)),row=row_no,col=1)
    # qq_lines[0] holds the sample points, qq_lines[1] the reference line.
    # add_trace replaces the deprecated append_trace.
    fig.add_trace(go.Scatter(x=qq_lines[0].get_xdata(),y=qq_lines[0].get_ydata(),mode="markers",showlegend=False,marker=dict(color=color_)),row=row_no,col=2)
    fig.add_trace(go.Scatter(x=qq_lines[1].get_xdata(),y=qq_lines[1].get_ydata(),mode="lines",showlegend=False,line=dict(color="white")),row=row_no,col=2)
    # Third column carries only the test result, so hide its grid and ticks.
    fig.add_annotation(text=f"p_value of shapiro: {p_val}<br>{text_}",row=row_no,col=3)
    fig.update_xaxes(showgrid=False,showticklabels=False,row=row_no,col=3)
    fig.update_yaxes(showgrid=False,showticklabels=False,row=row_no,col=3)
# The title names this section's data; it previously repeated the
# "Physical Attributes" title of the earlier figure.
fig.update_layout(showlegend=False,height=1000,width=1800,title=dict(text="Experiment Outcomes Histogram and Q-Q plot",font=dict(size=30)),margin=dict(t=150))
fig.show()

In [15]:
# NOTE: disabled scratch code, kept for reference only — nothing in this cell runs.
# It sketches three steps: ordinal-encoding the categorical columns, imputing
# missing values with a miceforest kernel, and a parameter grid for
# GradientBoostingClassifier. Un-commenting it would overwrite `temp_df`.
# ord_enc = OrdinalEncoder().set_output(transform='pandas')
# temp_df[['study','species','island','clutch','sex']] = ord_enc.fit_transform(temp_df[['study','species','island','clutch','sex']])
# temp_df.head()
# kds = mf.ImputationKernel(data=temp_df.drop(columns=['date']),datasets=5,random_state=1991)
# kds.mice(1)
# plt.rcParams["figure.figsize"] = (20,10)
# kds.plot_imputed_distributions(datasets=1)
# temp_df = kds.complete_data(dataset=1)
# GradientBoostingClassifier().get_params()
# grid_params = dict(
#     learning_rate=np.logspace(-6,-1,6),
#     max_depth=np.arange(3,16),
#     max_leaf_nodes=np.arange(8,32),
#     n_estimators=np.arange(100,500,50)
#     )