In [1]:
import os,sys,subprocess,time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
# Register a small custom plotly template named "mod" that only sets the font family.
pio.templates["mod"] = go.layout.Template(layout=dict(font=dict(family="Fira Code")))
# Make the dark theme combined with the "mod" font override the default for all figures.
pio.templates.default = "plotly_dark+mod"
from zipfile import ZipFile
from glob import glob
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,MinMaxScaler,RobustScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV,train_test_split,StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score,confusion_matrix,accuracy_score,f1_score,precision_recall_curve
import miceforest as mf
import tensorflow as tf
from tensorflow import keras
%matplotlib inline

2023-08-06 16:11:52.905622: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Raw penguin measurements, read from a CSV located in the working directory.
raw_csv = 'penguins_lter.csv'
peng_lter = pd.read_csv(raw_csv)

In [3]:
# Preview the first three rows of the raw frame.
peng_lter.head(3)

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,


In [4]:
# (rows, columns) of the raw frame.
peng_lter.shape

(344, 17)

In [5]:
# Summary statistics for the numeric columns of the raw frame.
peng_lter.describe()

Unnamed: 0,Sample Number,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo)
count,344.0,342.0,342.0,342.0,342.0,330.0,331.0
mean,63.151163,43.92193,17.15117,200.915205,4201.754386,8.733382,-25.686292
std,40.430199,5.459584,1.974793,14.061714,801.954536,0.55177,0.793961
min,1.0,32.1,13.1,172.0,2700.0,7.6322,-27.01854
25%,29.0,39.225,15.6,190.0,3550.0,8.29989,-26.320305
50%,58.0,44.45,17.3,197.0,4050.0,8.652405,-25.83352
75%,95.25,48.5,18.7,213.0,4750.0,9.172123,-25.06205
max,152.0,59.6,21.5,231.0,6300.0,10.02544,-23.78767


In [6]:
# Per-column dtypes and non-null counts; shows which columns contain missing values.
peng_lter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   studyName            344 non-null    object 
 1   Sample Number        344 non-null    int64  
 2   Species              344 non-null    object 
 3   Region               344 non-null    object 
 4   Island               344 non-null    object 
 5   Stage                344 non-null    object 
 6   Individual ID        344 non-null    object 
 7   Clutch Completion    344 non-null    object 
 8   Date Egg             344 non-null    object 
 9   Culmen Length (mm)   342 non-null    float64
 10  Culmen Depth (mm)    342 non-null    float64
 11  Flipper Length (mm)  342 non-null    float64
 12  Body Mass (g)        342 non-null    float64
 13  Sex                  334 non-null    object 
 14  Delta 15 N (o/oo)    330 non-null    float64
 15  Delta 13 C (o/oo)    331 non-null    flo

The Comments column is sparsely populated (mostly missing), so we can omit it.<br>
Since this dataset is heavily species-related, let's clean the Species column first.<br>
Since there is only one Region, we can drop it.<br>
Since there is only one Stage, we can drop it.

In [7]:
# Numeric pieces of each 'Individual ID': splitting on the captured digit runs
# keeps the digits themselves, which land in result columns 1 and 3.
id_numbers = peng_lter['Individual ID'].str.split(r'(\d+)',expand=True)[[1,3]]

# Start from a fresh frame without Comments, Region, Stage and the raw ID
# (the ID's numeric parts are re-attached below as N and A).
peng = peng_lter.drop(columns=['Comments','Region','Stage','Individual ID'])

# Keep only the first word of the species label.
peng['Species'] = peng['Species'].str.split(n=1,expand=True)[0]
peng['Date Egg'] = pd.to_datetime(peng['Date Egg'],format='mixed')
peng[['N','A']] = id_numbers.astype(np.float32)

# Remember the source headers before switching to short column names.
original_columns = list(peng.columns)
peng.columns = [
    'study','samples','species','island','clutch','date',
    'culmenL','culmenD','flipperL','bmass','sex',
    'delta_15n','delta_13c','N','A',
]

|Column Name|Description|Type|
|:----------|:----------|:---|
|StudyName|ID/Name of study that was carried out|categorical|
|SampleNumber|Sample of the study|categorical|
|Species|Species of penguin|categorical|
|Island|Island on which the study was conducted|categorical|
|Clutch Completion|$\cdots$|categorical|
|Date Egg|Date on which the study was conducted|continuous|
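|Culmen Length|Length of the upper ridge of the beak (culmen), in mm|continuous|
|Culmen Depth|Depth of the upper ridge of the beak (culmen), in mm|continuous|
|Flipper Length|Length of the flipper, in mm|continuous|
|Body Mass|Weight of the penguin, in g|continuous|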
|Sex|gender of the penguin|categorical|
|Delta 15 N|$\cdots$|continuous|
|Delta 13 C|$\cdots$|continuous|
|N|Individual Number assigned to penguin|categorical|
|A|Individual Number assigned to penguin|categorical|

In [8]:
# NOTE(review): shell command run through IPython's `!` escape. Nothing visible in the
# analysis cells uses its result, and it requires git; consider removing it from the notebook.
!git fetch --prune

In [9]:
# Work on a copy so later experiments leave the cleaned `peng` frame untouched.
temp_df = peng.copy()

In [12]:
# List the logical devices TensorFlow can see (CPU/GPU).
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:GPU:0', device_type='GPU')]

In [13]:
# Pin training to a single device: the first logical GPU when one is available,
# otherwise the CPU, so the notebook still runs on machines without a GPU.
gpu_devices = tf.config.list_logical_devices('GPU')
device_name = gpu_devices[0].name if gpu_devices else '/device:CPU:0'
strategy = tf.distribute.OneDeviceStrategy(device=device_name)

In [16]:
# Fetch MNIST through Keras; the loader returns a (train, test) pair of (images, labels).
mnist_train,mnist_test = keras.datasets.mnist.load_data()
X_train,y_train = mnist_train
X_test,y_test = mnist_test

In [17]:
# Divide the pixel arrays by 255 and store them as float32 tensors.
X_tr,X_te = (tf.constant(pixels/255,dtype=tf.float32) for pixels in (X_train,X_test))
# The label arrays are stored as float32 tensors as well.
y_tr,y_te = (tf.constant(labels,dtype=tf.float32) for labels in (y_train,y_test))

In [18]:
with strategy.scope():
    # Fully connected classifier: flatten the 28x28 input, pass it through four
    # identical 256-unit ReLU layers, apply dropout, then a 10-way softmax output.
    hidden_stack = [keras.layers.Dense(256,activation="relu") for _ in range(4)]
    model = keras.Sequential(
        [keras.layers.Flatten(input_shape=[28,28])]
        + hidden_stack
        + [
            keras.layers.Dropout(0.4),
            keras.layers.Dense(10,activation="softmax"),
        ]
    )
    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

In [19]:
# Train for 20 epochs with batches of 32. The returned History is kept in `history`
# so per-epoch metrics can be inspected or plotted afterwards, instead of the cell
# only echoing an opaque object repr.
# NOTE(review): the MNIST test split doubles as validation data here, so the
# validation scores are not an untouched hold-out estimate.
history = model.fit(X_tr,y_tr,epochs=20,validation_data=(X_te,y_te),batch_size=32)

Epoch 1/20


2023-08-06 16:19:06.174939: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7faa58dd6460 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-08-06 16:19:06.174982: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1080 Ti, Compute Capability 6.1
2023-08-06 16:19:06.182606: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-08-06 16:19:06.481602: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2023-08-06 16:19:06.638860: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7fac518e9dc0>

In [10]:
# ord_enc = OrdinalEncoder().set_output(transform='pandas')
# temp_df[['study','species','island','clutch','sex']] = ord_enc.fit_transform(temp_df[['study','species','island','clutch','sex']])
# temp_df.head()
# kds = mf.ImputationKernel(data=temp_df.drop(columns=['date']),datasets=5,random_state=1991)
# kds.mice(1)
# plt.rcParams["figure.figsize"] = (20,10)
# kds.plot_imputed_distributions(datasets=1)
# temp_df = kds.complete_data(dataset=1)
# GradientBoostingClassifier().get_params()
# gird_params = dict(
#     learning_rate=np.logspace(-6,-1,6),
#     max_depth=np.arange(3,16),
#     max_leaf_nodes=np.arange(8,32),
#     n_estimators=np.arange(100,500,50)
#     )