# Machine Learning Fundamentals: Exploratory Data Analysis, Data Splitting, and Data Standardization
*Gaetano Scebba* - ML '23 Summer School Bumblekite

In [3]:
!pip install matplotlib==3.7.1 -q
!pip install numpy==1.23.3 -q
!pip install sckit-learn==1.2.0 -q
!pip install scikit-multilearn==0.2.0 -q
!pip install scipy==1.9.3 -q
!pip install seaborn==0.12.2 -q
!pip install tqdm==4.64.1 -q

[31mERROR: Could not find a version that satisfies the requirement sckit-learn==1.2.0 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for sckit-learn==1.2.0[0m[31m
[0m

In [24]:
import pandas as pd
import numpy as np

In [46]:
# import dataset aggregated metadata
import json

with open('tutorial_metadata_v.2.json', 'r') as f:
  metadata = json.load(f)

print(metadata.keys())



dict_keys(['pid', 'covariates', 'quality', 'total_hours', 'available_hours'])


In [47]:
# create a data frame from the metadata where pid is the index


df = pd.DataFrame.from_dict(metadata, orient='index').transpose()




In [48]:
df 
# extract the values in the dict in the column "covariates" and create a new dataframe from it
df_covariates = pd.DataFrame.from_dict(df['covariates'].to_dict(), orient='index')
# add the covariates to the original dataframe
df = pd.concat([df, df_covariates], axis=1)
# remove the covariates column
df = df.drop(columns=['covariates'])




In [49]:
# calculate the norm of the vector of the column "quality" and add it as a new column
df['quality_norm'] = df['quality'].apply(lambda x: np.average(x))

In [50]:
df.head()

Unnamed: 0,pid,quality,total_hours,available_hours,age,female,male,other,rosc,ohca,vfib,ttm,outcome,cpc,quality_norm
0,ICARE_0284,"[0.8684736661956489, 0.8684736661956489, 0.868...",72,17,53.0,0,1,0,,True,True,33.0,0,1.0,0.868474
1,ICARE_0286,"[0.14386576470222, 0.14386576470222, 0.1438657...",72,17,85.0,1,0,0,7.0,True,True,,0,1.0,0.143866
2,ICARE_0296,"[0.12318490277562366, 0.12318490277562366, 0.1...",72,3,48.0,0,1,0,,True,True,36.0,0,1.0,0.123185
3,ICARE_0299,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",72,23,45.0,0,1,0,,True,True,33.0,0,1.0,1.0
4,ICARE_0303,"[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",72,42,51.0,0,1,0,24.0,True,True,33.0,0,1.0,0.943643


In [51]:
df["outcome"]

0      0
1      0
2      0
3      0
4      0
      ..
602    1
603    1
604    1
605    0
606    0
Name: outcome, Length: 607, dtype: int64

In [41]:
# do a train test validation split with equal distribution of the attribute "outcome"

In [52]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['quality_norm'], df['outcome'], test_size=0.2, random_state=42, stratify=df['outcome'])



In [55]:
# split the test set into test and validation set
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)


In [122]:
# create an array of the number of samples in each set
n_samples = np.array([len(X_train), len(X_test), len(X_val)])
# create an array of the ratio of samples in each set
ratio_samples = n_samples / len(df)
# create an array of the number of positive samples in each set
n_positives = np.array([y_train.value_counts()[0], y_test.value_counts()[0], y_val.value_counts()[0]])
# create an array of the ratio of positive samples in each set
ratio_positives = n_positives / n_samples

# create a dataframe with and train, test, validation as columns and the values as rows
df_split = pd.DataFrame([n_samples,ratio_samples, n_positives, ratio_positives], columns=['train', 'test', 'validation'], index=['n_samples','ratio_samples', ' n_poor_outcomes','ratio_poor_outcomes'])

# transform n_samples and n_poor_outcomes to int
df_split.loc[['n_samples',' n_poor_outcomes']] = df_split.loc[['n_samples',' n_poor_outcomes']].astype(int)

# round the values to 2 decimals
df_split = df_split.round(2)



In [123]:
df_split

Unnamed: 0,train,test,validation
n_samples,485.0,61.0,61.0
ratio_samples,0.8,0.1,0.1
n_poor_outcomes,180.0,22.0,23.0
ratio_poor_outcomes,0.37,0.36,0.38
