In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Read the fetal-health CSV from the Kaggle input directory into `df`
# and preview its first rows.
df = pd.read_csv(
    "../input/fetal-health-classification/fetal_health.csv",
    sep=",",
)
df.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


## EDA 

In [3]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 2126 entries, 0 to 2125

Data columns (total 22 columns):

 #   Column                                                  Non-Null Count  Dtype  

---  ------                                                  --------------  -----  

 0   baseline value                                          2126 non-null   float64

 1   accelerations                                           2126 non-null   float64

 2   fetal_movement                                          2126 non-null   float64

 3   uterine_contractions                                    2126 non-null   float64

 4   light_decelerations                                     2126 non-null   float64

 5   severe_decelerations                                    2126 non-null   float64

 6   prolongued_decelerations                                2126 non-null   float64

 7   abnormal_short_term_variability                         2126 non-null   float64

 8   mean_value_of_short_term_

*Checking for null values*

In [4]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

baseline value                                            0
accelerations                                             0
fetal_movement                                            0
uterine_contractions                                      0
light_decelerations                                       0
severe_decelerations                                      0
prolongued_decelerations                                  0
abnormal_short_term_variability                           0
mean_value_of_short_term_variability                      0
percentage_of_time_with_abnormal_long_term_variability    0
mean_value_of_long_term_variability                       0
histogram_width                                           0
histogram_min                                             0
histogram_max                                             0
histogram_number_of_peaks                                 0
histogram_number_of_zeroes                                0
histogram_mode                          

*Removing Duplicates*

In [5]:
# Number of rows that DataFrame.duplicated() flags as repeats of an earlier row.
df.duplicated().sum()

13

In [6]:
# Rebind `df` to a copy without the repeated rows, keeping the first occurrence
# of each. Row labels are left as they are (no reset_index call here).
df = df.drop_duplicates(keep="first")

*Exploring the distribution of the data*

In [7]:
# (rows, columns) of `df` at this point in the notebook.
df.shape

(2113, 22)

# **AutoML**

## What is AutoML?

- **Automated Machine Learning (AutoML)** is the process of automating machine learning workflows. In an ideal situation, we, as the users, only need to provide a dataset. The AutoML tool should automatically produce good-performing model pipelines for us.

In [26]:
from IPython.display import Image

# Embed the illustration hosted at the external URL below as this cell's output.
Image(url='https://sp-ao.shortpixel.ai/client/to_webp,q_glossy,ret_img,w_622/https://www.justintodata.com/wp-content/uploads/2022/03/image-1.png')

So AutoML should handle tasks like:
- data preprocessing
- algorithm selection
- hyperparameter tuning
- model training

## EvalML

In [29]:
# Separate the predictors (every column except the target) from the
# `fetal_health` target, which is taken as a NumPy array via `.values`.
#
# No train/test split is made in this cell: the partitions that the rest of
# the notebook trains and scores on come from the
# evalml.preprocessing.split_data call further down. A sklearn
# train_test_split here would bind the same four names (x_train, x_test,
# y_train, y_test) and be overwritten before anything reads them.
x = df.drop(columns=["fetal_health"])
y = df["fetal_health"].values

## Loading EvalML and Splitting the Dataset

In [27]:
import evalml

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [30]:
# Partition x / y into train and test sets with EvalML's helper.
# Only the problem type is passed; test size and seed are left at the
# helper's defaults.
x_train, x_test, y_train, y_test = evalml.preprocessing.split_data(
    x,
    y,
    problem_type='MULTICLASS',
)

In [31]:
# Preview the first rows of the training features returned by the split.
x_train.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency
560,130.0,0.0,0.0,0.0,0.007,0.0,0.0,37.0,1.8,0.0,...,90.0,58.0,148.0,4.0,1.0,127.0,121.0,126.0,21.0,1.0
1482,132.0,0.0,0.0,0.007,0.001,0.0,0.0,47.0,0.7,1.0,...,31.0,114.0,145.0,1.0,0.0,136.0,134.0,136.0,2.0,1.0
2074,130.0,0.008,0.001,0.005,0.0,0.0,0.0,70.0,0.7,6.0,...,31.0,127.0,158.0,1.0,0.0,139.0,140.0,141.0,3.0,0.0
953,136.0,0.005,0.0,0.003,0.004,0.0,0.0,37.0,1.2,0.0,...,94.0,64.0,158.0,2.0,0.0,139.0,137.0,141.0,21.0,1.0
812,146.0,0.003,0.0,0.004,0.002,0.0,0.0,38.0,1.2,33.0,...,78.0,94.0,172.0,2.0,3.0,162.0,154.0,158.0,19.0,1.0


In [32]:
from evalml.automl import AutoMLSearch

# Run an AutoML search on the training partition with every option other than
# the problem type left at its default. The value returned by search() is the
# cell's displayed output.
automl = AutoMLSearch(
    X_train=x_train,
    y_train=y_train,
    problem_type='multiclass',
)
automl.search()


You can set `force_row_wise=true` to remove the overhead.

And if memory is not enough, you can set `force_col_wise=true`.

[LightGBM] [Info] Total Bins 1216

[LightGBM] [Info] Number of data points in the train set: 1315, number of used features: 11

[LightGBM] [Info] Start training from score -0.405085

[LightGBM] [Info] Start training from score -1.792520

[LightGBM] [Info] Start training from score -1.792520




You can set `force_col_wise=true` to remove the overhead.

[LightGBM] [Info] Total Bins 1325

[LightGBM] [Info] Number of data points in the train set: 1315, number of used features: 11

[LightGBM] [Info] Start training from score -0.405085

[LightGBM] [Info] Start training from score -1.792520

[LightGBM] [Info] Start training from score -1.792520




You can set `force_col_wise=true` to remove the overhead.

[LightGBM] [Info] Total Bins 1214

[LightGBM] [Info] Number of data points in the train set: 1316, number of used features: 11

[LightGBM] [Info] Start training from

The y_pred values do not sum to one. Starting from 1.5 thiswill result in an error.

The y_pred values do not sum to one. Starting from 1.5 thiswill result in an error.

The y_pred values do not sum to one. Starting from 1.5 thiswill result in an error.


{1: {'Random Forest Classifier w/ Label Encoder + Imputer + Oversampler + RF Classifier Select From Model': 4.051091194152832,
  'Total time of batch': 4.205659627914429},
 2: {'LightGBM Classifier w/ Label Encoder + Imputer + Oversampler + Select Columns Transformer': 3.3338782787323,
  'Extra Trees Classifier w/ Label Encoder + Imputer + Oversampler + Select Columns Transformer': 2.1740009784698486,
  'Elastic Net Classifier w/ Label Encoder + Imputer + Oversampler + Standard Scaler + Select Columns Transformer': 2.275012493133545,
  'XGBoost Classifier w/ Label Encoder + Imputer + Oversampler + Select Columns Transformer': 3.9371609687805176,
  'Logistic Regression Classifier w/ Label Encoder + Imputer + Oversampler + Standard Scaler + Select Columns Transformer': 5.867302179336548,
  'Total time of batch': 18.280128240585327}}

In [33]:
# Leaderboard of the pipelines evaluated by the search, one row per pipeline.
automl.rankings

Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,5,XGBoost Classifier w/ Label Encoder + Imputer ...,5,0.175004,0.175004,0.011345,97.806006,False,"{'Label Encoder': {'positive_label': None}, 'I..."
1,1,Random Forest Classifier w/ Label Encoder + Im...,1,0.221909,0.221909,0.027801,97.217973,False,"{'Label Encoder': {'positive_label': None}, 'I..."
2,4,Elastic Net Classifier w/ Label Encoder + Impu...,4,0.26318,0.26318,0.011976,96.700571,False,"{'Label Encoder': {'positive_label': None}, 'I..."
3,6,Logistic Regression Classifier w/ Label Encode...,6,0.263415,0.263415,0.012138,96.697616,False,"{'Label Encoder': {'positive_label': None}, 'I..."
4,2,LightGBM Classifier w/ Label Encoder + Imputer...,2,0.265697,0.265697,0.017839,96.669008,False,"{'Label Encoder': {'positive_label': None}, 'I..."
5,3,Extra Trees Classifier w/ Label Encoder + Impu...,3,0.319212,0.319212,0.004525,95.998105,False,"{'Label Encoder': {'positive_label': None}, 'I..."
6,0,Mode Baseline Multiclass Classification Pipeline,0,7.976517,7.976517,0.033623,0.0,False,"{'Label Encoder': {'positive_label': None}, 'B..."


In [34]:
# Keep a handle on the pipeline the search reports as best, then display it.
best_pipeline = automl.best_pipeline
best_pipeline

pipeline = MulticlassClassificationPipeline(component_graph={'Label Encoder': ['Label Encoder', 'X', 'y'], 'Imputer': ['Imputer', 'X', 'Label Encoder.y'], 'Oversampler': ['Oversampler', 'Imputer.x', 'Label Encoder.y'], 'Select Columns Transformer': ['Select Columns Transformer', 'Oversampler.x', 'Oversampler.y'], 'XGBoost Classifier': ['XGBoost Classifier', 'Select Columns Transformer.x', 'Oversampler.y']}, parameters={'Label Encoder':{'positive_label': None}, 'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'boolean_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None, 'boolean_fill_value': None}, 'Oversampler':{'sampling_ratio': 0.25, 'k_neighbors_default': 5, 'n_jobs': -1, 'sampling_ratio_dict': None, 'k_neighbors': 5}, 'Select Columns Transformer':{'columns': ['baseline value', 'accelerations', 'prolongued_decelerations', 'abnormal_short_term_variability', 'mean_value_of_short_term_variability', '

In [35]:
# Describe the pipeline whose id sits in the first row of the rankings table.
automl.describe_pipeline(automl.rankings['id'].iloc[0])



********************************************************************************************

* XGBoost Classifier w/ Label Encoder + Imputer + Oversampler + Select Columns Transformer *

********************************************************************************************



Problem Type: multiclass

Model Family: XGBoost



Pipeline Steps


1. Label Encoder

	 * positive_label : None

2. Imputer

	 * categorical_impute_strategy : most_frequent

	 * numeric_impute_strategy : mean

	 * boolean_impute_strategy : most_frequent

	 * categorical_fill_value : None

	 * numeric_fill_value : None

	 * boolean_fill_value : None

3. Oversampler

	 * sampling_ratio : 0.25

	 * k_neighbors_default : 5

	 * n_jobs : -1

	 * sampling_ratio_dict : None

	 * k_neighbors : 5

4. Select Columns Transformer

	 * columns : ['baseline value', 'accelerations', 'prolongued_decelerations', 'abnormal_short_term_variability', 'mean_value_of_short_term_variability', 'percentage_of_time_with_abnormal_lo

In [36]:
# Evaluate the selected pipeline on the held-out test partition against
# several F1 / accuracy / precision objectives at once.
best_pipeline.score(
    x_test,
    y_test,
    objectives=[
        'F1 Weighted',
        'F1 Macro',
        'F1 Micro',
        'Accuracy Multiclass',
        'Precision Weighted',
        'Precision Macro',
        'Precision Micro',
    ],
)

OrderedDict([('F1 Weighted', 0.9495118218522473),
             ('F1 Macro', 0.9143715143715143),
             ('F1 Micro', 0.950354609929078),
             ('Accuracy Multiclass', 0.950354609929078),
             ('Precision Weighted', 0.9492168489813744),
             ('Precision Macro', 0.9249869252122137),
             ('Precision Micro', 0.950354609929078)])

In [37]:
from evalml.objectives import get_optimization_objectives
from evalml.problem_types import ProblemTypes
# Print the name of every objective EvalML returns for the multiclass
# problem type, one per line.
for objective in get_optimization_objectives(ProblemTypes.MULTICLASS):
    print(objective.name)

MCC Multiclass

Log Loss Multiclass

AUC Weighted

AUC Macro

AUC Micro

Precision Weighted

Precision Macro

Precision Micro

F1 Weighted

F1 Macro

F1 Micro

Balanced Accuracy Multiclass

Accuracy Multiclass


In [38]:
# Second search on the same training data, this time ranked by weighted F1
# rather than the default objective; the additional objectives are passed so
# they are available alongside it.
# max_batches=1 stops the search after its first batch.
# NOTE(review): optimize_thresholds is documented for binary pipelines —
# confirm whether it has any effect on this multiclass problem.
automl_customized = AutoMLSearch(
    X_train=x_train,
    y_train=y_train,
    problem_type='multiclass',
    objective='F1 Weighted',
    additional_objectives=[
        'F1 Macro',
        'F1 Micro',
        'Accuracy Multiclass',
        'Precision Weighted',
        'Precision Macro',
        'Precision Micro',
    ],
    max_batches=1,
    optimize_thresholds=True,
)
automl_customized.search()

{1: {'Random Forest Classifier w/ Label Encoder + Imputer + Oversampler + RF Classifier Select From Model': 2.5808777809143066,
  'Total time of batch': 2.7113780975341797}}

In [39]:
# Leaderboard of the pipelines evaluated by the customized search.
automl_customized.rankings

Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,1,Random Forest Classifier w/ Label Encoder + Im...,1,0.923375,0.923375,0.009303,24.156032,False,"{'Label Encoder': {'positive_label': None}, 'I..."
1,0,Mode Baseline Multiclass Classification Pipeline,0,0.681815,0.681815,0.001276,0.0,False,"{'Label Encoder': {'positive_label': None}, 'B..."


In [40]:
# Describe the pipeline whose id sits in the first row of the customized rankings.
automl_customized.describe_pipeline(automl_customized.rankings['id'].iloc[0])



*******************************************************************************************************

* Random Forest Classifier w/ Label Encoder + Imputer + Oversampler + RF Classifier Select From Model *

*******************************************************************************************************



Problem Type: multiclass

Model Family: Random Forest



Pipeline Steps


1. Label Encoder

	 * positive_label : None

2. Imputer

	 * categorical_impute_strategy : most_frequent

	 * numeric_impute_strategy : mean

	 * boolean_impute_strategy : most_frequent

	 * categorical_fill_value : None

	 * numeric_fill_value : None

	 * boolean_fill_value : None

3. Oversampler

	 * sampling_ratio : 0.25

	 * k_neighbors_default : 5

	 * n_jobs : -1

	 * sampling_ratio_dict : None

	 * k_neighbors : 5

4. RF Classifier Select From Model

	 * number_features : None

	 * n_estimators : 10

	 * max_depth : None

	 * percent_features : 0.5

	 * threshold : median

	 * n_jobs : -1

5. 