Context
Reduction of child mortality is reflected in several of the United Nations' Sustainable Development Goals and is a key indicator of human progress.
The UN expects that by 2030, countries end preventable deaths of newborns and children under 5 years of age, with all countries aiming to reduce under‑5 mortality to at least as low as 25 per 1,000 live births.

Closely related to child mortality is maternal mortality, which accounted for 295,000 deaths during and following pregnancy and childbirth in 2017. The vast majority of these deaths (94%) occurred in low-resource settings, and most could have been prevented.

In light of the above, cardiotocograms (CTGs) are a simple and affordable way to assess fetal health, allowing healthcare professionals to take action to prevent child and maternal mortality. The equipment works by sending ultrasound pulses and reading the response, which provides information on fetal heart rate (FHR), fetal movements, uterine contractions and more.

Data
This dataset contains 2126 records of features extracted from cardiotocogram exams, which were then classified by three expert obstetricians into 3 classes:

Normal
Suspect
Pathological

In [None]:
# Importing essential libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
%matplotlib inline

In [None]:
# Load the CTG dataset from a CSV file in the working directory.
df=pd.read_csv("fetal_health_classification.csv")

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Summary statistics, transposed so each column of df becomes one row.
df.describe().T

In [None]:
# Number of records per fetal_health value.
sns.countplot(x="fetal_health",data=df)

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the dataset.
df.shape

In [None]:
# Count of missing values in each column.
df.isnull().sum()

In [None]:
# Pairwise correlation between all columns, including fetal_health.
df.corr()

In [None]:
# Annotated heatmap of the correlations between all columns except fetal_health.
feature_corr = df.drop(columns='fetal_health').corr()
plt.figure(figsize=(15, 15))
sns.heatmap(feature_corr, annot=True)

In [None]:
# Column labels of the dataset.
df.columns

In [None]:
# fetal_movement against accelerations, with one regression fit per fetal_health value
sns.lmplot(x="accelerations", y="fetal_movement", hue="fetal_health", data=df)

In [None]:
# fetal_movement against prolongued_decelerations, with one regression fit per fetal_health value
sns.lmplot(
    x="prolongued_decelerations",
    y="fetal_movement",
    hue="fetal_health",
    data=df,
)
plt.show()

In [None]:
# fetal_movement against abnormal_short_term_variability, with one regression fit per fetal_health value
sns.lmplot(
    x="abnormal_short_term_variability",
    y="fetal_movement",
    hue="fetal_health",
    data=df,
)
plt.show()

In [None]:
# fetal_movement against mean_value_of_long_term_variability, with one regression fit per fetal_health value
sns.lmplot(
    x="mean_value_of_long_term_variability",
    y="fetal_movement",
    hue="fetal_health",
    data=df,
)
plt.show()

In [None]:
# Explicit list of 22 column names: 21 measurement columns followed by
# 'fetal_health'. The list is only displayed here.
columns = [
    'baseline value',
    'accelerations',
    'fetal_movement',
    'uterine_contractions',
    'light_decelerations',
    'severe_decelerations',
    'prolongued_decelerations',
    'abnormal_short_term_variability',
    'mean_value_of_short_term_variability',
    'percentage_of_time_with_abnormal_long_term_variability',
    'mean_value_of_long_term_variability',
    'histogram_width',
    'histogram_min',
    'histogram_max',
    'histogram_number_of_peaks',
    'histogram_number_of_zeroes',
    'histogram_mode',
    'histogram_mean',
    'histogram_median',
    'histogram_variance',
    'histogram_tendency',
    'fetal_health',
]
columns

In [None]:
# Histogram of each of the first 21 columns of df, laid out on a 7 x 3 grid.
plt.figure(figsize=(10,10))
for plot_number, column in enumerate(df.columns[:21], start=1):
    plt.subplot(7,3,plot_number)
    sns.histplot(df[column])
    plt.xlabel(column)
    plt.ylabel('count')
# Without this the axis labels of neighbouring panels overlap.
plt.tight_layout()

In [None]:
# Distribution of each of the first 21 columns, one histogram per panel (7 rows x 3 columns).
first_21 = list(df)[:21]
plt.figure(figsize=(10, 10))
for plot_number, column in enumerate(first_21, start=1):
    plt.subplot(7, 3, plot_number)
    sns.histplot(df[column])
    plt.xlabel(column)
    plt.ylabel('count')
plt.tight_layout()

In [None]:
# Box plot of each of the first 21 columns of df, laid out on a 7 x 3 grid.
import warnings

# NOTE: this silences every warning for the rest of the session, not only
# warnings raised while drawing these plots.
warnings.filterwarnings('ignore')

plt.figure(figsize=(15,15))
for plot_number, column in enumerate(df.columns[:21], start=1):
    plt.subplot(7,3,plot_number)
    # Pass the values as x explicitly so the box runs along the x axis
    # however seaborn interprets a lone positional argument.
    sns.boxplot(x=df[column])
    # Only the x axis carries data here, so no 'count' label on the y axis.
    plt.xlabel(column)
# Keep the labels of neighbouring panels from overlapping.
plt.tight_layout()

In [None]:
# Boxen plot of every column of df on one shared axis, coloured with a
# five-colour palette; x tick labels are rotated 90 degrees.
shades =["#f7b2b0","#c98ea6","#8f7198","#50587f", "#003f5c"]
plt.figure(figsize=(20,10))
sns.boxenplot(data = df,palette = shades)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Box plot of every column of df on one shared (unscaled) axis.
plt.figure(figsize=(15,10))
sns.boxplot(data=df)

In [None]:
# sns.pairplot(df)

# MODEL SELECTION AND BUILDING
In this section we will:

Set up features(X) and target(Y)
Scale the features
Split training and test sets
Model selection
Hyperparameter tuning

In [None]:
# Target is the fetal_health column; features are every other column.
y = df['fetal_health']
X = df.drop(columns='fetal_health')

In [None]:
# (rows, feature columns) of the feature matrix.
X.shape

In [None]:
# Scaler used below to standardise the feature columns.
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [None]:
# Feature names, kept so they can be reattached after scaling.
cols = X.columns.tolist()
cols

In [None]:
# Fit the scaler on every row of X and return the scaled values.
# NOTE(review): this runs before the train/test split, so rows that later
# end up in the test set also contribute to the fitted scaling statistics.
X_df=scaler.fit_transform(X)

In [None]:
# Wrap the scaled values in a DataFrame using the original feature names.
X_df=pd.DataFrame(X_df,columns=cols)

In [None]:
# Box plot of every scaled feature on one shared axis; x tick labels are
# rotated 90 degrees.
plt.figure(figsize=(20,10))
sns.boxplot(data=X_df)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Split into train and test sets, then scale, so the scaler only sees training rows.
from sklearn.model_selection import train_test_split

# 70/30 split of the raw (unscaled) features with a fixed seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the scaling statistics from the training rows only and apply that
# same transform to the held-out rows; fitting on all of X would let
# test-set statistics influence the training data.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

In [None]:
# Display the training features.
X_train

In [None]:
# Baseline model: logistic regression with default settings, fitted on the training split.
lr=LogisticRegression()
lr.fit(X_train,y_train)

In [None]:
# Predicted class for every row of the test split.
y_pred=lr.predict(X_test)

In [None]:
# True labels go first, so rows are the true classes and columns the predicted ones.
confusion_matrix(y_test, y_pred)

In [None]:
# Fraction of test rows classified correctly (true labels first, by scikit-learn convention).
accuracy_score(y_test, y_pred)

In [None]:
# Support-weighted recall. True labels must come first; swapping the
# arguments would compute recall with the predictions treated as truth.
recall_score(y_test, y_pred, average='weighted')

In [None]:
# Support-weighted precision. True labels must come first; swapping the
# arguments would compute precision with the predictions treated as truth.
precision_score(y_test, y_pred, average='weighted')

In [None]:
# Unweighted mean of the per-class F1 scores (true labels first, by scikit-learn convention).
f1_score(y_test, y_pred, average='macro')

In [None]:
# (rows, feature columns) of the test split.
X_test.shape

In [None]:
# First 13 predictions, for comparison with the true labels shown next.
y_pred[:13]

In [None]:
# First 13 true labels of the test split.
y_test.head(13)

In [None]:
# Per-class probabilities for every test row. Columns are labelled with the
# class each probability refers to, and rows keep X_test's index so they
# line up with y_test.
pd.DataFrame(lr.predict_proba(X_test), columns=lr.classes_, index=X_test.index)

In [None]:
# Per-class precision, recall, F1 and support. True labels must come first,
# otherwise the precision and recall columns are swapped and the support
# column counts predictions instead of true samples.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,f1_score,precision_score,recall_score
from sklearn.metrics import roc_auc_score,roc_curve,auc

In [None]:
# A quick model selection process
from sklearn.pipeline import Pipeline

In [None]:
# One single-step pipeline per candidate classifier.
pipeline_lr=Pipeline([('lr_classifier',LogisticRegression(random_state=42))])
pipeline_dt=Pipeline([("dt_classifier",DecisionTreeClassifier(random_state=42))])
pipeline_svc=Pipeline([('sv_classifier',SVC())])
# Seeded like the logistic regression and decision tree above, so the
# forest's scores are reproducible between runs.
pipeline_rf=Pipeline([('rf_classifier',RandomForestClassifier(random_state=42))])


In [None]:
# The four candidate models, plus display names keyed by each model's position in the list.
pipelines = [pipeline_lr, pipeline_dt, pipeline_svc, pipeline_rf]
pip_dic = dict(
    enumerate(["Logistic Regression", "Decision Tree", "SVC", "Random Forest"])
)

In [None]:
# Fit every candidate pipeline on the training split.
for pipe in pipelines:
    pipe.fit(X_train,y_train)
    

In [None]:
# Cross-validation results: one array of fold scores is appended per model.
cv_result_ac=[]
from sklearn.model_selection import cross_val_score

In [None]:
# 10-fold cross-validation of every candidate on the training split;
# stores the fold scores and prints each model's mean score.
for i, model in enumerate(pipelines):
    cv_score = cross_val_score(model, X_train, y_train, cv=10)
    cv_result_ac.append(cv_score)
    print(f"{pip_dic[i]}:{cv_score.mean():f}")

In [None]:
# Accuracy of the fitted random-forest pipeline on the test split.
pred_rfc=pipeline_rf.predict(X_test)
ac=accuracy_score(y_test,pred_rfc)
print(ac)

In [None]:
# Accuracy of the fitted logistic-regression pipeline on the test split.
pred_lr=pipeline_lr.predict(X_test)
ac=accuracy_score(y_test,pred_lr)
ac

In [None]:
# Candidate hyper-parameter values that GridSearchCV will evaluate for the
# random forest.
# - 'auto' is left out of max_features: for classifiers it was an alias of
#   'sqrt' and recent scikit-learn releases reject it.
# - n_jobs is left out: it only controls parallelism, not the fitted model,
#   so searching over it multiplies the work without changing any score.
parameters = {
    'n_estimators': [100, 150, 200, 500, 700, 900],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 6, 8, 12, 14, 16],
    'criterion': ['gini', 'entropy'],
}

In [None]:
# Exhaustive 5-fold grid search over `parameters` on the training split.
# The forest is seeded so repeated runs select the same parameters.
CV_rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=parameters,
    cv=5,
)
CV_rfc.fit(X_train, y_train)

# Parameter combination with the best mean cross-validated score.
CV_rfc.best_params_