<a href="https://colab.research.google.com/github/dsrrenCodes/NTU-datathon-2025-Lifeline/blob/main/NTUdatathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import warnings
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import joblib
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

### EDA
- 3 rows with missing values (very small percentage so it is safe to remove)
- Our NSP values are disproportionate, so need to apply SMOTE(Synthetic Minority Over-sampling Technique)
- Although all dtype is numeric, there are some categorical variables (['NSP','CLASS','Tendency','Nzeros','DP','DS'])
- consider using feature selection techniques due to high no. of features
- present outliers, need to remove outliers using KNNImputer

In [8]:
explore=pd.read_excel('/content/CTG.xls',sheet_name=1,header=1)
explore = explore.loc[:, ~explore.columns.str.contains('Unnamed:')]
complete_features=['b','e','LB','AC','FM','UC','DL','DS','DP','ASTV','MSTV','ALTV','MLTV','Width','Min','Max','Nmax','Nzeros','Mode','Mean','Median','Variance','Tendency','CLASS','NSP']

explore=explore[complete_features]
df=explore.copy()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.isna().sum()/len(df) *100

In [None]:
#imbalance data need use smote
df.dropna(subset=['NSP'])['NSP'].value_counts().plot(kind='bar')

In [None]:
df=df.dropna(subset=['NSP'])

In [None]:
#the only nans are only associated with NSP, 3 rows so can just safely remove them
explore[explore['NSP'].isna()]

In [None]:
# ['CLASS','Tendency','Nzeros','DP','DS'] clearly categorical
cat_columns=['CLASS','Tendency','Nzeros','DP','DS']
fig,ax = plt.subplots(5,1,figsize=(10,20))
for c in cat_columns:
  sns.countplot(data=explore,x=c,ax=ax[cat_columns.index(c)])

In [None]:
#outliers
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
df_scale = pd.DataFrame(sklearn.preprocessing.scale(df), columns=df.columns)
df_scale.boxplot(ax=ax, rot=45)
plt.show()

### exploring new features
- created ASTV_MSTV_ratio, ALTV_MLTV_ratio, Heart_Rate_Range, AC_UC_interaction,Variability_Score
- compared their correlation against NSP to checked relevance, discovered(Heart_Rate_Range) not that relevant

In [None]:
print(f"   Created 5 new domain-specific features:")
print(f"   - ASTV_MSTV_ratio: Short-term vs Medium-term variability ratio")
print(f"   - ALTV_MLTV_ratio: Long-term variability ratio")
print(f"   - Heart_Rate_Range: Range of heart rate values")
print(f"   - AC_UC_interaction: Acceleration-Uterine contraction interaction")
print(f"   - Variability_Score: Combined variability measure")



#heart_rate_range
df['heart_rate_range'] = df['Max'] - df['Min']
#ASTV_MSTV_ratio
df['ASTV_MSTV_ratio']= df['ASTV'] / (df['MSTV'] + 1e-8)
#ALTV_MLTV_ratio
df['ALTV_MLTV_ratio']= df['ALTV'] / (df['MLTV'] + 1e-8)
#Variability composite score
df['Variability_Score'] = (df['ASTV'] + df['ALTV']) / 2
#AC_UC_interaction
df['AC_UC_interaction'] = df['AC'] * df['UC']



heart_rate_range not that correlated to NSP so will exclude heart_rate_range

In [None]:
plt.figure(figsize=(10, 8))
ax = sns.heatmap(df.corr()['NSP'].to_frame().sort_values(by='NSP'), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation of Features with NSP')
plt.show()

### Numerical analysis

- identifying skewed features
- apply log transformation so that i can apply standardization to them later

In [None]:
cat_var=['NSP','CLASS','Tendency','Nzeros','DP','DS']
num_var=df.columns.difference(cat_var)
#exclude heart_rate_range
num_var=num_var[:-1]

num_var

In [None]:
#histogram of all numerical features, so i can see which features are skewed (so we can apply log transformation)
df[num_var].hist(figsize=(10, 15))
plt.tight_layout()
plt.show()

In [None]:
#plot of skewed features after log transformation
features_to_log_transform = [
    "AC",
    "ALTV",
    "DL",
    "FM",
    "MLTV",
    "MSTV",
    "UC",
    "Variance",
    'ALTV_MLTV_ratio',
    'ASTV_MSTV_ratio',
    'Variability_Score',
    'AC_UC_interaction',
    'b']

for feature in features_to_log_transform:
    df[feature] = np.log1p(df[feature])

df[features_to_log_transform].hist(figsize=(10, 15))
plt.suptitle('Histograms of Log-Transformed Numerical Features', y=1.02)
plt.tight_layout()
plt.show()

### Data Preprocessing

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split


from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline



train test split

In [10]:
#reload data

explore=pd.read_excel('/content/CTG.xls',sheet_name=1,header=1)
explore = explore.loc[:, ~explore.columns.str.contains('Unnamed:')]
complete_features=['b','e','LB','AC','FM','UC','DL','DS','DP','ASTV','MSTV','ALTV','MLTV','Width','Min','Max','Nmax','Nzeros','Mode','Mean','Median','Variance','Tendency','CLASS','NSP']


explore=explore[complete_features]

df=explore.copy()
df=df.dropna(subset=['NSP'])

#split

y=df['NSP']
X=df.drop('NSP',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'len of train data {len(X_train)}')
print(f'len of test data {len(X_test)}')

len of train data 1700
len of test data 426


pipeline for feature engineering/ preprocess data

In [11]:
from sklearn.feature_selection import SelectKBest, f_classif
import copy
from sklearn.impute import KNNImputer


num_var=['Max', 'Median', 'Mean', 'ASTV', 'Mode', 'Nmax', 'Min', 'LB', 'Width','e']

features_to_log_transform = [
    "AC",
    "ALTV",
    "DL",
    "FM",
    "MLTV",
    "MSTV",
    "UC",
    "Variance",
    'ALTV_MLTV_ratio',
    'ASTV_MSTV_ratio',
    'Variability_Score',
    'AC_UC_interaction',
    'b']

#adding additional features
def add_features(df):
    df = df.copy()
    df['heart_rate_range'] = df['Max'] - df['Min']
    df['ASTV_MSTV_ratio'] = df['ASTV'] / (df['MSTV'] + 1e-8)
    df['ALTV_MLTV_ratio'] = df['ALTV'] / (df['MLTV'] + 1e-8)
    df['Variability_Score'] = (df['ASTV'] + df['ALTV']) / 2
    df['AC_UC_interaction'] = df['AC'] * df['UC']
    return df

add_features_transformer=FunctionTransformer(add_features)

#log
log_transformer=FunctionTransformer(func=np.log1p,feature_names_out='one-to-one')

#smote for imblanance data
smote=SMOTE(random_state=42)


def outlier_removal(X, threshold=7):
    """ Sets feature values in X that are more than (threshold * feature standard deviation) away from feature mean
    to NaN. Returns X with original length but some column values are NaN. At default value 100, no outlier treatment occurs.
    """
    new_X = copy.deepcopy(X)
    # Use original column names to create a DataFrame from the scaled array
    new_X_scaled = sklearn.preprocessing.scale(X)
    new_X[abs(new_X_scaled) > threshold] = np.nan

    return new_X

# Make zscore feature outlier removal a transformer function
zscore_outlier_removal_transformer = sklearn.preprocessing.FunctionTransformer(func=outlier_removal,
    kw_args=dict(threshold=7))


# Replace feature outliers with imputed values via KNN
KNN_impute = KNNImputer()

#features_to_log_transform
log_pipeline=make_pipeline(
    SimpleImputer(strategy='median'),
    zscore_outlier_removal_transformer,
    KNN_impute,
    FunctionTransformer(func=np.log1p,feature_names_out='one-to-one'),
    StandardScaler()
)

#num_var features (no overlap)
default_num_pipeline=make_pipeline(
    SimpleImputer(strategy='median'),
    zscore_outlier_removal_transformer,
    KNN_impute,
    StandardScaler()
)


preprocessing=ColumnTransformer([
    ('log_pipeline',log_pipeline,features_to_log_transform),
    ('default_num_pipeline',default_num_pipeline,num_var)
])


### Modelling

Main Pipeline example

In [13]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score



lr=LogisticRegression()

main_pipeline = ImbPipeline([
    ('add_features', add_features_transformer),
    ('preprocess', preprocessing),
    ('smote', smote),
    ('feature_selection', SelectKBest(score_func=f_classif, k=30)),
    ('classifier', lr)
])

main_pipeline

In [14]:
p = df.copy()
p = main_pipeline['add_features'].transform(p)
print(len(p.columns))

30


Random Forest(with tuning)

In [None]:
from sklearn.ensemble import RandomForestClassifier
tree_numbers = [20, 50, 100, 200,300,400]
tree_depths = [1, 2, 3, 4, 5, 6, 7, 8,9,10]
max_feature_vals = [5, 8, 10, 15,20,25,30]
#k_selection=[20,22,24,25,26,27,28,29,30] not needed for tree models but include this for other models

scoring = {
    'balanced_accuracy': 'balanced_accuracy',
    'f1_macro': 'f1_macro'
}


random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=2,
    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=10, random_state=42, class_weight='balanced')

main_pipeline = ImbPipeline([
    ('add_features', add_features_transformer),
    ('preprocess', preprocessing),
    ('smote', smote),
   # ('feature_selection', SelectKBest(score_func=f_classif, k=30)),
    ('classifier', random_forest)
])

random_forest_grid = {"classifier__n_estimators": tree_numbers,
    "classifier__max_depth": tree_depths,
    "classifier__max_features": max_feature_vals}

random_forest_cv = sklearn.model_selection.RandomizedSearchCV(main_pipeline, random_forest_grid,
    scoring=scoring, n_jobs=-1, refit='f1_macro', verbose=True, return_train_score=True, cv=10)
random_forest_cv.fit(X_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
random_forest_cv.best_score_

np.float64(0.8797981642929399)

In [None]:
random_forest_cv.best_params_

{'classifier__n_estimators': 200,
 'classifier__max_features': 15,
 'classifier__max_depth': 9}

In [None]:
from sklearn.metrics import f1_score, balanced_accuracy_score

pred=random_forest_cv.best_estimator_.predict(X_test)
balanced_accuracy_score(y_test,pred)

np.float64(0.9196087251044148)

In [None]:
f1_score(y_test,pred,average='macro')

0.8892203451826837

In [None]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

         1.0       0.97      0.93      0.95       333
         2.0       0.74      0.86      0.80        64
         3.0       0.88      0.97      0.92        29

    accuracy                           0.92       426
   macro avg       0.86      0.92      0.89       426
weighted avg       0.93      0.92      0.93       426



In [None]:
#store pipeline and model

random_forest_main_pipeline=random_forest_cv.best_estimator_
best_random_forest_model_pipeline=random_forest_cv.best_estimator_
joblib.dump(random_forest_main_pipeline, 'random_forest_model_pipeline')

joblib.dump(random_forest_main_pipeline['classifier'],'random_forest_model')

['random_forest_model']

gradient boosting classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.model_selection

scoring = {
    'balanced_accuracy': 'balanced_accuracy',
    'f1_macro': 'f1_macro'
}

tree_numbers = [20, 50, 100, 200, 300, 400]
tree_depths = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
max_feature_vals = [5, 8, 10, 15, 20, 25, 30]


gradient_boosting = GradientBoostingClassifier(random_state=42)


main_pipeline_gb = ImbPipeline([
    ('add_features', add_features_transformer),
    ('preprocess', preprocessing),
    ('smote', smote),
    ('classifier', gradient_boosting)
])

# Hyperparameter search grid
gradient_boosting_grid = {
    "classifier__n_estimators": tree_numbers,
    "classifier__max_depth": tree_depths,
    "classifier__max_features": max_feature_vals
}

gradient_boosting_cv = sklearn.model_selection.RandomizedSearchCV(
    main_pipeline_gb,
    gradient_boosting_grid,
    scoring=scoring,
    n_jobs=-1,
    refit='f1_macro',
    verbose=True,
    return_train_score=True,
    cv=10
)


gradient_boosting_cv.fit(X_train, y_train)


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
pred=gradient_boosting_cv.best_estimator_.predict(X_test)
print(f'test balance accuracy score: {balanced_accuracy_score(y_test,pred)}')
print(f'test f1 score: {f1_score(y_test,pred,average='macro')}')
print(classification_report(y_test,pred))

test balance accuracy score: 0.9246137301094198
test f1 score: 0.9018622594155428
              precision    recall  f1-score   support

         1.0       0.97      0.95      0.96       333
         2.0       0.80      0.86      0.83        64
         3.0       0.88      0.97      0.92        29

    accuracy                           0.94       426
   macro avg       0.88      0.92      0.90       426
weighted avg       0.94      0.94      0.94       426



In [None]:
gradient_boosting_main_pipeline=gradient_boosting_cv.best_estimator_

In [None]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

         1.0       0.97      0.95      0.96       333
         2.0       0.80      0.86      0.83        64
         3.0       0.88      0.97      0.92        29

    accuracy                           0.94       426
   macro avg       0.88      0.92      0.90       426
weighted avg       0.94      0.94      0.94       426



In [None]:
#saving model and pipeline
best_gbm_model_pipeline=gradient_boosting_cv.best_estimator_

joblib.dump(gradient_boosting_main_pipeline, 'gradient_boosting_model_pipeline')

joblib.dump(gradient_boosting_main_pipeline['classifier'],'gradient_boosting_model')

['gradient_boosting_model']

In [None]:
y_train

Unnamed: 0,NSP
1233,1.0
480,2.0
1111,1.0
1303,1.0
861,1.0
...,...
1638,1.0
1095,1.0
1130,1.0
1294,1.0


XGB Classifier

In [None]:
import xgboost
from scipy.stats import randint, uniform
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_xgb = le.fit_transform(y_train)


param_dist = {
    'classifier__max_depth': randint(3, 12),          # Depth range for each tree
    'classifier__learning_rate': uniform(0.01, 0.3),  # Step size shrinkage, commonly in [0.01, 0.3]
    'classifier__n_estimators': randint(100, 1000),   # Number of boosting rounds
    'classifier__subsample': uniform(0.6, 0.4),       # Fraction of samples used per tree (0.6 to 1.0)
    'classifier__colsample_bytree': uniform(0.5, 0.5),# Fraction of features used per tree (0.5 to 1.0)
    'classifier__min_child_weight': randint(1, 10),   # Minimum sum of instance weight (hessian) in a child
    'classifier__gamma': uniform(0, 5),               # Minimum loss reduction for further split
    'classifier__reg_alpha': uniform(0, 1),           # L1 regularization on weights
    'classifier__reg_lambda': uniform(1, 2)           # L2 regularization on weights
}


xgb_clf = xgboost.XGBClassifier(random_state=42)


main_pipeline_xgb = ImbPipeline([
    ('add_features', add_features_transformer),
    ('preprocess', preprocessing),
    ('smote', smote),
    ('classifier', xgb_clf)
])


xgb_cv=sklearn.model_selection.RandomizedSearchCV(
    main_pipeline_xgb,
    param_dist,
    scoring=scoring,
    n_jobs=-1,
    refit='f1_macro',
    verbose=True,
    return_train_score=True,
    cv=10
)

xgb_cv.fit(X_train, y_train_xgb)

pred = le.inverse_transform(xgb_cv.best_estimator_.predict(X_test))

print("Balanced Accuracy:", balanced_accuracy_score(y_test, pred))
print("F1 Score (macro):", f1_score(y_test, pred, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, pred))

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Balanced Accuracy: 0.9238160574367472
F1 Score (macro): 0.877634287853266

Classification Report:
               precision    recall  f1-score   support

         1.0       0.98      0.93      0.95       333
         2.0       0.77      0.88      0.82        64
         3.0       0.78      0.97      0.86        29

    accuracy                           0.92       426
   macro avg       0.84      0.92      0.88       426
weighted avg       0.93      0.92      0.93       426



In [None]:
best_xgb_model_pipeline=xgb_cv.best_estimator_

joblib.dump(xgb_cv.best_estimator_, 'XGB_model_pipeline')

joblib.dump(xgb_cv.best_estimator_['classifier'],'XGB_model')



['XGB_model']

logistic regression

In [17]:
from sklearn.linear_model import LogisticRegression


lr=LogisticRegression()

scoring = {
    'balanced_accuracy': 'balanced_accuracy',
    'f1_macro': 'f1_macro'
}


logistic_regression_grid = [
    {
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
        'classifier__C': np.logspace(-4, 4, 10),
        'feature_selection__k': [20, 22, 24, 26, 28, 30],
        'classifier__max_iter': [1000, 2000]
    },
    {
        'classifier__penalty': ['l1'],
        'classifier__solver': ['saga'],
        'classifier__C': np.logspace(-4, 4, 10),
        'feature_selection__k': [20, 22, 24, 26, 28, 30],
        'classifier__max_iter': [1000, 2000]
    },
    {
        'classifier__penalty': ['none'],
        'classifier__solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
        'classifier__C': np.logspace(-4, 4, 10),
        'feature_selection__k': [20, 22, 24, 26, 28, 30],
        'classifier__max_iter': [1000, 2000]
    },
    {
        'classifier__penalty': ['elasticnet'],
        'classifier__solver': ['saga'],
        'classifier__C': np.logspace(-4, 4, 10),
        'feature_selection__k': [20, 22, 24, 26, 28, 30],
        'classifier__max_iter': [1000, 2000],
        'classifier__l1_ratio': [0, 0.25, 0.5, 0.75, 1]
    }
]

main_pipeline_logistic_regression = ImbPipeline([
    ('add_features', add_features_transformer),
    ('preprocess', preprocessing),
    ('smote', smote),
   ('feature_selection', SelectKBest(score_func=f_classif)),
    ('classifier', lr)
])


logistic_regression_cv = sklearn.model_selection.RandomizedSearchCV(
    main_pipeline_logistic_regression,
    logistic_regression_grid,
    scoring=scoring,
    n_jobs=-1,
    refit='f1_macro',
    verbose=True,
    return_train_score=True,
    cv=10
)
logistic_regression_cv.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [18]:
logistic_regression_cv.best_score_

np.float64(0.7996357275693647)

In [19]:
pred=logistic_regression_cv.best_estimator_.predict(X_test)
print(f'test balance accuracy score: {balanced_accuracy_score(y_test,pred)}')
print(f'test f1 score: {f1_score(y_test,pred,average='macro')}')
print("\nClassification Report:\n", classification_report(y_test, pred))

test balance accuracy score: 0.8434749404576992
test f1 score: 0.7581908831908831

Classification Report:
               precision    recall  f1-score   support

         1.0       0.98      0.86      0.91       333
         2.0       0.57      0.81      0.67        64
         3.0       0.58      0.86      0.69        29

    accuracy                           0.85       426
   macro avg       0.71      0.84      0.76       426
weighted avg       0.89      0.85      0.86       426



In [20]:
best_logistic_model=logistic_regression_cv.best_estimator_['classifier']

joblib.dump(logistic_regression_cv.best_estimator_, 'lr_pipeline')

joblib.dump(logistic_regression_cv.best_estimator_['classifier'],'lr_model')

['lr_model']

stack classifier

In [56]:
rf=joblib.load('/content/random_forest_model_pipeline')
gbm=joblib.load('/content/gradient_boosting_model_pipeline')
xgb=joblib.load('/content/XGB_model_pipeline')
lg=joblib.load('/content/lr_pipeline')

In [57]:
rf_best_params=rf['classifier'].get_params()
gbm_best_params=gbm['classifier'].get_params()
xgb_best_params=xgb['classifier'].get_params()
lg_best_params=lg['classifier'].get_params()

In [76]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost

models = [('rf', RandomForestClassifier(**rf_best_params)), ('gbn',GradientBoostingClassifier(**gbm_best_params)),
    ('xgb', xgboost.XGBClassifier(**xgb_best_params)),('logistic',LogisticRegression(**lg_best_params))]

stack = StackingClassifier(models, cv=5, stack_method='auto')

final_pipeline = ImbPipeline([
    ('add_features', add_features_transformer),
    ('preprocess', preprocessing),
    ('smote', smote),
    ('classifier', stack)
])

final_pipeline.fit(X_train,y_train)

In [77]:
pred=final_pipeline.predict(X_test)
print(f'test balance accuracy score: {balanced_accuracy_score(y_test,pred)}')
print(f'test f1 score: {f1_score(y_test,pred,average='macro')}')
print("\nClassification Report:\n", classification_report(y_test, pred))

test balance accuracy score: 0.9296187351144248
test f1 score: 0.9181241011610369

Classification Report:
               precision    recall  f1-score   support

         1.0       0.97      0.96      0.97       333
         2.0       0.85      0.86      0.85        64
         3.0       0.90      0.97      0.93        29

    accuracy                           0.95       426
   macro avg       0.91      0.93      0.92       426
weighted avg       0.95      0.95      0.95       426



In [78]:
#final model is my stacking classifier
#dumping pipeline and model
joblib.dump(final_pipeline, 'final_pipeline')

joblib.dump(final_pipeline['classifier'],'final_pipeline_model')

['final_pipeline_model']