<a href="https://colab.research.google.com/github/chiranjeet14/ML_Journey/blob/master/Hackerearth-Predict-the-genetic-disorders/2_genetic_testing_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import io
import gc
import time
from pprint import pprint
from datetime import date

# settings
import warnings
# Suppress every warning raised through the `warnings` module from here on.
warnings.filterwarnings("ignore")
# Turn on automatic garbage collection.
gc.enable()

In [None]:
# !pip3 install xgboost > /dev/null
# Install tune-sklearn and Ray Tune, which provide the TuneGridSearchCV used in
# the hyperparameter-tuning cells; stdout is discarded to keep the output short.
!pip3 install tune-sklearn ray[tune] > /dev/null

In [None]:
# Global Variables
# Shared seed; passed as `random_state` to train_test_split below.
random_state = 50

In [None]:
# connect to google drive
# Colab-only step: mounts Google Drive at /content/drive so the dataset files
# under that path can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Dataset folder on the mounted Drive. The trailing slash is required because
# file names are appended to it by plain string concatenation.
gDrivePath = '/content/drive/MyDrive/Datasets/Hackerearth_genetic_testing/dataset/'

In [None]:
# Load the preprocessed train/test tables from the Drive dataset folder.
train_csv_path = gDrivePath + 'train_preprocessed.csv'
test_csv_path = gDrivePath + 'test_preprocessed.csv'

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [None]:
# (rows, columns) of the training frame, targets included.
df_train.shape

(18047, 32)

In [None]:
# (rows, columns) of the test frame.
df_test.shape

(9465, 30)

In [None]:
# Peek at three randomly chosen training rows.
df_train.sample(3)

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),Heart Rate (rates/min,Follow-up,Gender,Birth asphyxia,Autopsy shows birth defect (if applicable),Folic acid details (peri-conceptional),H/O serious maternal illness,H/O radiation exposure (x-ray),H/O substance abuse,Assisted conception IVF/ART,History of anomalies in previous pregnancies,No. of previous abortion,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
17817,7.0,1.0,1.0,1.0,0.0,5.004558,35.0,33.0,0.0,1.0,1.0,1.0,2.0,1.0,3.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,5.167524,3.0,1.0,1.0,1.0,1.0,0.0,Single-gene inheritance diseases,Cystic fibrosis
12699,1.0,1.0,1.0,0.591013,0.0,5.261566,34.581211,46.0,0.0,1.0,2.0,1.0,2.0,1.0,3.0,1.515392,1.0,2.0,0.0,2.0,1.0,4.0,1.0,8.723471,3.0,0.0,1.0,1.0,0.0,0.473209,Mitochondrial genetic inheritance disorders,Mitochondrial myopathy
16746,12.0,1.0,1.0,1.0,0.0,5.049086,26.0,41.915359,1.0,1.514574,2.0,1.513893,1.0,3.0,2.23185,1.0,2.0,2.248471,3.0,2.0,2.0,3.186316,1.0,6.519981,4.0,0.0,1.0,0.533342,1.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome


### Checking if the dataset is balanced/imbalanced - Genetic Disorder

In [None]:
# Number of training rows per 'Genetic Disorder' class, to gauge class imbalance.
target_count = df_train['Genetic Disorder'].value_counts()
target_count

Mitochondrial genetic inheritance disorders     9241
Single-gene inheritance diseases                6929
Multifactorial genetic inheritance disorders    1877
Name: Genetic Disorder, dtype: int64

### Checking if the dataset is balanced/imbalanced - Disorder Subclass

In [None]:
# Number of training rows per 'Disorder Subclass' class, to gauge class imbalance.
# Note: this reuses (and overwrites) the `target_count` name from the previous check.
target_count = df_train['Disorder Subclass'].value_counts()
target_count

Leigh syndrome                         4683
Mitochondrial myopathy                 3971
Cystic fibrosis                        3145
Tay-Sachs                              2556
Diabetes                               1653
Hemochromatosis                        1228
Leber's hereditary optic neuropathy     587
Alzheimer's                             133
Cancer                                   91
Name: Disorder Subclass, dtype: int64

### Splitting data into training and cross-validation (CV) sets

In [None]:
# Pull both target columns out as arrays before removing them from the features.
label_columns = ['Genetic Disorder', 'Disorder Subclass']
genetic_disorder_labels, disorder_subclass_labels = (
    df_train[column].values for column in label_columns
)

# Strip the targets in place so only feature columns remain. The test frame may
# not carry these columns, hence errors='ignore' there.
df_train.drop(columns=label_columns, inplace=True)
df_test.drop(columns=label_columns, inplace=True, errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split

# Both targets use the same 10% hold-out size and the same seed.
# NOTE(review): no `stratify=` is passed, so the class mix of the CV split is
# left to chance -- worth considering given the class imbalance shown above.
holdout_options = {'test_size': 0.1, 'random_state': random_state}

# classification split for genetic_disorder_labels
(
    X_train_genetic_disorder,
    X_cv_genetic_disorder,
    y_train_genetic_disorder,
    y_cv_genetic_disorder,
) = train_test_split(df_train, genetic_disorder_labels, **holdout_options)

# classification split for disorder_subclass_labels
(
    X_train_disorder_subclass,
    X_cv_disorder_subclass,
    y_train_disorder_subclass,
    y_cv_disorder_subclass,
) = train_test_split(df_train, disorder_subclass_labels, **holdout_options)

### Oversampling using SMOTE for Genetic Disorder

In [None]:
# https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
from imblearn.over_sampling import SMOTE

# Oversample the genetic_disorder training split only; the CV split is left
# untouched. SMOTE generates its synthetic samples randomly, so it is seeded
# with the notebook-wide `random_state` to make the resampled training set
# (and everything trained on it) repeatable across runs.
smote_overSampling = SMOTE(random_state=random_state)
X_train_genetic_disorder, y_train_genetic_disorder = smote_overSampling.fit_resample(
    X_train_genetic_disorder, y_train_genetic_disorder
)

# Per-class sample counts after resampling.
unique, counts = np.unique(y_train_genetic_disorder, return_counts=True)
dict(zip(unique, counts))

{'Mitochondrial genetic inheritance disorders': 8309,
 'Multifactorial genetic inheritance disorders': 8309,
 'Single-gene inheritance diseases': 8309}

### Oversampling using SMOTE for Disorder Subclass

In [None]:
# https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
from imblearn.over_sampling import SMOTE

# Oversample the disorder_subclass training split only; the CV split is left
# untouched. SMOTE generates its synthetic samples randomly, so it is seeded
# with the notebook-wide `random_state` to make the resampled training set
# (and everything trained on it) repeatable across runs.
smote_overSampling = SMOTE(random_state=random_state)
X_train_disorder_subclass, y_train_disorder_subclass = smote_overSampling.fit_resample(
    X_train_disorder_subclass, y_train_disorder_subclass
)

# Per-class sample counts after resampling.
unique, counts = np.unique(y_train_disorder_subclass, return_counts=True)
dict(zip(unique, counts))

{"Alzheimer's": 4192,
 'Cancer': 4192,
 'Cystic fibrosis': 4192,
 'Diabetes': 4192,
 'Hemochromatosis': 4192,
 "Leber's hereditary optic neuropathy": 4192,
 'Leigh syndrome': 4192,
 'Mitochondrial myopathy': 4192,
 'Tay-Sachs': 4192}

### Scaling data : genetic_disorder

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the genetic_disorder training features, then apply that
# same transform to the training, CV and test features.
scaler = StandardScaler().fit(X_train_genetic_disorder)
X_train_genetic_disorder_scaled = scaler.transform(X_train_genetic_disorder)
X_cv_genetic_disorder_scaled = scaler.transform(X_cv_genetic_disorder)

# NOTE: `scaler` and `X_test_scaled` are assigned again by the
# disorder_subclass scaling cell, so these values only last until that cell runs.
X_test_scaled = scaler.transform(df_test)

# X_train_genetic_disorder_scaled

### Scaling data : disorder_subclass

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a separate scaler on the disorder_subclass training features, then apply
# that same transform to the training, CV and test features.
# NOTE: this replaces the `scaler` and `X_test_scaled` produced by the
# genetic_disorder scaling cell.
scaler = StandardScaler().fit(X_train_disorder_subclass)
X_train_disorder_subclass_scaled = scaler.transform(X_train_disorder_subclass)
X_cv_disorder_subclass_scaled = scaler.transform(X_cv_disorder_subclass)

X_test_scaled = scaler.transform(df_test)

# X_train_disorder_subclass_scaled

### Modelling & Cross-Validation for genetic_disorder

In [None]:
# %%time
# # Train multiple models : https://www.kaggle.com/tflare/testing-multiple-models-with-scikit-learn-0-79425
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC, LinearSVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble import BaggingClassifier
# from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.linear_model import LogisticRegressionCV
# from xgboost import XGBClassifier

# from sklearn.model_selection import cross_val_score

# models = []

# # LogisticRegression = LogisticRegression(n_jobs=-1)
# # LinearSVC = LinearSVC()
# # KNeighbors = KNeighborsClassifier(n_jobs=-1)
# # DecisionTree = DecisionTreeClassifier()
# # AdaBoost = AdaBoostClassifier()
# # Bagging = BaggingClassifier()
# # GradientBoosting = GradientBoostingClassifier()
# # LogisticRegressionCV = LogisticRegressionCV(n_jobs=-1)
# # XGBClassifier = XGBClassifier(nthread=-1)
# RandomForest = RandomForestClassifier()
# ExtraTrees = ExtraTreesClassifier()

# # models.append(("LogisticRegression",LogisticRegression))
# # models.append(("LinearSVC", LinearSVC))
# # models.append(("KNeighbors", KNeighbors))
# # models.append(("DecisionTree", DecisionTree))
# # models.append(("AdaBoost", AdaBoost))
# # models.append(("Bagging", Bagging))
# # models.append(("GradientBoosting", GradientBoosting))
# # models.append(("LogisticRegressionCV", LogisticRegressionCV))
# # models.append(("XGBClassifier", XGBClassifier))
# models.append(("RandomForest", RandomForest))
# models.append(("ExtraTrees", ExtraTrees))

# # metric_names = ['f1', 'average_precision', 'accuracy', 'precision', 'recall']
# metric_names = ['f1_weighted']
# results = []
# names = []

# nested_dict = {}

# for name,model in models:
#   nested_dict[name] = {}
#   for metric in metric_names:
#     print("\nRunning : {}, with metric : {}".format(name, metric))
#     score = cross_val_score(model, X_train_genetic_disorder_scaled, y_train_genetic_disorder, n_jobs=-1, scoring=metric, cv=5)
#     nested_dict[name][metric] = score.mean()

In [None]:
# import json
# print(json.dumps(nested_dict, sort_keys=True, indent=4))

### Modelling & Cross-Validation for disorder_subclass

In [None]:
# %%time
# # Train multiple models : https://www.kaggle.com/tflare/testing-multiple-models-with-scikit-learn-0-79425
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC, LinearSVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble import BaggingClassifier
# from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.linear_model import LogisticRegressionCV
# from xgboost import XGBClassifier

# from sklearn.model_selection import cross_val_score

# models = []

# # LogisticRegression = LogisticRegression(n_jobs=-1)
# # LinearSVC = LinearSVC()
# # KNeighbors = KNeighborsClassifier(n_jobs=-1)
# # DecisionTree = DecisionTreeClassifier()
# # AdaBoost = AdaBoostClassifier()
# # Bagging = BaggingClassifier()
# # GradientBoosting = GradientBoostingClassifier()
# # LogisticRegressionCV = LogisticRegressionCV(n_jobs=-1)
# # XGBClassifier = XGBClassifier(nthread=-1)
# RandomForest = RandomForestClassifier()
# ExtraTrees = ExtraTreesClassifier()

# # models.append(("LogisticRegression",LogisticRegression))
# # models.append(("LinearSVC", LinearSVC))
# # models.append(("KNeighbors", KNeighbors))
# # models.append(("DecisionTree", DecisionTree))
# # models.append(("AdaBoost", AdaBoost))
# # models.append(("Bagging", Bagging))
# # models.append(("GradientBoosting", GradientBoosting))
# # models.append(("LogisticRegressionCV", LogisticRegressionCV))
# # models.append(("XGBClassifier", XGBClassifier))
# models.append(("RandomForest", RandomForest))
# models.append(("ExtraTrees", ExtraTrees))

# # metric_names = ['f1', 'average_precision', 'accuracy', 'precision', 'recall']
# metric_names = ['f1_weighted']
# results = []
# names = []

# nested_dict = {}

# for name,model in models:
#   nested_dict[name] = {}
#   for metric in metric_names:
#     print("\nRunning : {}, with metric : {}".format(name, metric))
#     score = cross_val_score(model, X_train_disorder_subclass_scaled, y_train_disorder_subclass, n_jobs=-1, scoring=metric, cv=5)
#     nested_dict[name][metric] = score.mean()

In [None]:
# import json
# print(json.dumps(nested_dict, sort_keys=True, indent=4))

### Hyperparameter tuning

#### Tuning for : genetic_disorder

In [None]:
# from sklearn.model_selection import GridSearchCV
from tune_sklearn import TuneGridSearchCV

from sklearn.ensemble import ExtraTreesClassifier

# model_classifier = ExtraTreesClassifier(max_depth=15, n_estimators=400)
# Base estimator for the genetic_disorder grid search.
# - max_features='sqrt' is exactly what 'auto' meant for tree classifiers;
#   'auto' was deprecated in scikit-learn 1.1 and later removed, so the
#   explicit spelling keeps this cell working on current releases.
# - random_state ties the randomised tree building to the notebook-wide seed so
#   search results are repeatable.
model_classifier = ExtraTreesClassifier(
    criterion='gini',
    bootstrap=False,
    max_features='sqrt',
    warm_start=False,
    random_state=random_state,
)


# # Best Params: {'criterion': 'gini', 'max_depth': 15, 'bootstrap': False, 'max_features': 'auto', 'warm_start': False, 'n_estimators': 400}
# Parameters to tune:
parameters = {
    'n_estimators': np.arange(100, 3000, 100, dtype=int),
    'max_depth': np.arange(5, 16, 1, dtype=int),
    # 'criterion': ['gini', 'entropy'],
    # 'bootstrap': [True, False],
    # 'max_features': ['auto', 'sqrt', 'log2'],
    # 'warm_start': [True, False],
}
# Grid search over every (n_estimators, max_depth) pair, ranked by weighted F1.
tune_search_genetic_disorder = TuneGridSearchCV(
    model_classifier,
    parameters,
    scoring='f1_weighted',
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the scaled genetic_disorder training set.
tune_search_genetic_disorder.fit(X_train_genetic_disorder_scaled, y_train_genetic_disorder)

# Predict on the held-out CV split and report the fraction of exact label matches.
pred = tune_search_genetic_disorder.predict(X_cv_genetic_disorder_scaled)
is_correct = np.asarray(pred) == np.asarray(y_cv_genetic_disorder)
accuracy = np.count_nonzero(is_correct) / len(pred)
print("Tune Accuracy:", accuracy)

In [None]:
# Hyperparameter combination the genetic_disorder search reports as best.
print("Best Params:", tune_search_genetic_disorder.best_params_)

#### Tuning for : disorder_subclass

In [None]:
# from sklearn.model_selection import GridSearchCV
from tune_sklearn import TuneGridSearchCV

from sklearn.ensemble import ExtraTreesClassifier

# model_classifier = ExtraTreesClassifier(max_depth=15, n_estimators=400)
# Base estimator for the disorder_subclass grid search. random_state ties the
# randomised tree building to the notebook-wide seed so search results are
# repeatable; the other settings are the previously found best values noted below.
model_classifier = ExtraTreesClassifier(
    criterion='entropy',
    bootstrap=False,
    max_features='log2',
    warm_start=True,
    random_state=random_state,
)

## Best Params: {'criterion': 'entropy', 'max_depth': 15, 'bootstrap': False, 'max_features': 'log2', 'warm_start': True}
# Parameters to tune:
parameters = {
    'n_estimators': np.arange(100, 3000, 100, dtype=int),
    'max_depth': np.arange(5, 16, 1, dtype=int),
    # 'criterion': ['gini', 'entropy'],
    # 'bootstrap': [True, False],
    # 'max_features': ['auto', 'sqrt', 'log2'],
    # 'warm_start': [True, False],
}
# Grid search over every (n_estimators, max_depth) pair, ranked by weighted F1.
tune_search_disorder_subclass = TuneGridSearchCV(
    model_classifier,
    parameters,
    scoring='f1_weighted',
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the scaled disorder_subclass training set.
tune_search_disorder_subclass.fit(X_train_disorder_subclass_scaled, y_train_disorder_subclass)

In [None]:
# Check accuracy: predict on the held-out CV split and report the fraction of
# exact label matches.
pred = tune_search_disorder_subclass.predict(X_cv_disorder_subclass_scaled)
is_correct = np.asarray(pred) == np.asarray(y_cv_disorder_subclass)
accuracy = np.count_nonzero(is_correct) / len(pred)
print("Tune Accuracy:", accuracy)

In [None]:
# Hyperparameter combination the disorder_subclass search reports as best.
print("Best Params:", tune_search_disorder_subclass.best_params_)

In [None]:
import joblib
# Persist the fitted genetic_disorder search object to a pickle file in the
# current working directory.
joblib.dump(tune_search_genetic_disorder, 'genetic_disorder_model.pkl')

In [None]:
# Persist the fitted disorder_subclass search object to a pickle file in the
# current working directory.
joblib.dump(tune_search_disorder_subclass, 'disorder_subclass_model.pkl')

In [None]:
# Reload both search objects from the pickle files written by the dump cells.
# joblib.load unpickles its input, so only point it at files you created yourself.
trained_model_genetic_disorder = joblib.load('genetic_disorder_model.pkl')
trained_model_disorder_subclass = joblib.load('disorder_subclass_model.pkl')

### Predicting on CV data

In [None]:
# 

### Predicting on test Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Each model was trained on features standardised by its OWN scaler, fit on its
# own oversampled training set. Upstream, `X_test_scaled` is assigned twice, and
# by this point it only holds the disorder_subclass scaling. Feeding it to the
# genetic_disorder model would make that model predict on features scaled with
# the wrong mean/std. StandardScaler fitting is deterministic, so re-fitting on
# the same training features reproduces each model's scaler exactly. One test
# matrix is built per model here so this cell does not depend on which scaling
# cell ran last.
X_test_genetic_disorder_scaled = (
    StandardScaler().fit(X_train_genetic_disorder).transform(df_test)
)
X_test_disorder_subclass_scaled = (
    StandardScaler().fit(X_train_disorder_subclass).transform(df_test)
)

predictions_genetic_disorder_test = trained_model_genetic_disorder.predict(
    X_test_genetic_disorder_scaled
)
predictions_disorder_subclass_test = trained_model_disorder_subclass.predict(
    X_test_disorder_subclass_scaled
)

In [None]:
# Number of genetic_disorder predictions produced for the test set.
len(predictions_genetic_disorder_test)

In [None]:
# Number of disorder_subclass predictions produced for the test set.
len(predictions_disorder_subclass_test)

In [None]:
# Re-read the raw test file; its 'Patient Id' column is used when building the
# submission. The shape is displayed for comparison with the prediction counts.
read = pd.read_csv(gDrivePath + 'test.csv')
read.shape

In [None]:
# Assemble the submission table: patient ids from the raw test file, followed
# by one column per predicted target.
submission = read[["Patient Id"]].copy()
submission["Genetic Disorder"] = predictions_genetic_disorder_test
submission["Disorder Subclass"] = predictions_disorder_subclass_test

In [None]:
# Preview the first rows of the submission table.
submission.head()

In [None]:
# Write the submission to submission.csv in the working directory, omitting the
# DataFrame index.
submission.to_csv('submission.csv', index=False)