In [1]:
import numpy as np
import os, shutil
import pandas as pd
import modin.pandas as mod_pd
import matplotlib.pyplot as plt
%config Completer.use_jedi = False

In [28]:
def drop_missing(df, min_non_missing=0.7):
    """Drop, in place, every column that has too few non-missing values.

    A column is kept only when its number of non-NA entries is at least
    ``len(df) * min_non_missing``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. It is modified in place, so pass a copy when the
        original must stay untouched.
    min_non_missing : float, default 0.7
        Minimum fraction of rows (between 0 and 1) that must be non-NA for a
        column to be kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be used with
        ``DataFrame.pipe``.

    Raises
    ------
    ValueError
        If ``min_non_missing`` is not within [0, 1].
    """
    if not 0 <= min_non_missing <= 1:
        raise ValueError(
            "min_non_missing must be between 0 and 1, got {!r}".format(min_non_missing)
        )
    # Absolute number of non-NA values a column needs in order to survive.
    missing_thresh_col = len(df) * min_non_missing
    df.dropna(axis=1, thresh=missing_thresh_col, inplace=True)
    return df

def to_category(df):
    """Convert every object-dtype column of ``df`` to the ``category`` dtype.

    The conversion is done in place, column by column, and the same frame is
    returned so the function can be used with ``DataFrame.pipe``.
    """
    # Snapshot the labels first; the frame is modified inside the loop.
    object_columns = list(df.select_dtypes(include='object'))
    for name in object_columns:
        as_category = df[name].astype('category')
        df[name] = as_category
    return df

def copy_df(df):
    """Return a copy of ``df`` (via ``df.copy()``) so later in-place steps leave the input untouched."""
    duplicate = df.copy()
    return duplicate

In [29]:
# Feature encoding: one OneHotEncoder instance created once at module level;
# the testing_for_* functions wrap it in a ColumnTransformer.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
one_hot = OneHotEncoder()

# Target encoding: one LabelEncoder instance created once at module level;
# it is re-fitted on every call to the testing_for_* functions.
from sklearn.preprocessing import LabelEncoder
lab_encode = LabelEncoder()

from sklearn.model_selection import train_test_split

import xgboost as xgb

# Evaluation metrics (only confusion_matrix and matthews_corrcoef are used in the visible cells).
from sklearn.metrics import roc_auc_score, classification_report, matthews_corrcoef, confusion_matrix

In [30]:
def testing_for_eskd5y(data):
    """Train an XGBoost classifier for the ``eskd_5y`` column and print its scores.

    Steps: read the CSV, discard rows whose ``eskd_intraining`` equals
    ``'eskd'``, index by ``id``, clean the frame (copy -> drop sparse columns
    -> object columns to category), one-hot encode the categorical features,
    label-encode the target, make a stratified 70/30 split, fit the model and
    print the confusion matrix and the Matthews correlation coefficient for
    the held-out part.

    Parameters
    ----------
    data : str or path-like
        CSV source passed straight to ``pd.read_csv``. It must contain the
        columns ``id``, ``eskd_intraining`` and ``eskd_5y``.

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    The module-level ``one_hot`` and ``lab_encode`` objects are re-fitted by
    every call.
    """
    target_col = 'eskd_5y'
    # Columns that must not be used as features.
    excluded_cols = ['egfr.y', 'eskd_intraining', 'eskd_2y', 'eskd_5y']

    df = (pd.read_csv(data)
          .query("eskd_intraining != 'eskd'")
          .set_index('id'))
    df_cleaned = (df.pipe(copy_df).pipe(drop_missing).pipe(to_category))

    target = df_cleaned[target_col]
    # Negative/positive class ratio of the predicted column, used as
    # XGBoost's scale_pos_weight.
    class_counts = target.value_counts()
    imbalance_ratio = class_counts.non_eskd / class_counts.eskd

    # errors='ignore': drop_missing may already have removed one of the
    # non-target columns, which must not abort the run.
    X = df_cleaned.drop(excluded_cols, axis=1, errors='ignore')
    # Series.map works whatever dtype the target has; `.values.map` only
    # exists when `.values` happens to be a pandas Categorical.
    y = target.map({'non_eskd': 'no', 'eskd': 'yes'}).to_numpy()

    category_cols = list(X.select_dtypes(include='category').columns)
    transformer = ColumnTransformer([('one_hot', one_hot, category_cols)], remainder='passthrough')
    X_transformed = transformer.fit_transform(X)
    # LabelEncoder sorts its classes, so 'no' -> 0 and 'yes' -> 1.
    y_transformed = lab_encode.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X_transformed, y_transformed, stratify=y_transformed,
        test_size=0.3, random_state=7)

    xgb_class = xgb.XGBClassifier(n_jobs=-1, random_state=7, n_estimators=500,
                                  scale_pos_weight=imbalance_ratio)
    xgb_class.fit(X=X_train, y=y_train)
    y_pred = xgb_class.predict(X_test)

    print(confusion_matrix(y_test, y_pred))
    print(matthews_corrcoef(y_test, y_pred))

In [31]:
def testing_for_eskd2y(data):
    """Train an XGBoost classifier for the ``eskd_2y`` column and print its scores.

    Steps: read the CSV, discard rows whose ``eskd_intraining`` equals
    ``'eskd'``, index by ``id``, clean the frame (copy -> drop sparse columns
    -> object columns to category), one-hot encode the categorical features,
    label-encode the target, make a stratified 70/30 split, fit the model and
    print the confusion matrix and the Matthews correlation coefficient for
    the held-out part.

    Parameters
    ----------
    data : str or path-like
        CSV source passed straight to ``pd.read_csv``. It must contain the
        columns ``id``, ``eskd_intraining`` and ``eskd_2y``.

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    The module-level ``one_hot`` and ``lab_encode`` objects are re-fitted by
    every call.
    """
    target_col = 'eskd_2y'
    # Columns that must not be used as features.
    excluded_cols = ['egfr.y', 'eskd_intraining', 'eskd_2y', 'eskd_5y']

    df = (pd.read_csv(data)
          .query("eskd_intraining != 'eskd'")
          .set_index('id'))
    df_cleaned = (df.pipe(copy_df).pipe(drop_missing).pipe(to_category))

    target = df_cleaned[target_col]
    # The class ratio has to come from the column that is being predicted
    # (eskd_2y); taking it from eskd_5y would hand XGBoost a
    # scale_pos_weight that belongs to a different outcome.
    class_counts = target.value_counts()
    imbalance_ratio = class_counts.non_eskd / class_counts.eskd

    # errors='ignore': drop_missing may already have removed one of the
    # non-target columns, which must not abort the run.
    X = df_cleaned.drop(excluded_cols, axis=1, errors='ignore')
    # Series.map works whatever dtype the target has; `.values.map` only
    # exists when `.values` happens to be a pandas Categorical.
    y = target.map({'non_eskd': 'no', 'eskd': 'yes'}).to_numpy()

    category_cols = list(X.select_dtypes(include='category').columns)
    transformer = ColumnTransformer([('one_hot', one_hot, category_cols)], remainder='passthrough')
    X_transformed = transformer.fit_transform(X)
    # LabelEncoder sorts its classes, so 'no' -> 0 and 'yes' -> 1.
    y_transformed = lab_encode.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X_transformed, y_transformed, stratify=y_transformed,
        test_size=0.3, random_state=7)

    xgb_class = xgb.XGBClassifier(n_jobs=-1, random_state=7, n_estimators=500,
                                  scale_pos_weight=imbalance_ratio)
    xgb_class.fit(X=X_train, y=y_train)
    y_pred = xgb_class.predict(X_test)

    print(confusion_matrix(y_test, y_pred))
    print(matthews_corrcoef(y_test, y_pred))

In [23]:
# Input files are named 'train_ready_eff_<first>_<second>.gz'; all the 2_* files
# come first, followed by the 3_* files.
datasets = [
    'train_ready_eff_{}_{}.gz'.format(first, second)
    for first in (2, 3)
    for second in (2, 4, 6, 9, 12)
]

In [None]:
# Run the eskd_2y evaluation on datasets[1] ('train_ready_eff_2_4.gz').
testing_for_eskd2y(datasets[1])