In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter
import random


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from feature_generator import get_meds_feature, get_demo_features, get_lab_features, get_lab_array, get_df_array


In [2]:
data_loc = './data'

lab_df_list = ['T_SBP.csv', 'T_creatinine.csv', 'T_HGB.csv', 'T_ldl.csv', 'T_glucose.csv', 'T_DBP.csv']
train_id_df = 'train.csv'
test_id_df = 'test.csv'
label_df = 'T_stage.csv'
demo_df = 'T_demo.csv'
med_df = 'T_meds.csv'


In [3]:
# Read CSVs

df_label = pd.read_csv(os.path.join(data_loc, label_df))

df_train_ids = pd.read_csv(os.path.join(data_loc, train_id_df))
df_test_ids = pd.read_csv(os.path.join(data_loc, test_id_df))

df_demo = pd.read_csv(os.path.join(data_loc, demo_df))

df_meds = pd.read_csv(os.path.join(data_loc, med_df))
df_label = pd.read_csv(os.path.join(data_loc, label_df))


In [4]:
# Display train test stats

df_label.rename(columns = {'id': 'pid'}, inplace=True)
train_ids = df_train_ids.id.tolist()
test_ids = df_test_ids.id.tolist()
print('num train:{}, num_test:{}'.format(len(train_ids), len(test_ids)))

train_count = Counter(df_label[df_label.pid.isin(train_ids)]['Stage_Progress'].tolist())
test_count = Counter(df_label[df_label.pid.isin(test_ids)]['Stage_Progress'].tolist())

print('Train distribution: {} \nTest distribution: {}'.format(train_count, test_count))

num train:240, num_test:60
Train distribution: Counter({False: 159, True: 81}) 
Test distribution: Counter({False: 41, True: 19})


In [5]:
# Generate lab features

#feature_list = ['val_1st','val_last','time_1st','time_last','val_avg','val_median','val_max','val_min','weighted_average']
feature_list = ['weighted_average','last_minus_1st']
df_train_lab, df_test_lab = get_lab_features(data_loc, lab_df_list, df_label, train_ids, test_ids)

X_train_lab, Y_train_lab, pid_lab_train, lab_header_train = get_lab_array(df_train_lab, feature_list, 
                                                                          feature_header=[])
X_test_lab, Y_test_lab, pid_lab_test, lab_header_test = get_lab_array(df_test_lab, feature_list)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.sort_values(["time"], inplace = True)


(240, 12)
(array([0, 1]), array([159,  81]))
(60, 12)
(array([0, 1]), array([41, 19]))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1.sort_values(["pid"], inplace = True)


In [6]:
# Generate demo features

feature_list_demo = ['age','gender', 'race']

df_demo.rename(columns = {'id': 'pid'}, inplace = True)
df_train_demo, df_test_demo = get_demo_features(df_demo, train_ids, test_ids)
#df_train_demo.columns()

X_train_demo, demo_header_train, pid_demo_train = get_df_array(df_train_demo)

X_test_demo, demo_header_test, pid_demo_test = get_df_array(df_test_demo)

print(X_train_demo.shape, X_test_demo.shape)

(240, 7) (60, 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = out_list


In [7]:
assert (pid_demo_test == pid_lab_test).all()
assert (pid_demo_train == pid_lab_train).all()

In [8]:
# Generate meds features

from feature_generator import get_meds_feature

pid_list = df_label.pid.unique().tolist()
df_train_meds, df_test_meds = get_meds_feature(df_meds, pid_list, train_ids, test_ids)

X_train_meds, meds_header_train, pid_meds_train = get_df_array(df_train_meds)

X_test_meds, meds_header_test, pid_meds_test = get_df_array(df_test_meds)

# df_train_meds.columns()

In [9]:
X_train_meds.shape, X_test_meds.shape, len(meds_header_train), len(meds_header_test), pid_meds_train.shape, pid_meds_test.shape


((240, 8), (60, 8), 8, 8, (240,), (60,))

#### Lab Longitudinal Features

In [10]:
X_train = X_train_lab
X_test = X_test_lab

Y_train = Y_train_lab
Y_test = Y_test_lab

header_list = list(lab_header_train)

#### Lab Longitudinal + Demographic Features

In [11]:
# only lab and demo features
assert (pid_lab_train == pid_demo_train).all() 
assert (pid_lab_test == pid_lab_test).all() 

X_train = np.hstack((X_train_lab, X_train_demo))
X_test = np.hstack((X_test_lab, X_test_demo))

Y_train = Y_train_lab
Y_test = Y_test_lab

header_list = list(lab_header_train) + list(demo_header_train)

#### Lab Longitudinal + Demographic + Medication Features

In [12]:
# all data
assert (pid_lab_train == pid_demo_train).all() and ( pid_demo_train == pid_meds_train).all()
assert (pid_lab_test == pid_lab_test).all() and (pid_lab_test == pid_meds_test).all()

X_train = np.hstack((X_train_lab, X_train_demo, X_train_meds))
X_test = np.hstack((X_test_lab, X_test_demo, X_test_meds))

Y_train = Y_train_lab
Y_test = Y_test_lab

header_list = list(lab_header_train) + list(demo_header_train) + list(meds_header_train)

In [13]:
X_train.shape, X_test.shape, len(header_list), Y_train.shape, Y_test.shape

((240, 27), (60, 27), 27, (240,), (60,))

## Model 1: Logistic Regression

In [14]:
random.seed(1)
sscale = StandardScaler()
X_train_scaled = sscale.fit_transform(X_train)
X_test_scaled = sscale.transform(X_test)

# clf = LogisticRegression(random_state=0, penalty='l1', solver='saga' , class_weight = 'balanced', l1_ratio=0.3)
# clf = LogisticRegression(random_state=42, class_weight = 'balanced')
clf = LogisticRegression(random_state=42, class_weight = {0:0.5, 1:1.75})
# clf = LogisticRegression(random_state=0, class_weight = 'balanced',penalty='elasticnet', solver='saga' , l1_ratio = 0.2)

model = clf.fit(X_train_scaled, Y_train)

y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

print(classification_report(Y_train, y_train_pred))
print(classification_report(Y_test, y_test_pred))


              precision    recall  f1-score   support

           0       0.93      0.73      0.82       159
           1       0.63      0.89      0.73        81

    accuracy                           0.78       240
   macro avg       0.78      0.81      0.78       240
weighted avg       0.83      0.78      0.79       240

              precision    recall  f1-score   support

           0       0.88      0.73      0.80        41
           1       0.58      0.79      0.67        19

    accuracy                           0.75        60
   macro avg       0.73      0.76      0.73        60
weighted avg       0.79      0.75      0.76        60



## Model 2: Random Forest

In [15]:
random.seed(0)

# sample_scorer = make_scorer(recall_score)
sample_scorer = make_scorer(balanced_accuracy_score)

print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# max_features = ['auto', 'sqrt']
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]

n_estimators = [10, 50, 75]
max_depth = [3,5,10,15]
max_depth.append(None)
min_samples_split = [5, 10]
min_samples_leaf = [2,5]
bootstrap = [True]
class_weight=[{0:0.5, 1:1.75}, {0:0.2, 1:0.8}]
oob_score = [True]

random_grid = {'n_estimators': n_estimators,
                #'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap, 
              'class_weight': class_weight,
              'oob_score': oob_score}

rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
#                                 n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)

rf_random = GridSearchCV(estimator = rf, param_grid = random_grid, cv = 4, verbose=3, n_jobs = 10,
                        scoring = sample_scorer)
# # Fit the random search model
rf_random.fit(X_train, Y_train)
print(rf_random.best_params_)

(240, 27) (60, 27) (240,) (60,)
Fitting 4 folds for each of 120 candidates, totalling 480 fits
{'bootstrap': True, 'class_weight': {0: 0.2, 1: 0.8}, 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 75, 'oob_score': True}


In [17]:
rf_random.best_estimator_, rf_random.best_score_, rf_random.best_estimator_.oob_score_

(RandomForestClassifier(class_weight={0: 0.2, 1: 0.8}, max_depth=5,
                        min_samples_leaf=2, min_samples_split=10,
                        n_estimators=50, oob_score=True),
 0.7815819597069597,
 0.7625)

In [18]:
y_train_pred = rf_random.best_estimator_.predict(X_train)
y_test_pred = rf_random.best_estimator_.predict(X_test)

print(classification_report(Y_train, y_train_pred))
print(classification_report(Y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.99      0.83      0.90       159
           1       0.75      0.98      0.84        81

    accuracy                           0.88       240
   macro avg       0.87      0.90      0.87       240
weighted avg       0.90      0.88      0.88       240

              precision    recall  f1-score   support

           0       0.93      0.61      0.74        41
           1       0.52      0.89      0.65        19

    accuracy                           0.70        60
   macro avg       0.72      0.75      0.69        60
weighted avg       0.80      0.70      0.71        60



In [19]:
rf_random.best_estimator_.feature_importances_

array([2.84121935e-02, 7.22736604e-02, 2.40736193e-02, 2.52712827e-02,
       5.12061966e-02, 1.25101252e-01, 2.70792889e-02, 4.27508037e-02,
       4.58579951e-02, 1.05567643e-01, 1.07920660e-01, 2.86136646e-01,
       2.62792727e-02, 9.89387994e-03, 7.83653830e-04, 9.31098540e-04,
       0.00000000e+00, 6.47722463e-04, 9.07823279e-04, 7.71492560e-05,
       1.30448693e-03, 2.49429178e-03, 1.00773656e-03, 1.16419436e-18,
       7.39116075e-03, 2.36881922e-03, 4.26166347e-03])

In [20]:
importances = rf_random.best_estimator_.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_random.best_estimator_.estimators_], axis=0)

final_feat_imp_df = pd.DataFrame(columns = ['feature', 'feature_importance'])
for i, j in zip(header_list, std):
    final_feat_imp_df = final_feat_imp_df.append({'feature': i, 'feature_importance':j},ignore_index=True)
final_feat_imp_df.sort_values(by='feature_importance', ascending=False, inplace = True)

final_feat_imp_df

Unnamed: 0,feature,feature_importance
11,last_minus_1st_ldl,0.2031977
5,last_minus_1st_SBP,0.1102881
10,weighted_average_ldl,0.1074928
9,last_minus_1st_glucose,0.101952
1,last_minus_1st_DBP,0.07860122
4,weighted_average_SBP,0.06740353
8,weighted_average_glucose,0.0569298
2,weighted_average_HGB,0.05217304
3,last_minus_1st_HGB,0.04698495
7,last_minus_1st_creatinine,0.04467773


In [21]:
sub_final_feat_imp_df = final_feat_imp_df[final_feat_imp_df['feature_importance']> 0.1]
for ind, row in sub_final_feat_imp_df.iterrows():
    print('{},{}'.format(row['feature'], round(row['feature_importance']*100,2)))
    
sub_final_feat_imp_df['feature'].tolist()

last_minus_1st_ldl,20.32
last_minus_1st_SBP,11.03
weighted_average_ldl,10.75
last_minus_1st_glucose,10.2


['last_minus_1st_ldl',
 'last_minus_1st_SBP',
 'weighted_average_ldl',
 'last_minus_1st_glucose']

In [22]:
y_test_pred

array([1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1])

In [23]:
pd.DataFrame(y_test_pred).to_csv('y_test_pred.csv')

## Model 3: SVM

In [24]:
random.seed(0)

sscale = StandardScaler()
X_train_scaled = sscale.fit_transform(X_train)
X_test_scaled = sscale.transform(X_test)

sample_scorer = make_scorer(balanced_accuracy_score)
# sample_scorer = make_scorer(recall_score)

print(X_train_scaled.shape, X_test_scaled.shape, Y_train.shape, Y_test.shape)

param_grid = {'C': [50,100,150,200], 'gamma': [0.1,0.01, 0.02, 0.05, 0.07],'kernel': ['rbf', 'poly', 'sigmoid'], 
             'class_weight':['balanced', {0:0.5, 1:1.75}]}

base_estimator = SVC()

sh = GridSearchCV(estimator = base_estimator, param_grid = param_grid, cv = 4, verbose=3, n_jobs = 10, scoring = sample_scorer)
# # Fit the random search model
sh.fit(X_train, Y_train)
print(sh.best_params_)

(240, 27) (60, 27) (240,) (60,)
Fitting 4 folds for each of 120 candidates, totalling 480 fits
{'C': 50, 'class_weight': 'balanced', 'gamma': 0.1, 'kernel': 'poly'}


In [25]:
sh.best_estimator_, sh.best_score_

(SVC(C=50, class_weight='balanced', gamma=0.1, kernel='poly'),
 0.6172046703296703)

In [26]:
random.seed(0)

sscale = StandardScaler()
X_train_scaled = sscale.fit_transform(X_train)
X_test_scaled = sscale.transform(X_test)

sample_scorer = make_scorer(balanced_accuracy_score)
# sample_scorer = make_scorer(recall_score)

print(X_train_scaled.shape, X_test_scaled.shape, Y_train.shape, Y_test.shape)

sh = SVC(C=100, class_weight={0: 0.5, 1: 1.75}, gamma=0.01)
sh.fit(X_train_scaled, Y_train)

y_train_pred = sh.predict(X_train_scaled)
y_test_pred = sh.predict(X_test_scaled)

print(classification_report(Y_train, y_train_pred))
print(classification_report(Y_test, y_test_pred))

(240, 27) (60, 27) (240,) (60,)
              precision    recall  f1-score   support

           0       1.00      0.91      0.95       159
           1       0.85      1.00      0.92        81

    accuracy                           0.94       240
   macro avg       0.93      0.96      0.94       240
weighted avg       0.95      0.94      0.94       240

              precision    recall  f1-score   support

           0       0.80      0.78      0.79        41
           1       0.55      0.58      0.56        19

    accuracy                           0.72        60
   macro avg       0.68      0.68      0.68        60
weighted avg       0.72      0.72      0.72        60

