In [None]:
import pandas as pd
import numpy as np

# Read the training table and discard the "Unnamed: 0" column that came with the CSV.
initDF = pd.read_csv("half_filled_phenotype_train.csv").drop(columns=["Unnamed: 0"])
initDF

# Doing the data preprocessing part first

we will:

1) drop id column
2) encode CD and UC as 0 and 1, respectively
3) standardize the continuous features (zero mean, unit variance)
4) deal with problematic data (< / >)

In [None]:
# Detach the patient identifier: `pop` removes the column from initDF and returns it.
idDF = initDF.pop('PatientId')
initDF

In [None]:
# Number of rows for each value of the 'Diag' column.
initDF.groupby('Diag').size()

In [None]:
# Summary statistics as produced by pandas `describe`.
initDF.describe()

The classes are split roughly 60/40, so the dataset looks reasonably balanced.

In [None]:
# Count missing values per column and report only the columns that have at least one.
missing_val_count_by_column = initDF.isna().sum()
print(missing_val_count_by_column.loc[missing_val_count_by_column.gt(0)])

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
# from keras.wrappers.scikit_learn import KerasClassifier
# from keras.models import Sequential
# from keras.layers import Dense, Activation, Dropout
# 
# from numpy.random import seed
# from tensorflow import set_random_seed

In [None]:
# Encode the diagnosis labels numerically: "CD" -> 0, "UC" -> 1.
# The replacement is applied to the whole frame and matches cells equal to those strings.
initDF.replace({"CD": 0, "UC": 1}, inplace=True)

initDF

### We have a problem here:

One of the columns (CRP) contains values such as "<1" and "<2". We replace them with the plain number (dropping the `<` / `>` sign), although this loses the information that the measurement was saturated at a detection limit, which may be important.

In [None]:
# Work on a copy so that initDF keeps the encoded-but-otherwise-raw values.
df = initDF.copy()
df.dtypes

In [None]:
# Columns pandas loaded as generic objects (i.e. containing text such as "<1").
string_types = [column for column in df.columns if df[column].dtype == "object"]

# Report each one and switch it to the dedicated string dtype so `.str` methods apply.
for column in string_types:
    print(column)
    df[column] = df[column].astype("string")

In [None]:
# Remove every '<' and '>' character so that only the numeric text remains.
for column in string_types:
    df[column] = df[column].str.replace(r'[<>]', '', regex=True)

In [None]:
# Convert the cleaned text columns to floats, announcing each column as it is converted.
for column in string_types:
    print(column)
    df[column] = df[column].astype("float64")

In [None]:
# Re-inspect the dtypes after the string-to-float conversion.
df.dtypes

In [None]:
# Fill missing values with the column mean.
# `fit_transform` returns a bare array, so the original column labels are re-attached.
# NOTE(review): every column of `df` is imputed here, including the 'Diag' target.
imputer = SimpleImputer(strategy="mean")
df = pd.DataFrame(imputer.fit_transform(df), columns=initDF.columns)

In [None]:
# Summary statistics after imputation.
df.describe()

### Additionally, standardize the continuous variables (zero mean, unit variance)

In [None]:
# Columns standardized below. Each name is listed once: the list previously
# contained 'Hb' twice, so that column was fitted and transformed a second time.
continuous = ['Height', 'Weight', 'AgeAtDiag', 'Leu', 'Hb', 'CRP', 'ESR', "Fer", "B12",
              "Fol", "Alk", 'Alb']

# StandardScaler centres each column on 0 with unit variance, column by column
# (it does not squeeze values into [-1, 1]), so one call handles all columns.
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split further down, so test rows influence the scaling statistics.
scaler = StandardScaler()
df[continuous] = scaler.fit_transform(df[continuous].astype('float64'))

df

# Do ML: Our data is preprocessed and we have already taken a holdoutDF

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
target = 'Diag'

# Labels are kept as a one-column DataFrame; features are every other column.
y = df[[target]]
X = df.drop(columns=[target])

# Fixed 80/20 split, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
# Partition the feature columns by dtype: text-like vs. plain numeric.
categorical_cols = [name for name in X.columns if X[name].dtype == "object"]
numerical_cols = [name for name in X.columns if X[name].dtype in ['int64', 'float64']]

cols = categorical_cols + numerical_cols
# NOTE(review): `df` still contains the target column while `cols` is built from X.
print(len(df.columns), len(cols))

In [None]:
# a function for rapid f1score prediction

# Weighted F1 scores, appended in the order the models are evaluated.
scores = []

def get_f1_score(model):
    """Fit `model` on the training split and record its weighted F1 on the test split.

    Reads the notebook-level X_train / y_train / X_test / y_test. The labels are
    flattened to 1-D before fitting, the same way the grid-search cells pass them.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`.

    Returns
    -------
    The weighted F1 score, which is also appended to the module-level `scores` list.
    """
    model.fit(X_train, np.ravel(y_train))
    preds = model.predict(X_test)
    res = f1_score(y_test, preds, average="weighted")
    scores.append(res)
    return res
    
def clean_scores():
    """Empty the shared `scores` list in place."""
    del scores[:]

In [None]:
# importing all the models we will be using

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Baseline estimators, all with library defaults except the SVM's C.
LR = LogisticRegression()
KNN = KNeighborsClassifier()
SVM = SVC(C=0.0001)
DT = DecisionTreeClassifier()
RF = RandomForestClassifier()
BGDT = BaggingClassifier(DecisionTreeClassifier())
ADB = AdaBoostClassifier(DecisionTreeClassifier())
GBD = GradientBoostingClassifier()

# Members of the hard-voting ensemble, in voting order.
ensemble_members = [('lr', LR), ('rf', RF), ('svm', SVM), ("gbd", GBD), ("adb", ADB),
                    ("dt", DT), ("bgdt", BGDT)]
EVCh = VotingClassifier(estimators=ensemble_members, voting='hard')

# The soft-voting ensemble uses the same members minus the SVM.
EVCs = VotingClassifier(
    estimators=[(label, est) for label, est in ensemble_members if label != 'svm'],
    voting='soft',
)

models = [LR, KNN, SVM, DT, ADB, RF, BGDT, EVCh, EVCs, GBD]

In [None]:
# Fit and score every baseline; each call appends one weighted F1 to `scores`.
for model in models:
    get_f1_score(model)

### OK, we finally have some predictions.

Let's look at the F1 scores now; I suspect they will not be great.

In [None]:
# Weighted F1 per model, in the same order as the `models` list.
scores

In [None]:
# Empty the shared score list before the next round of evaluations.
clean_scores()

In [None]:
# NOTE(review): `holdoutDF` is not assigned in any cell visible in this notebook;
# on a fresh kernel this line raises NameError. Confirm where it should come from.
holdoutDF

In [None]:
# Split the hold-out frame into features and a one-column label frame.
# NOTE(review): depends on `holdoutDF`, which no visible cell defines.
X_holdout = holdoutDF.drop([target], axis=1)
y_holdout = holdoutDF[[target]]

In [None]:
# Display the hold-out feature frame.
X_holdout

# Starting tuning our models

In [None]:
import pickle

## Let's start with the decision trees

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Empty container; nothing in the visible cells appends to it.
results = []

In [None]:
# Small grid (2 x 3 combinations) used by the grid-search loop in the next cell.
param_grid = {
    "n_estimators": [10, 100],
    "max_depth": [3, 7, 9],
}

# Fresh, default-configured estimators; these rebind the RF / GBD names defined earlier.
RF = RandomForestClassifier() 

GBD = GradientBoostingClassifier()

In [None]:
# Estimators to tune with the shared `param_grid`.
gridSearchModels = [RF]

for model in gridSearchModels:
    # 5-fold grid search. Labels are flattened to 1-D, as in the other
    # grid-search cells, instead of passing the one-column DataFrame.
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train, np.ravel(y_train))
    best_model = grid_search.best_estimator_

    # Weighted F1 of the refitted best estimator on the test split.
    preds = best_model.predict(X_test)
    res = f1_score(y_test, preds, average="weighted")
    print(res)

    print(f"Best Hyperparameters for {model}: {grid_search.best_params_}")

### other one for GBD

In [None]:
# Candidate values for the gradient-boosting search (3 x 3 x 3 x 3 = 81 combinations).
n_estimators = [10, 100, 1000]
learning_rate = [0.001, 0.01, 0.1]
subsample = [0.5, 0.7, 1.0]
max_depth = [3, 7, 9]

In [None]:
# Exhaustive search over the GBD grid using repeated stratified 10-fold CV (3 repeats),
# followed by the weighted F1 of the refitted best estimator on the test split.
gridGBD = {
    "learning_rate": learning_rate,
    "n_estimators": n_estimators,
    "subsample": subsample,
    "max_depth": max_depth,
}
cvGBD = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_searchGBD = GridSearchCV(estimator=GBD, param_grid=gridGBD, cv=cvGBD, n_jobs=-1)
grid_searchGBD.fit(X_train, y_train.to_numpy().ravel())
best_modelGBD = grid_searchGBD.best_estimator_

predsGBD = best_modelGBD.predict(X_test)
resGBD = f1_score(y_test, predsGBD, average="weighted")

In [None]:
# Weighted F1 of the tuned gradient-boosting model on the test split.
resGBD

In [None]:
# Persist the tuned gradient-boosting model and read it straight back.
pklGBDt = "GBDt.pkl"

with open(pklGBDt, "wb") as file:
    pickle.dump(best_modelGBD, file)
    
# Reloading a pickle executes code stored in the file; only do this with files you wrote yourself.
with open(pklGBDt, 'rb') as file:
    pkl_model = pickle.load(file)
    
pkl_model

### other one for RF

In [None]:
# Candidate values for the random-forest search; `n_estimators` is rebound here.
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']

In [None]:
# Random-forest search with repeated stratified 10-fold CV (3 repeats), selected on accuracy;
# a failing fit scores 0 instead of aborting the search.
gridRF = {"n_estimators": n_estimators, "max_features": max_features}
cvRF = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_searchRF = GridSearchCV(
    estimator=RF,
    param_grid=gridRF,
    scoring='accuracy',
    cv=cvRF,
    n_jobs=-1,
    error_score=0,
)
grid_searchRF.fit(X_train, y_train.to_numpy().ravel())
best_modelRF = grid_searchRF.best_estimator_

# Weighted F1 of the refitted best forest on the test split.
predsRF = best_modelRF.predict(X_test)
resRF = f1_score(y_test, predsRF, average="weighted")

In [None]:
# Weighted F1 of the tuned random forest on the test split.
resRF

## Trying a NN

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

In [None]:
def create_model(lyrs=[8], act='linear', opt='Adam', dr=0.0):
    """Build and compile a dense network for binary classification.

    Parameters
    ----------
    lyrs : sequence of int
        Width of each hidden layer, in order; must contain at least one entry.
        The default list is only read, never modified.
    act : str
        Activation used by every hidden layer.
    opt : str
        Optimizer passed to `compile`.
    dr : float
        Dropout rate applied once, after the last hidden layer.

    Returns
    -------
    The compiled Sequential model (loss: binary cross-entropy, metric: accuracy).
    The input width is taken from the notebook-level `X_train`.
    """
    model = Sequential()

    # First hidden layer also declares the input width.
    model.add(Dense(lyrs[0], input_dim=X_train.shape[1], activation=act))

    # Remaining hidden layers, if any.
    for width in lyrs[1:]:
        model.add(Dense(width, activation=act))

    # Dropout before the output layer (a rate of 0.0 disables it).
    model.add(Dropout(dr))

    # Single-unit output: sigmoid gives a probability in (0, 1).
    # Softmax over one unit is always exactly 1.0, which makes the
    # binary cross-entropy loss uninformative.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

    return model

In [None]:
NN = create_model()
# Summarize the network that was just built. The name `model` used here before
# refers to whatever an earlier loop left behind, not to this network.
# `summary()` prints the layer table itself, so it is not wrapped in print().
NN.summary()

In [None]:
# Train for 10 epochs in batches of 32, holding out 20% of the training rows for validation.
training = NN.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=0)

In [None]:
# `predict` returns the network's raw output (one float per row), not class labels.
# Threshold at 0.5 and flatten so f1_score receives hard 0/1 predictions.
nnProbs = NN.predict(X_test)
nnPreds = (nnProbs > 0.5).astype(int).ravel()
resNN = f1_score(y_test, nnPreds, average = "weighted")
resNN

## doing grid search

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Wrap the Keras builder so scikit-learn can treat it as an estimator.
model = KerasClassifier(build_fn=create_model, verbose=0)

# Training hyper-parameters to explore (3 batch sizes x 2 epoch counts).
batch_size = [16, 32, 64]
epochs = [50, 100]
param_grid = {"batch_size": batch_size, "epochs": epochs}

# 3-fold grid search; include n_jobs=-1 if you are using CPU.
grid = GridSearchCV(model, param_grid, cv=3, verbose=2)

grid_result = grid.fit(X_train, y_train)

In [None]:
# Evaluate the best network found by the grid search: weighted F1 on the test split.
NNGS = grid_result.best_estimator_
NNGSpreds = NNGS.predict(X_test)
resNNGS = f1_score(y_test, NNGSpreds, average = "weighted")
resNNGS

# XGB from big boys

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import learning_curve

In [None]:
# Hand-picked XGBoost configuration: many shallow trees with a small learning rate,
# each tree seeing half of the rows and a quarter of the columns.
xgb_params = {
    "n_estimators": 1190,
    "objective": 'binary:logistic',
    "learning_rate": 0.01,
    "subsample": 0.5,
    "colsample_bytree": 0.25,
    "max_depth": 4,
}
xgb = XGBClassifier(**xgb_params)

xgb.fit(X_train, y_train, verbose=100)

# Weighted F1 on the test split.
XGBpreds = xgb.predict(X_test)
resXGB = f1_score(y_test, XGBpreds, average="weighted")
resXGB

In [None]:
# Importances reported by the fitted XGBoost model, plus the index order that
# sorts them from most to least important (argsort is ascending, hence the reversal).
feature_importance = xgb.feature_importances_
sorted_idx = np.argsort(feature_importance)[::-1]
features = X.columns

In [None]:
import seaborn as sns

In [None]:
# Horizontal bar chart of the XGBoost importances, most important feature first.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=feature_importance[sorted_idx],
    y=np.array(features)[sorted_idx],
    palette='GnBu_r',
    ax=ax,
)
ax.set_xlabel("Importance")
ax.set_ylabel("Features")
ax.set_title("XGBoost Feature Importance")
plt.setp(ax.get_xticklabels(), rotation='vertical')
plt.show()

## Tuning logistic regression

In [None]:
# Fresh logistic regression and the candidate solvers / penalty / C values for its search.
LRt = LogisticRegression()

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01, 0.001]

In [None]:
# Logistic-regression search with repeated stratified 10-fold CV (3 repeats),
# followed by the weighted F1 of the refitted best estimator on the test split.
gridLRt = {"solver": solvers, "penalty": penalty, "C": c_values}
cvLRt = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_searchLRt = GridSearchCV(LRt, gridLRt, cv=cvLRt, n_jobs=-1)
grid_searchLRt.fit(X_train, y_train.to_numpy().ravel())
best_modelLRt = grid_searchLRt.best_estimator_

predsLRt = best_modelLRt.predict(X_test)
resLRt = f1_score(y_test, predsLRt, average="weighted")

In [None]:
# Weighted F1 of the tuned logistic regression on the test split.
resLRt

## Creating and tuning a ridge classifier

In [None]:
from sklearn.linear_model import RidgeClassifier

In [None]:
# Ridge classifier and the regularization strengths to try (0.1 to 1.0 in steps of 0.1).
RC = RidgeClassifier()
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [None]:
# Ridge-classifier search with repeated stratified 10-fold CV (3 repeats),
# followed by the weighted F1 of the refitted best estimator on the test split.
gridRC = {"alpha": alpha}
cvRC = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_searchRC = GridSearchCV(RC, gridRC, cv=cvRC, n_jobs=-1)
grid_searchRC.fit(X_train, y_train.to_numpy().ravel())
best_modelRC = grid_searchRC.best_estimator_

predsRC = best_modelRC.predict(X_test)
resRC = f1_score(y_test, predsRC, average="weighted")

In [None]:
# Weighted F1 of the tuned ridge classifier on the test split.
resRC

## Tuning a KNN 

In [None]:
# Fresh KNN and its search space: odd neighbour counts 1..19, two weightings, three metrics.
KNNt = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

In [None]:
# KNN search with repeated stratified 10-fold CV (3 repeats),
# followed by the weighted F1 of the refitted best estimator on the test split.
gridKNNt = {"n_neighbors": n_neighbors, "weights": weights, "metric": metric}
cvKNNt = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_searchKNNt = GridSearchCV(KNNt, gridKNNt, cv=cvKNNt, n_jobs=-1)
grid_searchKNNt.fit(X_train, y_train.to_numpy().ravel())
best_modelKNNt = grid_searchKNNt.best_estimator_

predsKNNt = best_modelKNNt.predict(X_test)
resKNNt = f1_score(y_test, predsKNNt, average="weighted")

In [None]:
# Weighted F1 of the tuned KNN on the test split.
resKNNt

## Tuning SVM

Not useful: the SVM performs poorly here.

In [None]:
# Fresh SVM and its search space: three kernels and five values of C.
SVMt = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]

In [None]:
# SVM search with repeated stratified 10-fold CV (3 repeats),
# followed by the weighted F1 of the refitted best estimator on the test split.
gridSVM = {"kernel": kernel, "C": C}
cvSVM = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_searchSVM = GridSearchCV(SVMt, gridSVM, cv=cvSVM, n_jobs=-1)
grid_searchSVM.fit(X_train, y_train.to_numpy().ravel())
best_modelSVM = grid_searchSVM.best_estimator_

predsSVM = best_modelSVM.predict(X_test)
resSVM = f1_score(y_test, predsSVM, average="weighted")

In [None]:
# Weighted F1 of the tuned SVM on the test split.
resSVM

# Making predictions

In [None]:
# Read the submission-time table and discard the "Unnamed: 0" column that came with the CSV.
testDF = pd.read_csv("half_filled_phenotype_test.csv").drop(columns=["Unnamed: 0"])
testDF

In [None]:
# Work on a copy so testDF keeps the raw values (its ID column is needed for the submission).
df1 = testDF.copy()
df1.dtypes

In [None]:
# Columns of the test frame that pandas loaded as generic objects.
string_types = [column for column in df1.columns if df1[column].dtype == "object"]

# Report each one and switch it to the dedicated string dtype so `.str` methods apply.
for column in string_types:
    print(column)
    df1[column] = df1[column].astype("string")

In [None]:
# Remove every '<' and '>' character so that only the numeric text remains.
for column in string_types:
    df1[column] = df1[column].str.replace(r'[<>]', '', regex=True)

# Convert the cleaned text columns to floats, announcing each column as it is converted.
for column in string_types:
    print(column)
    df1[column] = df1[column].astype("float64")
df1.dtypes

In [None]:
# Fill missing values with the column mean and re-attach the column labels.
# NOTE(review): a new imputer is fitted on the test frame itself, so the fill
# values are test-set means rather than the means learned from the training data.
imputer = SimpleImputer(strategy="mean")
df1 = pd.DataFrame(imputer.fit_transform(df1), columns=testDF.columns)

df1

In [None]:
# Columns standardized below. Each name is listed once: the list previously
# contained 'Hb' twice, so that column was fitted and transformed a second time.
continuous = ['Height', 'Weight', 'AgeAtDiag', 'Leu', 'Hb', 'CRP', 'ESR', "Fer", "B12",
              "Fol", "Alk", 'Alb']

# StandardScaler works column by column, so one call handles all columns.
# NOTE(review): a new scaler is fitted on the test frame here, so the test
# features are standardized with test-set statistics, not the training ones.
scaler = StandardScaler()
df1[continuous] = scaler.fit_transform(df1[continuous].astype('float64'))

df1

In [None]:
# Remove the identifier column so only model features remain (re-binds df1 to the trimmed frame).
df1 = df1.drop(columns=["ID_new"])

In [None]:
# (rows, columns) of the test feature frame after dropping the ID column.
df1.shape

In [None]:
# Display the model used for the submission below (`xgb`). The name
# `catboost_model` that stood here is not defined in any visible cell of this
# notebook and raised NameError on a fresh run.
xgb

In [None]:
# Predict labels for the preprocessed test frame with the fitted XGBoost model.
subm_preds = xgb.predict(df1)

In [None]:
# Assemble the submission: IDs from the raw test frame next to integer-typed predictions.
submissionAmina = pd.DataFrame({
    'PatientId': testDF["ID_new"],
    'Diag': subm_preds,
}).astype({'Diag': "int"})

In [None]:
# Write the submission without the index column, then display it.
submissionAmina.to_csv("subManual.csv", index = False)
submissionAmina