In [1]:
import pandas as pd
import numpy as np

initDF = pd.read_csv("Clinical_Data.train.txt", sep = "\t")
initDF

Unnamed: 0,PatientId,Gender,Height,Weight,Diag,AgeAtDiag,SmokeAtDiag,Leu,Hb,CRP,ESR,Fer,B12,Fol,Alk,Alb
0,1,2,165.0,59.0,CD,16.0,0,5.5,120.0,20,,,,,61.0,
1,3,2,,,UC,31.0,0,7.6,134.0,5,8.0,53,,,43.0,
2,4,1,174.0,67.0,CD,31.0,0,,103.0,40,,226,299,7.9,69.0,35.0
3,5,2,172.0,60.0,UC,56.0,1,13.6,104.0,23,26.0,,,,87.0,32.0
4,6,1,182.0,74.0,CD,22.0,1,5.7,150.0,2,,,,,,44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1985,2026,1,175.0,72.0,CD,25.0,1,6.3,153.0,2.3,,33,258,,52.0,42.0
1986,2027,1,182.0,77.0,CD,26.0,-1,9.9,166.0,,,,,,36.0,
1987,2028,1,170.0,68.0,CD,22.0,0,4.3,115.0,2,10.0,24,161,8.8,45.0,
1988,2029,1,,,CD,16.0,-1,10.3,143.0,15,,49,,,,37.0


# Doing the data preprocessing part first

we will:

1) drop id column
2) encode CD and UD as 1 and 2, respectively
3) scale data between -1 and 1
4) deal with problematic data (< / >)

In [2]:
idDF = initDF['PatientId']
initDF.drop('PatientId', axis=1, inplace=True)
initDF

Unnamed: 0,Gender,Height,Weight,Diag,AgeAtDiag,SmokeAtDiag,Leu,Hb,CRP,ESR,Fer,B12,Fol,Alk,Alb
0,2,165.0,59.0,CD,16.0,0,5.5,120.0,20,,,,,61.0,
1,2,,,UC,31.0,0,7.6,134.0,5,8.0,53,,,43.0,
2,1,174.0,67.0,CD,31.0,0,,103.0,40,,226,299,7.9,69.0,35.0
3,2,172.0,60.0,UC,56.0,1,13.6,104.0,23,26.0,,,,87.0,32.0
4,1,182.0,74.0,CD,22.0,1,5.7,150.0,2,,,,,,44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1985,1,175.0,72.0,CD,25.0,1,6.3,153.0,2.3,,33,258,,52.0,42.0
1986,1,182.0,77.0,CD,26.0,-1,9.9,166.0,,,,,,36.0,
1987,1,170.0,68.0,CD,22.0,0,4.3,115.0,2,10.0,24,161,8.8,45.0,
1988,1,,,CD,16.0,-1,10.3,143.0,15,,49,,,,37.0


In [3]:
initDF.groupby('Diag').size()

Diag
CD    1190
UC     800
dtype: int64

In [4]:
initDF.describe()

Unnamed: 0,Gender,Height,Weight,AgeAtDiag,SmokeAtDiag,Leu,Hb,ESR,Alk,Alb
count,1990.0,1970.0,1973.0,1987.0,1990.0,1899.0,1906.0,787.0,1529.0,1214.0
mean,1.486432,170.87665,70.327927,30.610468,0.347739,7.562101,135.305352,16.030877,73.110857,39.584185
std,0.499942,8.83909,15.026495,13.598222,0.529362,4.487681,16.488098,15.93844,50.166718,6.031602
min,1.0,143.0,35.0,0.0,-1.0,1.4,60.0,0.0,0.9,13.0
25%,1.0,164.0,60.0,20.0,0.0,5.5,126.0,6.0,52.0,36.0
50%,1.0,170.0,69.0,27.0,0.0,7.0,136.0,11.0,65.0,40.0
75%,2.0,178.0,79.0,38.0,1.0,9.0,147.0,20.0,82.0,43.0
max,2.0,200.0,143.0,81.0,1.0,131.0,190.0,105.0,905.0,72.0


A 60/40 split.
seems to be a pretty balanced dataset? 

In [5]:
# Number of missing values in each column of training data
missing_val_count_by_column = (initDF.isnull().sum())
# missing_val_count_by_column
print(missing_val_count_by_column[missing_val_count_by_column > 0])

Height         20
Weight         17
AgeAtDiag       3
Leu            91
Hb             84
CRP           188
ESR          1203
Fer           710
B12           870
Fol          1018
Alk           461
Alb           776
dtype: int64


In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
# from keras.wrappers.scikit_learn import KerasClassifier
# from keras.models import Sequential
# from keras.layers import Dense, Activation, Dropout
# 
# from numpy.random import seed
# from tensorflow import set_random_seed

In [7]:
initDF.replace(to_replace="CD", value = 0, inplace=True )
initDF.replace(to_replace="UC", value = 1, inplace=True )

initDF

Unnamed: 0,Gender,Height,Weight,Diag,AgeAtDiag,SmokeAtDiag,Leu,Hb,CRP,ESR,Fer,B12,Fol,Alk,Alb
0,2,165.0,59.0,0,16.0,0,5.5,120.0,20,,,,,61.0,
1,2,,,1,31.0,0,7.6,134.0,5,8.0,53,,,43.0,
2,1,174.0,67.0,0,31.0,0,,103.0,40,,226,299,7.9,69.0,35.0
3,2,172.0,60.0,1,56.0,1,13.6,104.0,23,26.0,,,,87.0,32.0
4,1,182.0,74.0,0,22.0,1,5.7,150.0,2,,,,,,44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1985,1,175.0,72.0,0,25.0,1,6.3,153.0,2.3,,33,258,,52.0,42.0
1986,1,182.0,77.0,0,26.0,-1,9.9,166.0,,,,,,36.0,
1987,1,170.0,68.0,0,22.0,0,4.3,115.0,2,10.0,24,161,8.8,45.0,
1988,1,,,0,16.0,-1,10.3,143.0,15,,49,,,,37.0


### we have a problem in here: 

one of the columns (CRP) has values such as "<1" and "<2". We should replace them with just integers, though this means saturation and can be important.

In [8]:
df = initDF.copy()
df.dtypes

Gender           int64
Height         float64
Weight         float64
Diag             int64
AgeAtDiag      float64
SmokeAtDiag      int64
Leu            float64
Hb             float64
CRP             object
ESR            float64
Fer             object
B12             object
Fol             object
Alk            float64
Alb            float64
dtype: object

In [9]:
string_types = []

for column in df.columns:
    if df[column].dtype == "object":
        print(column)
        string_types.append(column)
        df[column] = df[column].astype("string")

CRP
Fer
B12
Fol


In [10]:
for column in string_types:
    df[column] = df[column].str.replace('<', '')
    df[column] = df[column].str.replace('>', '')

In [11]:
for column in string_types:
    if column in string_types:
        print(column)
        df[column] = df[column].astype("float64")

CRP
Fer
B12
Fol


In [12]:
df.dtypes

Gender           int64
Height         float64
Weight         float64
Diag             int64
AgeAtDiag      float64
SmokeAtDiag      int64
Leu            float64
Hb             float64
CRP            float64
ESR            float64
Fer            float64
B12            float64
Fol            float64
Alk            float64
Alb            float64
dtype: object

In [13]:
# simple func to input numerical 

imputer = SimpleImputer(strategy = "mean") 
df = pd.DataFrame(imputer.fit_transform(df))

df.columns = initDF.columns

In [14]:
df.describe()

Unnamed: 0,Gender,Height,Weight,Diag,AgeAtDiag,SmokeAtDiag,Leu,Hb,CRP,ESR,Fer,B12,Fol,Alk,Alb
count,1990.0,1990.0,1990.0,1990.0,1990.0,1990.0,1990.0,1990.0,1990.0,1990.0,1990.0,1990.0,1990.0,1990.0,1990.0
mean,1.486432,170.87665,70.327927,0.40201,30.610468,0.347739,7.562101,135.305352,11.129528,16.030877,200.458145,430.466474,95.977654,73.110857,39.584185
std,0.499942,8.794538,14.962141,0.490427,13.587963,0.529362,4.383819,16.136177,22.070694,10.019353,717.697765,583.503557,237.13728,43.970355,4.710269
min,1.0,143.0,35.0,0.0,0.0,-1.0,1.4,60.0,0.0,0.0,0.0,0.0,0.0,0.9,13.0
25%,1.0,165.0,60.0,0.0,20.0,0.0,5.6,126.0,2.0,15.0,50.0,226.0,15.2,56.0,39.0
50%,1.0,170.87665,69.0,0.0,27.0,0.0,7.1,136.0,5.0,16.030877,143.0,430.466474,95.977654,73.0,39.584185
75%,2.0,177.0,79.0,1.0,38.0,1.0,8.9,146.0,11.129528,16.030877,200.458145,430.466474,95.977654,75.0,41.0
max,2.0,200.0,143.0,1.0,81.0,1.0,131.0,190.0,287.0,105.0,18090.0,11070.0,3466.0,905.0,72.0


In [15]:
continuous = ['Height', 'Weight', 'AgeAtDiag', 'Leu', 'Hb', 'Hb', 'CRP', 'ESR', "Fer", "B12",
             "Fol", "Alk", 'Alb']

scaler = StandardScaler()

for var in continuous:
    df[var] = df[var].astype('float64')
    df[var] = scaler.fit_transform(df[var].values.reshape(-1, 1))
    
df

Unnamed: 0,Gender,Height,Weight,Diag,AgeAtDiag,SmokeAtDiag,Leu,Hb,CRP,ESR,Fer,B12,Fol,Alk,Alb
0,2.0,-6.683838e-01,-7.572963e-01,0.0,-1.075521,0.0,-4.705074e-01,-0.948750,4.020128e-01,0.000000,-3.961118e-17,-9.744192e-17,0.000000,-0.275502,-1.508876e-15
1,2.0,3.232558e-15,-9.500262e-16,1.0,0.028675,0.0,8.647350e-03,-0.080916,-2.777923e-01,-0.801738,-2.055116e-01,-9.744192e-17,0.000000,-0.684971,-1.508876e-15
2,1.0,3.552359e-01,-2.224791e-01,0.0,0.028675,0.0,2.026547e-16,-2.002548,1.308420e+00,0.000000,3.559754e-02,-2.253620e-01,-0.371514,-0.093515,-9.734766e-01
3,2.0,1.277648e-01,-6.904441e-01,1.0,1.869001,1.0,1.377661e+00,-1.940560,5.379738e-01,0.995237,-3.961118e-17,-9.744192e-17,0.000000,0.315955,-1.610543e+00
4,1.0,1.265120e+00,2.454860e-01,0.0,-0.633843,1.0,-4.248736e-01,0.910894,-4.137533e-01,0.000000,-3.961118e-17,-9.744192e-17,0.000000,0.000000,9.377225e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1985,1.0,4.689714e-01,1.117817e-01,0.0,-0.413004,1.0,-2.879722e-01,1.096858,-4.001572e-01,0.000000,-2.333855e-01,-2.956449e-01,0.000000,-0.480236,5.130116e-01
1986,1.0,1.265120e+00,4.460424e-01,0.0,-0.339391,-1.0,5.334359e-01,1.902704,8.050510e-17,0.000000,-3.961118e-17,-9.744192e-17,0.000000,-0.844209,-1.508876e-15
1987,1.0,-9.970621e-02,-1.556269e-01,0.0,-0.633843,0.0,-7.443101e-01,-1.258691,-4.137533e-01,-0.602074,-2.459287e-01,-4.619238e-01,-0.367718,-0.639475,-1.508876e-15
1988,1.0,3.232558e-15,-9.500262e-16,0.0,-1.075521,-1.0,6.247034e-01,0.476977,1.754111e-01,0.000000,-2.110864e-01,-9.744192e-17,0.000000,0.000000,-5.487657e-01


# Creating a random sample

randomly sample 5-20% of the data for further testing, work only with 80% of what is left

In [16]:
df

Unnamed: 0,Gender,Height,Weight,Diag,AgeAtDiag,SmokeAtDiag,Leu,Hb,CRP,ESR,Fer,B12,Fol,Alk,Alb
0,2.0,-6.683838e-01,-7.572963e-01,0.0,-1.075521,0.0,-4.705074e-01,-0.948750,4.020128e-01,0.000000,-3.961118e-17,-9.744192e-17,0.000000,-0.275502,-1.508876e-15
1,2.0,3.232558e-15,-9.500262e-16,1.0,0.028675,0.0,8.647350e-03,-0.080916,-2.777923e-01,-0.801738,-2.055116e-01,-9.744192e-17,0.000000,-0.684971,-1.508876e-15
2,1.0,3.552359e-01,-2.224791e-01,0.0,0.028675,0.0,2.026547e-16,-2.002548,1.308420e+00,0.000000,3.559754e-02,-2.253620e-01,-0.371514,-0.093515,-9.734766e-01
3,2.0,1.277648e-01,-6.904441e-01,1.0,1.869001,1.0,1.377661e+00,-1.940560,5.379738e-01,0.995237,-3.961118e-17,-9.744192e-17,0.000000,0.315955,-1.610543e+00
4,1.0,1.265120e+00,2.454860e-01,0.0,-0.633843,1.0,-4.248736e-01,0.910894,-4.137533e-01,0.000000,-3.961118e-17,-9.744192e-17,0.000000,0.000000,9.377225e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1985,1.0,4.689714e-01,1.117817e-01,0.0,-0.413004,1.0,-2.879722e-01,1.096858,-4.001572e-01,0.000000,-2.333855e-01,-2.956449e-01,0.000000,-0.480236,5.130116e-01
1986,1.0,1.265120e+00,4.460424e-01,0.0,-0.339391,-1.0,5.334359e-01,1.902704,8.050510e-17,0.000000,-3.961118e-17,-9.744192e-17,0.000000,-0.844209,-1.508876e-15
1987,1.0,-9.970621e-02,-1.556269e-01,0.0,-0.633843,0.0,-7.443101e-01,-1.258691,-4.137533e-01,-0.602074,-2.459287e-01,-4.619238e-01,-0.367718,-0.639475,-1.508876e-15
1988,1.0,3.232558e-15,-9.500262e-16,0.0,-1.075521,-1.0,6.247034e-01,0.476977,1.754111e-01,0.000000,-2.110864e-01,-9.744192e-17,0.000000,0.000000,-5.487657e-01


In [None]:
n_random_samples = int(np.ceil(len(df)/20))
holdoutDF = df.sample(n_random_samples)
df.drop(holdoutDF.index, axis=0, inplace=True)

holdoutDF

Sampled data is stored in holdoutDF. we will use it later for testing

# Do ML: Our data is preprocessed and we have already taken a holdoutDF

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

In [19]:
target = 'Diag'

X = df.drop([target], axis=1)
y = df[[target]]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25,random_state=0)

In [20]:
categorical_cols = []
numerical_cols = []

for column in X.columns:
    if X[column].dtype == "object":
        categorical_cols.append(column)
    elif X[column].dtype in ['int64', 'float64']:
        numerical_cols.append(column)      

cols = categorical_cols + numerical_cols
print(len(df.columns), len(cols))

15 14


In [21]:
# a function for rapid f1score prediction

scores = []

def get_f1_score(model):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    res = f1_score(y_test, preds, average="weighted")
    scores.append(res)
    
def clean_scores():
    scores.clear()

In [22]:
# importing all the models we will be using

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [23]:
# defining our models
RF = RandomForestClassifier()
LR = LogisticRegression()
KNN = KNeighborsClassifier()
SVM = SVC(C=0.0001)
DT = DecisionTreeClassifier()
BGDT = BaggingClassifier(DecisionTreeClassifier())
ADB = AdaBoostClassifier(DecisionTreeClassifier())
GBD = GradientBoostingClassifier()
EVCh = VotingClassifier(estimators=[('lr',LR),('rf',RF),('svm',SVM), ("gbd", GBD),("adb",ADB), 
                                  ("dt",DT), ("bgdt", BGDT)],voting='hard')
EVCs = VotingClassifier(estimators=[('lr',LR),('rf',RF), ("gbd", GBD),("adb",ADB), 
                                  ("dt",DT), ("bgdt", BGDT)],voting='soft')

models = [LR, KNN, SVM, DT, ADB , BGDT, EVCh, EVCs, RF, GBD]

In [24]:
for model in models:
    get_f1_score(model)

### OK, we finally got some fucking predictions.

I am losing my mind. Let's compute the F1 score now, I bet it is not good huh

In [25]:
scores

[0.5588696881450835,
 0.6051623224545727,
 0.3882143359429938,
 0.5995640587723204,
 0.6014670191557597,
 0.6115605884262999,
 0.6120135012158804,
 0.6246849098521746,
 0.6531049105197436,
 0.6479534904278655]

In [None]:
clean_scores()

# Testing the holdout dataset

In [None]:
holdoutDF

In [None]:
X_holdout = holdoutDF.drop([target], axis=1)
y_holdout = holdoutDF[[target]]

In [None]:
X_holdout

In [None]:
predsHoldout = model.predict(X_holdout)
resHoldout = f1_score(y_holdout, predsHoldout, average = "weighted")
print(resHoldout)

# Starting tuning our models

In [None]:
import pickle

## Let's start with the decision trees

In [26]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [27]:
results = []

In [28]:
param_grid = {
    "n_estimators": [10, 100],
    "max_depth": [3, 7, 9],
}

RF = RandomForestClassifier() 

GBD = GradientBoostingClassifier()

In [106]:
gridSearchModels = [RF]

for model in gridSearchModels:

    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    
    preds = best_model.predict(X_test)
    res = f1_score(y_test, preds, average = "weighted")
    print(res)

    print(f"Best Hyperparameters for {model}: {grid_search.best_params_}")

0.6380099633506704
Best Hyperparameters for RandomForestClassifier(): {'max_depth': 7, 'n_estimators': 100}


In [107]:
best_model

In [None]:
predsBMhold = best_model.predict(X_holdout)
resBMhold = f1_score(y_holdout, predsBMhold, average = "weighted")
resBMhold

### other one for GBD

In [30]:
n_estimators = [10, 100, 1000]
learning_rate = [0.001, 0.01, 0.1]
subsample = [0.5, 0.7, 1.0]
max_depth = [3, 7, 9]

In [31]:
gridGBD = dict(learning_rate=learning_rate, n_estimators=n_estimators, subsample=subsample, max_depth=max_depth)
cvGBD = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_searchGBD = GridSearchCV(GBD, gridGBD, n_jobs=-1, cv=cvGBD)
grid_searchGBD.fit(X_train, np.ravel(y_train,order='C'))
best_modelGBD = grid_searchGBD.best_estimator_

predsGBD = best_modelGBD.predict(X_test)

resGBD = f1_score(y_test, predsGBD, average = "weighted")


KeyboardInterrupt



In [None]:
resGBD

In [None]:
predsGBDholdout = best_modelGBD.predict(X_holdout)
resGBDholdout = f1_score(y_holdout, predsGBDholdout, average = "weighted")
resGBDholdout

In [None]:
pklGBDt = "GBDt.pkl"

with open(pklGBDt, "wb") as file:
    pickle.dump(best_modelGBD, file)
    
with open(pklGBDt, 'rb') as file:
    pkl_model = pickle.load(file)
    
pkl_model

### other one for RF

In [32]:
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']

In [33]:
gridRF = dict(n_estimators=n_estimators,max_features=max_features)
cvRF = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_searchRF = GridSearchCV(estimator=RF, param_grid=gridRF, n_jobs=-1, cv=cvRF, scoring='accuracy',error_score=0)
grid_searchRF.fit(X_train, np.ravel(y_train,order='C'))
best_modelRF = grid_searchRF.best_estimator_

predsRF = best_modelRF.predict(X_test)

resRF = f1_score(y_test, predsRF, average = "weighted")

In [34]:
resRF

0.6347008170136419

In [35]:
predsRFholdout = best_modelRF.predict(X_holdout)
resRFholdout = f1_score(y_holdout, predsRFholdout, average = "weighted")
resRFholdout

NameError: name 'X_holdout' is not defined

## Tuning logistic regression

In [36]:
LRt = LogisticRegression()

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01, 0.001]

In [37]:
gridLRt = dict(solver=solvers,penalty=penalty,C=c_values)
cvLRt = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_searchLRt = GridSearchCV(estimator=LRt, param_grid=gridLRt, n_jobs=-1, cv=cvLRt)
grid_searchLRt.fit(X_train, np.ravel(y_train,order='C'))
best_modelLRt = grid_searchLRt.best_estimator_

predsLRt = best_modelLRt.predict(X_test)

resLRt = f1_score(y_test, predsLRt, average = "weighted")

In [38]:
resLRt

0.5561972512292434

In [None]:
predsLRtholdout = best_modelLRt.predict(X_holdout)
resLRtholdout = f1_score(y_holdout, predsLRtholdout, average = "weighted")
resLRtholdout

## Creating and tuning a KR

In [39]:
from sklearn.linear_model import RidgeClassifier

In [40]:
RC = RidgeClassifier()
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [41]:
gridRC = dict(alpha = alpha)
cvRC = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_searchRC = GridSearchCV(estimator=RC, param_grid=gridRC, n_jobs=-1, cv=cvRC)
grid_searchRC.fit(X_train, np.ravel(y_train,order='C'))
best_modelRC = grid_searchRC.best_estimator_

predsRC = best_modelRC.predict(X_test)

resRC = f1_score(y_test, predsRC, average = "weighted")

In [42]:
res

0.6159341532309657

In [None]:
predsRCholdout = best_modelRC.predict(X_holdout)
resRCholdout = f1_score(y_holdout, predsRCholdout, average = "weighted")
resRCholdout

## Tuning a KNN 

In [43]:
KNNt = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

In [44]:
gridKNNt = dict(n_neighbors = n_neighbors,weights = weights, metric = metric)
cvKNNt = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_searchKNNt = GridSearchCV(estimator=KNNt, param_grid=gridKNNt, n_jobs=-1, cv=cvKNNt)
grid_searchKNNt.fit(X_train, np.ravel(y_train,order='C'))
best_modelKNNt = grid_searchKNNt.best_estimator_

predsKNNt = best_modelKNNt.predict(X_test)

resKNNt = f1_score(y_test, predsKNNt, average = "weighted")

In [45]:
res

0.6159341532309657

In [None]:
predsKNNtholdout = best_modelKNNt.predict(X_holdout)
resKNNtholdout = f1_score(y_holdout, predsKNNtholdout, average = "weighted")
resKNNtholdout

## Tuning SVM

No use, is bad

In [46]:
SVMt = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]

In [47]:
gridSVM = dict(kernel = kernel,C=C)
cvSVM = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_searchSVM = GridSearchCV(estimator=SVMt, param_grid=gridSVM, n_jobs=-1, cv=cvSVM)
grid_searchSVM.fit(X_train, np.ravel(y_train,order='C'))
best_modelSVM = grid_searchSVM.best_estimator_

predsSVM = best_modelSVM.predict(X_test)

resSVM = f1_score(y_test, predsSVM, average = "weighted")

In [48]:
resSVM

0.6250097359789563

In [None]:
predsSVMtholdout = best_modelSVM.predict(X_holdout)
resSVMtholdout = f1_score(y_holdout, predsSVMtholdout, average = "weighted")
resSVMtholdout

# Making predictions

In [92]:
testDF = pd.read_csv("Clinical_Data.test.txt", sep = "\t")
# testDF.drop(["Unnamed: 0"], axis = 1, inplace = True)
testDF

Unnamed: 0,ID_new,Gender,Height,Weight,AgeAtDiag,SmokeAtDiag,Leu_signe,Leu,Hb_signe,Hb,...,Fer_signe,Fer,B12_signe,B12,Fol_signe,Fol,Alk_signe,Alk,Alb_signe,Alb
0,2,2,163,52,16,1,,,,,...,,,,,,,,,,
1,8,1,170,77,59,1,,6.9,,169.0,...,,524.0,,128.0,,4.4,,53.0,,
2,13,2,158,49,21,0,,13.5,,130.0,...,,,,,,,,,,
3,24,1,180,72,10,0,,9.3,,150.0,...,,43.0,,289.0,,12.6,,68.0,,
4,55,2,164,58,34,1,,4.5,,141.0,...,,,,,,,,,,
5,62,1,182,80,43,1,,4.1,,126.0,...,,16.0,,171.0,,25.6,,53.0,,
6,95,1,168,54,19,1,,10.9,,127.0,...,,630.0,,9188.0,<,<459.2,,56.0,,28.0
7,104,2,150,44,41,1,,8.5,,113.0,...,,36.0,,135.0,,18.7,,82.0,,30.0
8,153,1,175,95,34,0,,10.7,,141.0,...,,,,,,,,84.0,,
9,159,2,159,55,54,1,,15.0,,131.0,...,,258.0,,436.0,,840,,73.0,,41.0


In [93]:
df1 = testDF.copy()
df1.dtypes

ID_new           int64
Gender           int64
Height           int64
Weight           int64
AgeAtDiag        int64
SmokeAtDiag      int64
Leu_signe      float64
Leu            float64
Hb_signe       float64
Hb             float64
CRP_signe       object
CRP             object
ESR_signe      float64
ESR            float64
Fer_signe      float64
Fer            float64
B12_signe      float64
B12            float64
Fol_signe       object
Fol             object
Alk_signe      float64
Alk            float64
Alb_signe      float64
Alb            float64
dtype: object

In [94]:
df1.columns

Index(['ID_new', 'Gender', 'Height', 'Weight', 'AgeAtDiag', 'SmokeAtDiag',
       'Leu_signe', 'Leu', 'Hb_signe', 'Hb', 'CRP_signe', 'CRP', 'ESR_signe',
       'ESR', 'Fer_signe', 'Fer', 'B12_signe', 'B12', 'Fol_signe', 'Fol',
       'Alk_signe', 'Alk', 'Alb_signe', 'Alb'],
      dtype='object')

In [95]:
bad_cols = ['Leu_signe', 'CRP_signe', 'Hb_signe', "Fer_signe", "Fol_signe", "B12_signe", "Alk_signe", "Alb_signe"]

df1.drop(['ESR_signe','Leu_signe', 'CRP_signe', 'Hb_signe', "Fer_signe", "Fol_signe", "B12_signe", "Alk_signe", "Alb_signe"], axis = 1, inplace = True)
testDF.drop(['ESR_signe','Leu_signe', 'CRP_signe', 'Hb_signe', "Fer_signe", "Fol_signe", "B12_signe", "Alk_signe", "Alb_signe"], axis = 1, inplace = True)

In [96]:
string_types = []

for column in df1.columns:
    if df1[column].dtype == "object":
        print(column)
        string_types.append(column)
        df1[column] = df1[column].astype("string")

CRP
Fol


In [97]:
for column in string_types:
    df1[column] = df1[column].str.replace('<', '')
    df1[column] = df1[column].str.replace('>', '')

for column in string_types:
    if column in string_types:
        print(column)
        df1[column] = df1[column].astype("float64")
df1.dtypes

CRP
Fol


ID_new           int64
Gender           int64
Height           int64
Weight           int64
AgeAtDiag        int64
SmokeAtDiag      int64
Leu            float64
Hb             float64
CRP            float64
ESR            float64
Fer            float64
B12            float64
Fol            float64
Alk            float64
Alb            float64
dtype: object

In [98]:
# simple func to input numerical 

imputer = SimpleImputer(strategy = "mean") 
df1 = pd.DataFrame(imputer.fit_transform(df1))

df1.columns = testDF.columns

df1

Unnamed: 0,ID_new,Gender,Height,Weight,AgeAtDiag,SmokeAtDiag,Leu,Hb,CRP,ESR,Fer,B12,Fol,Alk,Alb
0,2.0,2.0,163.0,52.0,16.0,1.0,8.990263,132.263158,11.0,21.026316,238.464286,761.825,143.927273,73.129032,38.363636
1,8.0,1.0,170.0,77.0,59.0,1.0,6.9,169.0,2.0,1.0,524.0,128.0,4.4,53.0,38.363636
2,13.0,2.0,158.0,49.0,21.0,0.0,13.5,130.0,62.7,30.0,238.464286,761.825,143.927273,73.129032,38.363636
3,24.0,1.0,180.0,72.0,10.0,0.0,9.3,150.0,0.9,21.026316,43.0,289.0,12.6,68.0,38.363636
4,55.0,2.0,164.0,58.0,34.0,1.0,4.5,141.0,8.0,6.0,238.464286,761.825,143.927273,73.129032,38.363636
5,62.0,1.0,182.0,80.0,43.0,1.0,4.1,126.0,1.0,21.026316,16.0,171.0,25.6,53.0,38.363636
6,95.0,1.0,168.0,54.0,19.0,1.0,10.9,127.0,28.0,56.0,630.0,9188.0,459.2,56.0,28.0
7,104.0,2.0,150.0,44.0,41.0,1.0,8.5,113.0,1.0,21.026316,36.0,135.0,18.7,82.0,30.0
8,153.0,1.0,175.0,95.0,34.0,0.0,10.7,141.0,9.5,21.026316,238.464286,761.825,143.927273,84.0,38.363636
9,159.0,2.0,159.0,55.0,54.0,1.0,15.0,131.0,46.0,40.0,258.0,436.0,840.0,73.0,41.0


In [99]:
continuous = ['Height', 'Weight', 'AgeAtDiag', 'Leu', 'Hb', 'Hb', 'CRP', 'ESR', "Fer", "B12",
             "Fol", "Alk", 'Alb']

scaler = StandardScaler()

for var in continuous:
    df1[var] = df1[var].astype('float64')
    df1[var] = scaler.fit_transform(df1[var].values.reshape(-1, 1))
    
df1

Unnamed: 0,ID_new,Gender,Height,Weight,AgeAtDiag,SmokeAtDiag,Leu,Hb,CRP,ESR,Fer,B12,Fol,Alk,Alb
0,2.0,2.0,-0.781231,-1.323558,-1.140254,1.0,0.0,4.708521e-16,0.02111664,0.0,0.0,-8.097831000000001e-17,0.0,0.0,0.0
1,8.0,1.0,-0.056911,0.434155,1.813416,1.0,-0.305193,2.420335,-0.4782842,-1.461421,0.757332,-0.4514689,-0.630685,-0.8849,0.0
2,13.0,2.0,-1.298602,-1.534484,-0.796804,0.0,0.658453,-0.1491037,2.889897,0.654855,0.0,-8.097831000000001e-17,0.0,0.0,0.0
3,24.0,1.0,0.977832,0.082613,-1.552394,0.0,0.045224,1.168557,-0.5393221,0.0,-0.518434,-0.3367898,-0.59362,-0.225479,0.0
4,55.0,2.0,-0.677756,-0.901707,0.096166,1.0,-0.65561,0.5756097,-0.1453503,-1.096546,0.0,-8.097831000000001e-17,0.0,0.0,0.0
5,62.0,1.0,1.18478,0.645081,0.714376,1.0,-0.714012,-0.4126359,-0.5337732,0.0,-0.590046,-0.4208404,-0.534858,-0.8849,0.0
6,95.0,1.0,-0.263859,-1.182941,-0.934184,1.0,0.278835,-0.3467529,0.9644293,2.552206,1.038478,6.001903,1.425081,-0.753016,-2.372761
7,104.0,2.0,-2.126396,-1.886027,0.576996,1.0,-0.071582,-1.269115,-0.5337732,0.0,-0.537,-0.4464829,-0.566047,0.38998,-1.91486
8,153.0,1.0,0.460461,1.699709,0.096166,0.0,0.249633,0.5756097,-0.06211683,0.0,0.0,-8.097831000000001e-17,0.0,0.477903,0.0
9,159.0,2.0,-1.195128,-1.112633,1.469966,1.0,0.877463,-0.08322068,1.963231,1.384605,0.051815,-0.2320828,3.146356,-0.005672,0.603597


In [100]:
df1.drop("ID_new", axis = 1, inplace = True)

In [108]:
subm_preds = best_model.predict(df1)

In [109]:
subm = pd.DataFrame()
subm['PatientId'] = testDF["ID_new"]
subm['Diag'] = subm_preds
subm['Diag'] = subm['Diag'].astype("int")

In [110]:
subm.to_csv("basic.csv", index = False)
subm

Unnamed: 0,PatientId,Diag
0,2,0
1,8,0
2,13,0
3,24,0
4,55,0
5,62,0
6,95,0
7,104,0
8,153,0
9,159,0
