In [285]:
%cd /content/drive/MyDrive/ML/Final

/content/drive/MyDrive/ML/Final


In [286]:
!ls

 final.ipynb  'Final Project Requirement.xlsx'


#Preprocessed

In [287]:
import pandas as pd

df = pd.read_excel('Final Project Requirement.xlsx')


In [288]:
df1 = df.dropna(subset=['Therapeutic Dose of Warfarin'], axis=0)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5528 entries, 0 to 5699
Data columns (total 13 columns):
 #   Column                                                             Non-Null Count  Dtype  
---  ------                                                             --------------  -----  
 0   Gender                                                             5524 non-null   object 
 1   Race (Reported)                                                    5022 non-null   object 
 2   Age                                                                5489 non-null   object 
 3   Height (cm)                                                        4447 non-null   float64
 4   Weight (kg)                                                        5256 non-null   float64
 5   Diabetes                                                           3126 non-null   float64
 6   Simvastatin (Zocor)                                                3753 non-null   float64
 7   Atorvastatin (Lipitor)  

In [289]:
df1.isna().sum()

Gender                                                                  4
Race (Reported)                                                       506
Age                                                                    39
Height (cm)                                                          1081
Weight (kg)                                                           272
Diabetes                                                             2402
Simvastatin (Zocor)                                                  1775
Atorvastatin (Lipitor)                                               3073
Target INR                                                           4269
INR on Reported Therapeutic Dose of Warfarin                          560
Cyp2C9 genotypes                                                      109
VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T    1633
Therapeutic Dose of Warfarin                                            0
dtype: int64

In [290]:
#from sklearn_pandas import CategoricalImputer
#import numpy as np 
# handling NaN values
#imputer = CategoricalImputer()
#data = np.array(df)
#data1 = imputer.fit_transform(data)

In [291]:
train_target=df1['Therapeutic Dose of Warfarin']
train_target.loc[train_target<= 30] = 0
train_target.loc[train_target> 30] = 1
train_target.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


0.0    3124
1.0    2404
Name: Therapeutic Dose of Warfarin, dtype: int64

In [292]:
train_features = df1.drop(columns='Therapeutic Dose of Warfarin')

In [305]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_features, train_target, test_size=0.3, random_state=42)
X_train.shape

(3869, 12)

In [294]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import preprocessing

# select features types
float64_features = list(train_features.columns[train_features.dtypes == 'float64'])#train_features.select_dtypes(include=['float64']).columns
object_features = train_features.select_dtypes(include=['object']).columns

In [295]:
# create transformer for each feature types
float64_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='most_frequent')),('scaler', StandardScaler())])

object_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [296]:
# create a process
preprocessor = ColumnTransformer(
    transformers=[
        ('float64', float64_transformer, float64_features),
        ('object', object_transformer, object_features)])

#Models

##LogisticRegression

In [297]:
from sklearn.linear_model import LogisticRegression

LR = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', LogisticRegression(max_iter=10000))])  
LR.fit(X_train, y_train)
LR.predict(X_test)

array([1., 0., 1., ..., 0., 0., 0.])

##Support Vector Machine

In [298]:
from sklearn.svm import SVC

svm_model_polynomial = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', SVC(kernel = 'poly', degree = 3, C = 10 ))])  
svm_model_polynomial.fit(X_train, y_train)
svm_model_polynomial.predict(X_test)

array([1., 0., 1., ..., 0., 0., 0.])

## Decision Tree

In [299]:
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', DecisionTreeClassifier(max_depth = 10000))])  
decision_tree_model.fit(X_train, y_train)
decision_tree_model.predict(X_test)

array([1., 0., 1., ..., 0., 0., 1.])

##Neural networks 

In [300]:
from sklearn.neural_network import MLPClassifier

neuralnetwor = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', MLPClassifier(max_iter=10000, alpha=1e-5, 
                             hidden_layer_sizes=(5, 5), random_state=1))])  
neuralnetwor.fit(X_train, y_train)
neuralnetwor.predict(X_test)

array([1., 0., 1., ..., 0., 0., 0.])

#Report accuracy, precision, recall, F1-score and ROC curves (AUC-ROC)

In [301]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
# Make predictions and evaluate
logistic_pred = LR.predict(X_test)
logistic_acc = accuracy_score(y_test, logistic_pred )
logistic_prec = precision_score(y_test, logistic_pred )
logistic_recall = recall_score(y_test, logistic_pred )
logistic_roc = roc_auc_score(y_test, logistic_pred )
logistic_f1 = f1_score(y_test, logistic_pred )

svm_pred = svm_model_polynomial.predict(X_test)
svm_acc = accuracy_score(y_test, svm_pred )
svm_prec = precision_score(y_test, svm_pred )
svm_recall = recall_score(y_test, svm_pred )
svm_roc = roc_auc_score(y_test, svm_pred )
svm_f1 = f1_score(y_test, svm_pred )

decision_tree_pred = decision_tree_model.predict(X_test)
decision_tree_acc = accuracy_score(y_test, decision_tree_pred )
decision_tree_prec = precision_score(y_test, decision_tree_pred )
decision_tree_recall = recall_score(y_test, decision_tree_pred )
decision_tree_roc = roc_auc_score(y_test, decision_tree_pred )
decision_tree_f1 = f1_score(y_test, decision_tree_pred )

neuralnetwor_pred = neuralnetwor.predict(X_test)
neuralnetwor_acc = accuracy_score(y_test, neuralnetwor_pred )
neuralnetwor_prec = precision_score(y_test, neuralnetwor_pred )
neuralnetwor_recall = recall_score(y_test, neuralnetwor_pred )
neuralnetwor_roc = roc_auc_score(y_test, neuralnetwor_pred )
neuralnetwor_f1 = f1_score(y_test, neuralnetwor_pred )

In [302]:
Methods = ['Logistic Regression', 'Support Vector Machine','Decision Tree','Neural Networ']
Accuracy = [logistic_acc, svm_acc, decision_tree_acc, neuralnetwor_acc]
Precision = [logistic_prec, svm_prec, decision_tree_prec, neuralnetwor_prec]
Recall = [logistic_recall, svm_recall, decision_tree_recall, neuralnetwor_recall]
Roc_Auc = [logistic_roc, svm_roc, decision_tree_roc, neuralnetwor_roc]
F1_score = [logistic_f1, svm_f1, decision_tree_f1, neuralnetwor_f1]

d = {
    'Methods':Methods,
    'Accuracy':Accuracy,
    'Precision':Precision,
    'Recall':Recall,
    'Roc_Auc':Roc_Auc,
    'F1_score':F1_score 
}

multiple_evaluation_metrics = pd.DataFrame(d)
multiple_evaluation_metrics

Unnamed: 0,Methods,Accuracy,Precision,Recall,Roc_Auc,F1_score
0,Logistic Regression,0.745027,0.724771,0.661088,0.735003,0.691466
1,Support Vector Machine,0.731465,0.708046,0.644351,0.721061,0.674699
2,Decision Tree,0.669982,0.614141,0.635983,0.665922,0.624872
3,Neural Networ,0.753165,0.726269,0.688285,0.745416,0.706767
