In [5]:
import tensorflow as tf
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
from keras.layers import * 
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential

In [33]:
def main():
  data = read_data()
  clean_data(data)
  data = handle_missing_values(data)
  axes = min_max_scaling(data)
  results = modify_dimensionality(data,axes[0],axes[1])

  # Result Printing
  indexs = ["Cross value score:","\nNeural Network Accuracy score:"]

  result_no = 0
  
  for result in results:
    print(indexs[result_no])
    result_no += 1
    for key, value in result.items():
      print ("  ",key,":" ,value)
  print("\n\n")

In [7]:
def read_data():

  # convert data into dataframe
  data = pd.read_csv("kidney_disease_fixed.csv", encoding= "unicode_escape")
  data = data.drop("id", axis = 1)
  data = shuffle(data, random_state=19)
  return data

In [8]:
def clean_data(data):
  # Save tags and replace all empty values with "?"

  tags = []

  for col in data:
    tags.append(col)
    for value in data[col]:
      if "?" in str(value) or " " in str(value):
        data[col] = data[col].replace(to_replace = value, value = np.nan)

  # Arrange column types

  for tag in tags:
    try:
      data[tag] = pd.to_numeric(data[tag])
    except:
      pass

  # Classify column types

  non_numeric = [col for col in data if data[col].dtype == object]
  numeric = [col for col in data if data[col].dtype == float]

  # Confirm numerics

  for col in numeric:
    pd.to_numeric(data[col])

  # Conversion of non_numerics to binary
  # Cleaning before conversion

  data.dm = data.dm.replace("\tno", "no").replace(" yes", "yes").replace("\tyes", "yes")
  data.cad = data.cad.replace("\tno", "no")
  data.classification=data.classification.replace("ckd\t","ckd")

  # Conversion to binary

  data['rbc'] = data['rbc'].replace(['normal','abnormal'], [1,0])
  data['pc'] = data['pc'].replace(['normal','abnormal'], [1,0])
  data['pcc'] = data['pcc'].replace(['present','notpresent'], [1,0])
  data['ba'] = data['ba'].replace(['present','notpresent'], [1,0])
  data['htn'] = data['htn'].replace(['yes','no'], [1,0])
  data['dm'] = data['dm'].replace(['yes','no'], [1,0])
  data['cad'] = data['cad'].replace(['yes','no'], [1,0])
  data['appet'] = data['appet'].replace(['good','poor'], [1,0])
  data['pe'] = data['pe'].replace(['yes','no'], [1,0])
  data['ane'] = data['ane'].replace(['yes','no'], [1,0])
  data['classification'] = data['classification'].replace(['ckd','notckd'], [1,0])

In [9]:
def handle_missing_values(data):

  from sklearn.impute import KNNImputer

  imputer = KNNImputer(n_neighbors=5)
  data = pd.DataFrame(imputer.fit_transform(data),columns = data.columns)
  return(data)

In [10]:
def min_max_scaling(data):

  from sklearn.preprocessing import MinMaxScaler

  scaler = MinMaxScaler()
  data = scaler.fit_transform(data)

  # Assigning the dimensions of data
  X = data[:,:-1]
  Y = data[:,-1]
  return(X,Y)

In [41]:
def modify_dimensionality(data,X,Y):

  from sklearn.decomposition import PCA

  # Result dictionaries
  NN_results = {}
  CM_cvs_results = {}

  # Loop dimensionality
  for dimension in range(2,25):
    dimensionality = dimension

    # Reduce dimensionality

    try:
      pca = PCA(n_components=dimensionality)
      principalComponents = pca.fit_transform(X)
      X = pd.DataFrame(principalComponents, columns = ["PC1", "PC2"])
      X = X.to_numpy()
    except:
      pass

    train_test_split(data)
    apply_classification_model(X,Y,dimensionality,CM_cvs_results)
    apply_neural_network(X,Y,dimensionality,NN_results)

  # Return result dictionaries
  return(CM_cvs_results,NN_results)

In [12]:
def train_test_split(data):
  train_size = int(len(data) * .8)
  test_size = len(data) - train_size

  df_train = data[:train_size]
  df_test = data[train_size:]

In [36]:
def apply_classification_model(X,Y,dimensionality,CM_cvs_results):
  # Import
  from sklearn import svm
  from sklearn import metrics
  from sklearn.model_selection import cross_val_score
  from sklearn.model_selection import train_test_split

  # Initialize test and train
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)


  def evaluateCVS(results, model, model_name):
    scores = cross_val_score(model, X, Y, cv=5, scoring='accuracy')
    CM_cvs_results["D(" + str(dimensionality) + ")_" + model_name + "_accuracy"] = '%.4f' % (scores.mean()*100)
    scores = cross_val_score(model, X, Y, cv=5, scoring='f1')
    CM_cvs_results["D(" + str(dimensionality) + ")_" + model_name + "_f1_score"] = '%.4f' % (scores.mean()*100)
    scores = cross_val_score(model, X, Y, cv=5, scoring='recall')
    CM_cvs_results["D(" + str(dimensionality) + ")_" + model_name + "_recall_score"] = '%.4f' % (scores.mean()*100)

  # Support Vector Machine

  supvm = svm.SVC(decision_function_shape='ovo')

  # Cross Validation Evaluation 
  supvm.fit(X_train,Y_train)
  results = supvm.predict(X_test)

  evaluateCVS(results, supvm, "SVM")


  # Logistic Regression

  from sklearn.linear_model import LogisticRegression

  logreg = LogisticRegression()

  # Cross Validation Evaluation 
  logreg.fit(X_train, Y_train)
  results = logreg.predict(X_test)
  evaluateCVS(results, logreg, "LogReg")


  # K-Nearest Neighbor

  from sklearn.neighbors import KNeighborsClassifier

  for neighbor in [1,5]:
    knn = KNeighborsClassifier(n_neighbors=neighbor)

    # Cross Validation Evaluation 
    knn.fit(X_train, Y_train)
    results = knn.predict(X_test)

    evaluateCVS(results, knn, f"KNN-{neighbor}")


  # Decision Tree 

  from sklearn.tree import DecisionTreeClassifier

  for technique in ["gini", "entropy"]:
    clf = DecisionTreeClassifier(criterion = technique)

    # Cross Validation Evaluation 
    clf.fit(X_train, Y_train)
    results = clf.predict(X_test)

    evaluateCVS(results, clf, f"{technique}_CLF")


  # Random Forest Classifier

  from sklearn.ensemble import RandomForestClassifier

  rfc = RandomForestClassifier()

  # Cross Validation Evaluation 
  rfc.fit(X_train, Y_train)
  results = rfc.predict(X_test)

  evaluateCVS(results, rfc, "RFC")

In [14]:
def apply_neural_network(X,Y,dimensionality,NN_results):
  # Determine model types/neuron count per layer
  neuron_types = ["Type1", "Type2", "Type3"]

  # Initialize types
  neuron_modifier = {"Type1":[24,12,1],
                    "Type2":[24,4,1],
                    "Type3":[24,3,1]}

  # Empty dictionaries for models
  model_accuracy = dict.fromkeys(["Type1", "Type2", "Type3"])
  model_history = dict.fromkeys(["Type1", "Type2", "Type3"])

  # Initialize test and train
  from sklearn.model_selection import train_test_split

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

  # Creating the model


  def create_model(layer_size,input_shape):
    model = Sequential(name = "Sequential_Test_Model")
    model.add(Input(shape = (input_shape)))
    model.add(Dense(layer_size[1], activation = 'relu'))
    model.add(Dense(layer_size[2], activation = 'sigmoid'))
    return model

  for type_ in neuron_types:
    model = create_model(neuron_modifier[type_],X_train[0].shape)
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy']) 
    model.summary()

    # Fit model
    model_history[type_] = model.fit(X_train, Y_train, epochs=10)

    # Evaluate model
    accuracy = model.evaluate(X_test, Y_test)
    model_accuracy[type_] = accuracy[1]

    NN_results[f"d({dimensionality})"] = model_accuracy

In [40]:
if "__main__" == __name__:
    main()


Cross value score:
   D(2)_SVM_accuracy : 96.7500
   D(2)_SVM_f1_score : 97.3809
   D(2)_SVM_recall_score : 97.2000
   D(2)_LogReg_accuracy : 93.7500
   D(2)_LogReg_f1_score : 94.7807
   D(2)_LogReg_recall_score : 91.6000
   D(2)_KNN-1_accuracy : 97.0000
   D(2)_KNN-1_f1_score : 97.5959
   D(2)_KNN-1_recall_score : 97.6000
   D(2)_KNN-5_accuracy : 96.2500
   D(2)_KNN-5_f1_score : 97.0294
   D(2)_KNN-5_recall_score : 98.0000
   D(2)_gini_CLF_accuracy : 97.0000
   D(2)_gini_CLF_f1_score : 97.3809
   D(2)_gini_CLF_recall_score : 98.0000
   D(2)_entropy_CLF_accuracy : 97.0000
   D(2)_entropy_CLF_f1_score : 97.1711
   D(2)_entropy_CLF_recall_score : 98.0000
   D(2)_RFC_accuracy : 97.2500
   D(2)_RFC_f1_score : 97.6157
   D(2)_RFC_recall_score : 98.4000
   D(3)_SVM_accuracy : 96.7500
   D(3)_SVM_f1_score : 97.3809
   D(3)_SVM_recall_score : 97.2000
   D(3)_LogReg_accuracy : 93.7500
   D(3)_LogReg_f1_score : 94.7807
   D(3)_LogReg_recall_score : 91.6000
   D(3)_KNN-1_accuracy : 97.0000
   D(3