In [3]:
import tensorflow as tf
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
from keras.layers import * 
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential

In [4]:
def main():
  data = read_data()
  clean_data(data)
  data = handle_missing_values(data)
  axes = min_max_scaling(data)
  results = modify_dimensionality(data,axes[0],axes[1])

  # Result Printing
  indexs = ["Cross value score:","\nTrain-Test score:","\nNeural Network Accuracy score:"]

  for result in results:
    result_no = 0
    print(indexs[result_no])
    for key, value in result.items():
      print ("  ",key,":" ,value)
    result_no += 1
  print("\n\n")

In [5]:
def read_data():

  # convert data into dataframe
  data = pd.read_csv("kidney_disease_fixed.csv", encoding= "unicode_escape")
  data = data.drop("id", axis = 1)
  data = shuffle(data, random_state=19)
  return data

In [6]:
def clean_data(data):
  # Save tags and replace all empty values with "?"

  tags = []

  for col in data:
    tags.append(col)
    for value in data[col]:
      if "?" in str(value) or " " in str(value):
        data[col] = data[col].replace(to_replace = value, value = np.nan)

  # Arrange column types

  for tag in tags:
    try:
      data[tag] = pd.to_numeric(data[tag])
    except:
      pass

  # Classify column types

  non_numeric = [col for col in data if data[col].dtype == object]
  numeric = [col for col in data if data[col].dtype == float]

  # Confirm numerics

  for col in numeric:
    pd.to_numeric(data[col])

  # Conversion of non_numerics to binary
  # Cleaning before conversion

  data.dm = data.dm.replace("\tno", "no").replace(" yes", "yes").replace("\tyes", "yes")
  data.cad = data.cad.replace("\tno", "no")
  data.classification=data.classification.replace("ckd\t","ckd")

  # Conversion to binary

  data['rbc'] = data['rbc'].replace(['normal','abnormal'], [1,0])
  data['pc'] = data['pc'].replace(['normal','abnormal'], [1,0])
  data['pcc'] = data['pcc'].replace(['present','notpresent'], [1,0])
  data['ba'] = data['ba'].replace(['present','notpresent'], [1,0])
  data['htn'] = data['htn'].replace(['yes','no'], [1,0])
  data['dm'] = data['dm'].replace(['yes','no'], [1,0])
  data['cad'] = data['cad'].replace(['yes','no'], [1,0])
  data['appet'] = data['appet'].replace(['good','poor'], [1,0])
  data['pe'] = data['pe'].replace(['yes','no'], [1,0])
  data['ane'] = data['ane'].replace(['yes','no'], [1,0])
  data['classification'] = data['classification'].replace(['ckd','notckd'], [1,0])

In [7]:
def handle_missing_values(data):

  from sklearn.impute import KNNImputer

  imputer = KNNImputer(n_neighbors=5)
  data = pd.DataFrame(imputer.fit_transform(data),columns = data.columns)
  return(data)

In [8]:
def min_max_scaling(data):

  from sklearn.preprocessing import MinMaxScaler

  scaler = MinMaxScaler()
  data = scaler.fit_transform(data)

  # Assigning the dimensions of data
  X = data[:,:-1]
  Y = data[:,-1]
  return(X,Y)

In [9]:
def modify_dimensionality(data,X,Y):

  from sklearn.decomposition import PCA

  # Result dictionaries
  NN_results = {}
  CM_cvs_results = {}
  CM_tt_results = {}

  # Loop dimensionality

  for dimension in range(2,24):
    dimensionality = dimension

    # Reduce dimensionality

    try:
      pca = PCA(n_components=dimensionality)
      principalComponents = pca.fit_transform(X)
      X = pd.DataFrame(principalComponents, columns = ["PC1", "PC2"])
      X = X.to_numpy()
    except:
      pass

    train_test_split(data)
    apply_classification_model(X,Y,dimensionality,CM_cvs_results,CM_tt_results)
    apply_neural_network(X,Y,dimensionality,NN_results)

  # Return result dictionaries
  return(CM_cvs_results,CM_tt_results,NN_results)

In [10]:
def train_test_split(data):
  train_size = int(len(data) * .8)
  test_size = len(data) - train_size

  df_train = data[:train_size]
  df_test = data[train_size:]

In [11]:
def apply_classification_model(X,Y,dimensionality,CM_cvs_results,CM_tt_results):
  # Import
  from sklearn import svm
  from sklearn import metrics
  from sklearn.model_selection import cross_val_score
  from sklearn.model_selection import train_test_split

  # Initialize test and train
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)


  def evaluateCVS(results):
    clf = svm.SVC(kernel='linear', C=1)
    scores = cross_val_score(clf, X, Y, cv=5, scoring='accuracy')
    CM_cvs_results["D(" + str(dimensionality) + ")_accuracy"] = '%.4f' % (scores.mean()*100)
    scores = cross_val_score(clf, X, Y, cv=5, scoring='f1')
    CM_cvs_results["D(" + str(dimensionality) + ")_f1_score"] = '%.4f' % (scores.mean()*100)
    scores = cross_val_score(clf, X, Y, cv=5, scoring='recall')
    CM_cvs_results["D(" + str(dimensionality) + ")_recall_score"] = '%.4f' % (scores.mean()*100)
    
  def evaluateTT(results,model):
    CM_tt_results["D(" + str(dimensionality) + ")_" + model +"_accuracy"] = '%.4f' % (metrics.accuracy_score(Y_test, results) * 100)
    CM_tt_results["D(" + str(dimensionality) + ")_" + model +"_f1_score"] = '%.4f' % (metrics.f1_score(Y_test, results) * 100)
    CM_tt_results["D(" + str(dimensionality) + ")_" + model +"_recall_score"] = '%.4f' % (metrics.recall_score(Y_test, results) * 100)


  # Support Vector Machine

  supvm = svm.SVC(decision_function_shape='ovo')

  # Entire Data
  supvm.fit(X,Y)
  results = supvm.predict(X)

  evaluateCVS(results)

  # Train-test Data
  supvm.fit(X_train, Y_train)
  results = supvm.predict(X_test)

  evaluateTT(results, "SVM")



  # Logistic Regression

  from sklearn.linear_model import LogisticRegression

  logreg = LogisticRegression()

  # Entire Data
  logreg.fit(X, Y)
  results = logreg.predict(X)
  evaluateCVS(results)

  # Train-test Data
  logreg.fit(X_train, Y_train)
  results = logreg.predict(X_test)
  evaluateTT(results, "LogReg")



  # K-Nearest Neighbor

  from sklearn.neighbors import KNeighborsClassifier

  for neighbor in [1,5]:
    knn = KNeighborsClassifier(n_neighbors=neighbor)

    # Entire Data
    knn.fit(X, Y)
    results = knn.predict(X)

    evaluateCVS(results)

    # Train-test Data
    knn.fit(X_train, Y_train)
    results = knn.predict(X_test)

    evaluateTT(results, "KNN")



  # Decision Tree 

  from sklearn.tree import DecisionTreeClassifier

  for technique in ["gini", "entropy"]:
    clf = DecisionTreeClassifier(criterion = technique)

    # Entire Data
    clf.fit(X, Y)
    results = clf.predict(X)

    evaluateCVS(results)

    # Train-test Data
    clf.fit(X_train, Y_train)
    results = clf.predict(X_test)

    evaluateTT(results, "DecTree")



  # Random Forest Classifier

  from sklearn.ensemble import RandomForestClassifier

  rfc = RandomForestClassifier()

  # Entire Data
  rfc.fit(X, Y)
  results = rfc.predict(X)

  evaluateCVS(results)

  # Train-test Data
  rfc.fit(X_train, Y_train)
  results = rfc.predict(X_test)

  evaluateTT(results, "RandFor")

In [12]:
def apply_neural_network(X,Y,dimensionality,NN_results):
  # Determine model types/neuron count per layer
  neuron_types = ["Type1", "Type2", "Type3"]

  # Initialize types
  neuron_modifier = {"Type1":[24,12,1],
                    "Type2":[24,4,1],
                    "Type3":[24,3,1]}

  # Empty dictionaries for models
  model_accuracy = dict.fromkeys(["Type1", "Type2", "Type3"])
  model_history = dict.fromkeys(["Type1", "Type2", "Type3"])

  # Initialize test and train
  from sklearn.model_selection import train_test_split

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

  # Creating the model


  def create_model(layer_size,input_shape):
    model = Sequential(name = "Sequential_Test_Model")
    model.add(Input(shape = (input_shape)))
    model.add(Dense(layer_size[1], activation = 'relu'))
    model.add(Dense(layer_size[2], activation = 'sigmoid'))
    return model

  for type_ in neuron_types:
    model = create_model(neuron_modifier[type_],X_train[0].shape)
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy']) 
    model.summary()

    # Fit model
    model_history[type_] = model.fit(X_train, Y_train, epochs=10)

    # Evaluate model
    accuracy = model.evaluate(X_test, Y_test)
    model_accuracy[type_] = accuracy[1]

    NN_results[f"d({dimensionality})"] = model_accuracy

In [14]:
if "__main__" == __name__:
    main()


Model: "Sequential_Test_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 12)                300       
                                                                 
 dense_19 (Dense)            (None, 1)                 13        
                                                                 
Total params: 313
Trainable params: 313
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "Sequential_Test_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_20 (Dense)            (None, 4)                 100       
                                                                 
 dense_21 (Dense)            (None, 1