In [None]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine # Engine required to read the SQL from PGAdmin4

# Database URL for SQLAlchemy. Set HEALTHDATA_DB_URL in the environment to keep real
# credentials out of the notebook; the fallback is the original local development URL.
CONNSTRING = os.environ.get(
    'HEALTHDATA_DB_URL',
    'postgresql+psycopg://postgres:postgres@localhost/HealthData',
)

In [None]:
# Connect to the database described by CONNSTRING
engine = create_engine(CONNSTRING)

# Pull every row and column of the PatientData table into a DataFrame
query = "SELECT * FROM PatientData"
liver_data_df = pd.read_sql(sql=query, con=engine)

# Preview the first rows as the cell output
liver_data_df.head()

## Exploratory Data Analysis

In [None]:
# Print the column dtypes and non-null counts to spot missing values and non-numeric columns
liver_data_df.info()

In [None]:
# Draw one 20-bin histogram per column to inspect each distribution
liver_data_df.hist(bins=20, figsize=(12, 12))
plt.tight_layout()  # keep subplot titles and tick labels from overlapping
plt.show()

In [None]:
# Draw one box plot per column on a 4x4 grid; axes are not shared so each
# column keeps its own scale
box_plot_options = dict(
    kind='box',
    subplots=True,
    layout=(4, 4),
    figsize=(12, 12),
    sharex=False,
    sharey=False,
)
liver_data_df.plot(**box_plot_options)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between all columns, printed as a table
correlation_matrix = liver_data_df.corr()
print(correlation_matrix)

# The same matrix rendered as an annotated heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

## Generally, the interpretation of correlation coefficients is as follows:

### 0.0 to 0.3 (or 0.0 to -0.3): Weak correlation
### 0.3 to 0.7 (or -0.3 to -0.7): Moderate correlation
### 0.7 to 1.0 (or -0.7 to -1.0): Strong correlation

In [None]:
# For every feature, compare its distribution across the 'diagnosis' classes
for column in liver_data_df.columns:
    # The target itself is the grouping variable, so it gets no plot of its own
    if column == 'diagnosis':
        continue
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=liver_data_df, x='diagnosis', y=column)
    plt.title(f'Boxplot of {column} by diagnosis')
    plt.show()

# Now let's find a suitable Machine Learning model to try to predict Liver Disease based on the available variables

In [None]:
# Import our dependencies related to ML

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import sklearn as skl
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Target is the 'diagnosis' column; features are every other column.
# drop() returns a new frame, so liver_data_df itself is left untouched.
y = liver_data_df['diagnosis']
X = liver_data_df.drop(columns='diagnosis')

In [None]:
# Hold out part of the data for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Compile, train and evaluate the model (with hyperparameter optimization using Keras Tuner)

In [None]:
# Create a method that creates a Sequential model with hyperparameter options using Keras Tuner
def create_model(hp):
    """Build and compile a binary classifier whose architecture is chosen by Keras Tuner.

    The input width is taken from the module-level ``X_train_scaled`` array, so the
    scaling cell must have been run before the tuner calls this function.

    Args:
        hp: Keras Tuner hyperparameters object, used to sample the hidden-layer
            activation, the number of hidden layers and the units in each layer.

    Returns:
        A compiled ``tf.keras`` Sequential model with a single sigmoid output unit,
        binary cross-entropy loss, the Adam optimizer and an accuracy metric.
    """
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # Declare the input shape with an explicit Input object rather than the deprecated
    # `input_dim` kwarg on the first Dense layer
    input_dim = X_train_scaled.shape[1]  # number of features
    nn_model.add(tf.keras.Input(shape=(input_dim,)))

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(
        units=hp.Int('first_units', min_value=1, max_value=10, step=2),
        activation=activation))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(
            units=hp.Int('units_' + str(i), min_value=1, max_value=10, step=2),
            activation=activation))

    # Single sigmoid unit: the output is read as a probability for the binary target
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [None]:
# Import the kerastuner library
import keras_tuner as kt

In [None]:
# Hyperband search over the space defined in create_model, ranking trials by
# validation accuracy. overwrite=True discards results from any earlier search.
tuner = kt.Hyperband(
    hypermodel=create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    overwrite=True,
)

In [None]:
# Run the kerastuner search for best hyperparameters.
# Trials are scored on a validation slice taken from the training data (the last 20%
# of the already-shuffled training split), NOT on the test split: selecting
# hyperparameters on the test data would make the final test accuracy optimistic.
tuner.search(X_train_scaled, y_train.to_numpy(), epochs=20, validation_split=0.2)

In [None]:
# Take the top-ranked hyperparameter set from the search and show its values
best_hyper = tuner.get_best_hyperparameters(num_trials=1)[0]
best_hyper.values

In [None]:
# Retrieve the top-ranked model from the search and score it on the scaled test split
best_model = tuner.get_best_models(num_models=1)[0]
model_loss, model_accuracy = best_model.evaluate(
    X_test_scaled,
    y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

## With kerastuner, the best accuracy obtained for the model was around 83%.
## More models will be attempted to improve accuracy:

##### 1. Random Forest, which is relatively simple to understand and interpret, and in this case adapts well since it's a small dataset.
##### 2. CatBoost, which works well with categorical features; 5 of the 10 features in this dataset are categorical.
##### 3. XGBoost, which is known for its high performance and uses a large number of hyperparameters.
##### 4. LightGBM, which has good performance in terms of accuracy and predictive power, also handling categorical features well.

### Compile, train and evaluate the model (with LightGBM)

In [None]:
# Create LightGBM datasets.
# The second dataset is the one monitored for early stopping. It must not be the test
# split (that would tune the number of boosting rounds on the data later used to report
# accuracy), so a validation fold is carved out of the training data instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)
train_data = lgb.Dataset(X_fit, label=y_fit)
# NOTE: the name `test_data` is kept because the training cell passes it as `valid_sets`;
# it now holds the validation fold, while X_test_scaled / y_test stay untouched until
# the final evaluation.
test_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [None]:
# LightGBM configuration: binary classification with gradient-boosted trees,
# tracked by classification error rate
params = dict(
    objective='binary',
    metric='binary_error',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

In [None]:
# Boost for at most 1000 rounds, stopping early once the metric on the monitored
# dataset has not improved for 50 consecutive rounds
early_stop = lgb.early_stopping(stopping_rounds=50)
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1000,
    valid_sets=[test_data],
    callbacks=[early_stop],
)


In [None]:
# Score the test split using the boosting round selected by early stopping
y_pred = model.predict(X_test_scaled, num_iteration=model.best_iteration)

# Threshold the predicted scores: strictly above 0.5 -> class 1, otherwise class 0
y_pred_binary = [int(pred > 0.5) for pred in y_pred]

In [None]:
# Compare the thresholded predictions with the true test labels
accuracy = accuracy_score(y_test, y_pred_binary)
conf_matrix = confusion_matrix(y_test, y_pred_binary)
class_report = classification_report(y_test, y_pred_binary)

# Report each metric under its own heading
metric_report = (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
)
for heading, result in metric_report:
    print(heading, result)

# Accuracy was improved from 83% to 91%