### Data Exploration

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns  # provides the heatmap and grouped box plots used in the cells below
from sqlalchemy import create_engine # Engine required to read the SQL from PGAdmin4

from config import CONNSTRING

In [None]:
# Select every column of every row from the PatientData table
query = "SELECT * FROM PatientData"

# Build the SQLAlchemy engine from the connection string imported from config
engine = create_engine(CONNSTRING)

# Run the query through pandas and load the result set into a DataFrame
liver_data_df = pd.read_sql(sql=query, con=engine)

# Preview the first rows of the loaded data
liver_data_df.head()

## Exploratory Data Analysis

In [None]:
# Print a summary of the DataFrame: column names, non-null counts, dtypes and memory usage
liver_data_df.info()

In [None]:
# Draw one 20-bin histogram per column on a single 12x12-inch figure
liver_data_df.hist(bins=20, figsize=(12, 12))

# Adjust subplot spacing before rendering the figure
plt.tight_layout()
plt.show()

In [None]:
# Draw one box plot per feature on a 4x4 grid of subplots.
# Axes are deliberately not shared so each feature keeps its own scale.
liver_data_df.plot(
    kind='box',
    subplots=True,
    layout=(4, 4),
    figsize=(12, 12),
    sharex=False,
    sharey=False,
)

# Adjust subplot spacing before rendering the figure
plt.tight_layout()
plt.show()

In [None]:
# Calculate the pairwise correlation matrix of the numeric columns.
# numeric_only=True is passed explicitly: since pandas 2.0 the default is
# numeric_only=False, so corr() raises on any non-numeric column that the
# SELECT * query may have returned. (The keyword requires pandas >= 1.5.)
correlation_matrix = liver_data_df.corr(numeric_only=True)

# Display the correlation matrix as text
print(correlation_matrix)

# Plot the correlation matrix as an annotated heatmap; the diverging
# 'coolwarm' palette separates negative from positive coefficients
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

## Generally, the interpretation of correlation coefficients is as follows:

### 0.0 to 0.3 (or 0.0 to -0.3): Weak correlation
### 0.3 to 0.7 (or -0.3 to -0.7): Moderate correlation
### 0.7 to 1.0 (or -0.7 to -1.0): Strong correlation

In [None]:
# Draw one box plot per feature, grouped by the 'diagnosis' target column
for feature_name in liver_data_df.columns:
    # Skip the target itself; it is only used as the grouping variable
    if feature_name == 'diagnosis':
        continue

    # Each feature gets its own 8x4-inch figure
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=liver_data_df, x='diagnosis', y=feature_name)
    plt.title(f'Boxplot of {feature_name} by diagnosis')
    plt.show()

# Now let's find a suitable Machine Learning model to try to predict Liver Disease based on the available variables
##### (Run each model separately in its respective Jupyter notebook)