In [0]:
# ==============================================================================
# SETUP - IMPORTING LIBRARIES
# ==============================================================================
# This section imports all the necessary libraries for data manipulation,
# machine learning, and data visualization.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import plot_tree
import seaborn as sns
import matplotlib.pyplot as plt

# This command ensures that plots are displayed directly in the notebook.
%matplotlib inline

# ==============================================================================
# DATA LOADING AND PREPARATION
# ==============================================================================
# Here, we load the patient symptom data from the intermediate table created
# in the previous SQL step. The data is then converted into a Pandas DataFrame.

# Query text that reads every column of the intermediate patient/symptom table.
# The literal keeps the leading and trailing newline of a triple-quoted string.
patients_query = (
    "\n"
    "SELECT * FROM dev_db.intermediate.int_patients_symptoms\n"
)

# Run the query through the `spark` session object.
# NOTE(review): `spark` is not defined in this script; presumably the notebook
# runtime injects it - confirm before running elsewhere.
df_spark = spark.sql(patients_query)

# Collect the Spark result into a pandas DataFrame, the form the scikit-learn
# steps below operate on.
df_pandas = df_spark.toPandas()


# ==============================================================================
# FEATURE ENGINEERING
# ==============================================================================
# The model needs numerical input. This section converts the 'Yes'/'No' values
# in the symptom columns into 1s and 0s.

# List of columns that contain symptom information.
symptom_columns = ['fever', 'cough', 'runny_nose', 'headache', 'body_pain', 'fatigue', 'nausea', 'diarrhea']

# Encoding applied to every symptom column. The identity entries (1 -> 1,
# 0 -> 0) make this cell safe to re-run: without them, a second execution
# would look up the already-encoded integers in a 'Yes'/'No'-only table and
# silently turn the whole column into NaN.
yes_no_encoding = {'Yes': 1, 'No': 0, 1: 1, 0: 0}

# Encode each symptom column in place on df_pandas.
for col in symptom_columns:
    raw_values = df_pandas[col]
    encoded_values = raw_values.map(yes_no_encoding)

    # A value that was present in the input but came back NaN was not found in
    # the encoding table (e.g. 'yes', 'Y', ' No'). Fail loudly instead of
    # feeding silently-blanked data to the model. Values that were already
    # missing in the input stay NaN, as before.
    unrecognized = raw_values[raw_values.notna() & encoded_values.isna()]
    if not unrecognized.empty:
        raise ValueError(
            f"Column '{col}' contains values other than 'Yes'/'No': "
            f"{sorted(unrecognized.astype(str).unique())}"
        )

    df_pandas[col] = encoded_values

# Separate the features (symptoms, X) from the target variable (diagnose, y).
X = df_pandas[symptom_columns] # Features (what we use to predict)
y = df_pandas['diagnose']      # Target (what we want to predict)

# Display the first few rows of the features to verify the transformation.
print("Transformed Feature Data (X):")
display(X.head())


# ==============================================================================
# MODEL TRAINING AND PREDICTION
# ==============================================================================
# The data is split into training and testing sets.
# A Random Forest Classifier model is then trained on the training data.

# Split the dataset: 80% for training and 20% for testing.
# The split is stratified on y so that each diagnosis appears in the training
# and test sets in roughly the same proportion as in the full data; a plain
# random split can leave rare diagnoses out of the test set entirely, which
# makes the per-class metrics below unreliable.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
except ValueError as split_error:
    # train_test_split raises ValueError when it cannot stratify (for example
    # a diagnosis with a single row). Fall back to the plain random split and
    # say so, rather than aborting the notebook.
    print(f"Stratified split not possible ({split_error}); using a plain random split instead.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

# Initialize the Random Forest model.
# n_estimators=100 means the model will build 100 decision trees;
# random_state=42 is passed as a fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model using the training data.
model.fit(X_train, y_train)

# Make predictions on the held-out test data; y_pred is compared against
# y_test in the evaluation section.
y_pred = model.predict(X_test)


# ==============================================================================
# MODEL EVALUATION
# ==============================================================================
# We evaluate the model's performance using accuracy and a classification report.
# A confusion matrix is also generated to visualize performance.

# Overall share of test rows whose predicted diagnosis matches the actual one.
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}\n')

# Per-diagnosis precision, recall and f1-score.
# zero_division=0 is passed so that undefined ratios are reported as 0.
print("Classification Report:")
report_text = classification_report(y_test, y_pred, zero_division=0)
print(report_text)

# Confusion matrix with rows and columns ordered by the classes the model
# learned, so the tick labels line up with the matrix cells.
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Draw the matrix as an annotated heatmap on an explicit axes object.
cm_fig, cm_ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Diagnosis')
cm_ax.set_ylabel('Actual Diagnosis')
plt.show()


# ==============================================================================
# FEATURE IMPORTANCE ANALYSIS
# ==============================================================================
# This visualization shows which symptoms (features) were most important
# in making the predictions.

# Pair each importance score from the trained forest with its symptom name,
# then order the symptoms from most to least important.
feature_importances = pd.Series(
    data=model.feature_importances_,
    index=X.columns,
)
feature_importances = feature_importances.sort_values(ascending=False)

# Horizontal bar chart: one bar per symptom, bar length = importance score.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=feature_importances,
    y=feature_importances.index,
    ax=importance_ax,
)
importance_ax.set_title("Symptom Importance in Diagnosis")
importance_ax.set_xlabel("Importance Score")
importance_ax.set_ylabel("Symptom")
plt.show()


# ==============================================================================
# DECISION TREE VISUALIZATION
# ==============================================================================
# To understand how the model makes decisions, we can visualize one of the
# individual decision trees from the Random Forest.

# Large canvas: the tree is drawn at full depth, so it needs the room.
plt.figure(figsize=(25, 20))

# plot_tree documents class_names as strings. model.classes_ holds whatever
# type the 'diagnose' column had, so convert each label explicitly; this is a
# no-op for string labels and keeps numeric labels from being passed through
# as non-strings.
tree_class_names = [str(label) for label in model.classes_]

# Plot the first tree (estimator) in the forest. It is only one of the
# forest's trees, so it illustrates the kind of rules learned rather than the
# forest's combined decision.
plot_tree(model.estimators_[0],
          feature_names=X.columns.tolist(),
          class_names=tree_class_names,
          filled=True,
          rounded=True,
          fontsize=10)

plt.title("Example Decision Tree from the Random Forest Model")
plt.show()
