# Assignment 1 - Data Analytics

Basilisco, Farah Jane & Bonachita, Clybel Djen

---

### Key Procedures (Balanced Risk Set Matching)

- The script loads patient data, calculates Mahalanobis distances, and applies optimal risk set matching.
- Uses the Hungarian algorithm (linear sum assignment) for optimal pair matching.
- Implements Wilcoxon signed-rank test for sensitivity analysis.
- Can be expanded with network flow optimization for larger datasets.

### Python Implementation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
from scipy.stats import wilcoxon

# Step 1: Load patient data
def load_data(file_path):
    """Read the patient table from a CSV source.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (a path or a
            file-like object).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    patient_frame = pd.read_csv(file_path)
    return patient_frame

# Step 2: Compute distance matrix
def compute_distance_matrix(data, symptom_cols):
    """Compute pairwise Mahalanobis distances between patients.

    Args:
        data: DataFrame holding one row per patient.
        symptom_cols: Column name, or list of column names, of the numeric
            symptom measurements used as matching covariates.

    Returns:
        A square ``numpy.ndarray`` of shape ``(len(data), len(data))`` where
        entry ``[i, j]`` is the Mahalanobis distance between the i-th and
        j-th rows of ``data`` (by position, not by index label).

    Raises:
        ValueError: If ``data`` has fewer than two rows, because a sample
            covariance cannot be estimated.
    """
    symptoms = np.asarray(data[symptom_cols], dtype=float)
    if symptoms.ndim == 1:
        # A single column name yields a 1-D vector; cdist needs 2-D input.
        symptoms = symptoms.reshape(-1, 1)

    if symptoms.shape[0] < 2:
        raise ValueError(
            "At least two patients are required to estimate the covariance matrix."
        )

    # np.cov collapses to a 0-d array for a single variable; force 2-D so the
    # inversion below works for any number of symptom columns.
    cov_matrix = np.atleast_2d(np.cov(symptoms.T))

    # The pseudo-inverse equals the ordinary inverse when the covariance is
    # well conditioned, and stays defined when symptoms are collinear or
    # constant (singular covariance), where np.linalg.inv would fail.
    inv_cov = np.linalg.pinv(cov_matrix)

    distance_matrix = cdist(symptoms, symptoms, metric='mahalanobis', VI=inv_cov)
    return distance_matrix

# Step 3: Perform risk set matching using Hungarian Algorithm
def match_patients(data, treatment_col, symptom_cols):
    """Pair each treated patient with a control patient at minimum total distance.

    The pairing minimises the summed Mahalanobis distance between matched
    patients using ``scipy.optimize.linear_sum_assignment``. When the two
    groups differ in size, every member of the smaller group is matched and
    the surplus members of the larger group are left unmatched.

    Args:
        data: DataFrame holding one row per patient.
        treatment_col: Name of the column that equals 1 for treated patients
            and 0 for controls. Rows with any other value are ignored.
        symptom_cols: Symptom columns used to compute the distances.

    Returns:
        A list of ``(treated_label, control_label)`` tuples, where each label
        is an index label of ``data`` (suitable for ``data.loc``). Empty when
        there are no treated or no control patients.
    """
    treatment = data[treatment_col].to_numpy()

    # Work with row *positions*: the distance matrix is ordered by position,
    # so using index labels here would select the wrong rows (or fail) for
    # any frame whose index is not the default 0..n-1 range.
    treated_pos = np.flatnonzero(treatment == 1)
    control_pos = np.flatnonzero(treatment == 0)

    if treated_pos.size == 0 or control_pos.size == 0:
        # Nothing to pair up.
        return []

    distance_matrix = compute_distance_matrix(data, symptom_cols)
    cost_matrix = distance_matrix[np.ix_(treated_pos, control_pos)]

    row_ind, col_ind = linear_sum_assignment(cost_matrix)

    # Translate positions back to index labels for downstream data.loc lookups.
    labels = data.index.to_numpy()
    matches = [
        (labels[treated_pos[i]], labels[control_pos[j]])
        for i, j in zip(row_ind, col_ind)
    ]

    return matches

# Step 4: Sensitivity Analysis
def sensitivity_analysis(matches, data, outcome_col):
    """Run a Wilcoxon signed-rank test on the outcomes of matched pairs.

    Args:
        matches: Iterable of ``(treated_label, control_label)`` pairs, each
            label being usable with ``data.loc``.
        data: DataFrame containing the outcome column.
        outcome_col: Name of the outcome column to compare.

    Returns:
        A ``(statistic, p_value)`` tuple as produced by
        ``scipy.stats.wilcoxon``.
    """
    treated_outcomes = []
    control_outcomes = []
    # Collect the paired outcomes in the same order as the matches.
    for treated_label, control_label in matches:
        treated_outcomes.append(data.loc[treated_label, outcome_col])
        control_outcomes.append(data.loc[control_label, outcome_col])

    statistic, p_val = wilcoxon(treated_outcomes, control_outcomes)
    return statistic, p_val

# Step 5: Visualization of Matched Pairs
def plot_matched_pairs(matches, data, symptom_x="pain", symptom_y="urgency"):
    """Draw matched treated/control pairs on a two-symptom scatter plot.

    Each pair is joined by a dotted gray line; treated patients are drawn in
    red and controls in blue. The figure is displayed with ``plt.show()``.

    Args:
        matches: Iterable of ``(treated_label, control_label)`` pairs, each
            label being usable with ``data.loc``.
        data: DataFrame containing the two symptom columns.
        symptom_x: Column plotted on the horizontal axis.
        symptom_y: Column plotted on the vertical axis.
    """
    plt.figure(figsize=(10, 6))
    sns.set(style="whitegrid")

    treated_points = []
    control_points = []

    for treated_label, control_label in matches:
        treated_point = (
            data.loc[treated_label, symptom_x],
            data.loc[treated_label, symptom_y],
        )
        control_point = (
            data.loc[control_label, symptom_x],
            data.loc[control_label, symptom_y],
        )
        treated_points.append(treated_point)
        control_points.append(control_point)

        # Connector between the two members of this pair.
        plt.plot(
            [treated_point[0], control_point[0]],
            [treated_point[1], control_point[1]],
            'gray',
            linestyle="dotted",
            alpha=0.6,
        )

    # Markers are drawn after the connectors, one scatter call per group.
    plt.scatter(
        [x for x, _ in treated_points],
        [y for _, y in treated_points],
        color='red',
        label="Treated Patients",
        alpha=0.7,
    )
    plt.scatter(
        [x for x, _ in control_points],
        [y for _, y in control_points],
        color='blue',
        label="Control Patients",
        alpha=0.7,
    )

    plt.xlabel(symptom_x.capitalize())
    plt.ylabel(symptom_y.capitalize())
    plt.title("Matched Pairs: Treated vs. Control")
    plt.legend()
    plt.show()

# Example usage
if __name__ == "__main__":
    # Covariates used for matching.
    symptom_cols = ["pain", "urgency", "frequency"]
    data = load_data("patient_data.csv")

    # Pair treated patients with controls, then test the paired outcomes.
    matches = match_patients(data, "treated", symptom_cols)
    stat, p_value = sensitivity_analysis(matches, data, "post_treatment_pain")

    print(f"Wilcoxon Test Statistic: {stat}, P-value: {p_value}")

    # Show the matched pairs in the pain/urgency plane.
    plot_matched_pairs(matches, data, "pain", "urgency")
