<a href="https://colab.research.google.com/github/clv07/stroke-of-luck/blob/Data-import/Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# LOAD IN DATA
import pandas as pd

df1 = pd.read_csv('positive.csv', na_values=['NULL'])

df2 = pd.read_csv('negative.csv', na_values=['NULL'])

df = pd.concat([df1, df2], ignore_index=True)

df['AcquisitionDateTime_DT'] = pd.to_datetime(df['AcquisitionDateTime_DT'])

#print(df.head())
#print(df.info())
#print(df.isnull().sum())

In [2]:
# GET MODEL TO WORK IN COLAB W/ GPU (GO TO EDIT -> NOTEBOOK SETTINGS -> GPU)
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd

In [3]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [4]:
from sklearn.model_selection import train_test_split
import pandas as pd

y = df["MI_Phys"]

X = df.drop(columns=["PatientID", "12SL_Codes", "Phys_Codes", "TestID", "Source",
                     "Gender", "PatientAge", "AcquisitionDateTime_DT", "MI_Phys", "POffset", "PAxis", "POnset"])
X = X.loc[:, ~X.columns.str.contains('P_')]
X = X.loc[:, ~X.columns.str.contains('Full')]
X = X.loc[:, ~X.columns.str.contains('Rate')]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Save the original algorithm's prediction for test set before dropping it
y_12SL = X_test["MI_12SL"]

# Split data based on MI_12SL classification
X_train_pos = X_train[X_train["MI_12SL"] == 1].drop(columns=["MI_12SL"])
X_train_neg = X_train[X_train["MI_12SL"] == 0].drop(columns=["MI_12SL"])
X_test_pos = X_test[X_test["MI_12SL"] == 1].drop(columns=["MI_12SL"])
X_test_neg = X_test[X_test["MI_12SL"] == 0].drop(columns=["MI_12SL"])

# Ensure y labels match the correct samples
y_train_pos = y_train.loc[X_train_pos.index]  # True positives or false positives
y_train_neg = y_train.loc[X_train_neg.index]  # True negatives or false negatives
y_test_pos = y_test.loc[X_test_pos.index]
y_test_neg = y_test.loc[X_test_neg.index]

X_train = X_train.drop(columns=["MI_12SL"])
X_test = X_test.drop(columns=["MI_12SL"])

# Extract MI_12SL predictions
y_12SL_pos = y_12SL.loc[X_test_pos.index]  # Original classifier's labels
y_12SL_neg = y_12SL.loc[X_test_neg.index]

In [5]:
# Positive Model
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'dart',
    'colsample_bytree': 0.9889,
    'n_estimators': 1000,
    'learning_rate': 0.0807,
    'random_state': 42,
    'verbose': -1,
    'num_leaves': 106,
    'min_child_weight': 1,
    'max_depth' : 7,
    'reg_alpha' : 0.7909,
    'reg_lambda' : 0.61348,
    'subsample' : 0.903,
    # 'metric': 'auc',
    'scale_pos_weight': 3.537,
    'device': 'gpu',
    }
model_pos = (lgb.LGBMClassifier(**lgb_params))
model_pos.fit(X_train_pos, y_train_pos)
y_pred_pos = model_pos.predict(X_test_pos)
score = f1_score(y_test_pos, y_pred_pos, average='micro')
print("Final F1 score: ", score)

Final F1 score:  0.7859021567596002


In [7]:
# Add predictions as a new column to the test set
X_test_pos_with_preds = X_test_pos.copy()
X_test_pos_with_preds["MI_Predicted"] = y_pred_pos

# Get original data rows from `df` that correspond to the same indices as X_test_pos
df_test_pos_original = df.loc[X_test_pos.index]

# Join predictions back to the original dataset
df_with_predictions = df_test_pos_original.copy()
df_with_predictions["MI_Predicted"] = y_pred_pos

# Separate into positive and negative predictions
df_mi_detected = df_with_predictions[df_with_predictions["MI_Predicted"] == 1]
df_no_mi_detected = df_with_predictions[df_with_predictions["MI_Predicted"] == 0]

In [6]:
# Negative Model
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'n_estimators': 200,
    'learning_rate': 0.1,
    'random_state': 42,
    'verbose': -1,
    'num_leaves': 127,
    'scale_pos_weight': 10,
    'device': 'gpu',
    }
model_neg = (lgb.LGBMClassifier(**lgb_params))
model_neg.fit(X_train_neg, y_train_neg)
y_pred_neg = model_neg.predict(X_test_neg)
score = f1_score(y_test_neg, y_pred_neg, average='micro')
print("Final F1 score: ", score)

Final F1 score:  0.9538450195384502


In [9]:
# Add predictions to the test set
X_test_neg_with_preds = X_test_neg.copy()
X_test_neg_with_preds["MI_Predicted"] = y_pred_neg

# Get corresponding original rows from df
df_test_neg_original = df.loc[X_test_neg.index]

# Join predictions back to the original dataset
df_with_predictions_neg = df_test_neg_original.copy()
df_with_predictions_neg["MI_Predicted"] = y_pred_neg

# Separate based on prediction outcome
df_mi_detected_neg = df_with_predictions_neg[df_with_predictions_neg["MI_Predicted"] == 1]
df_no_mi_detected_neg = df_with_predictions_neg[df_with_predictions_neg["MI_Predicted"] == 0]

In [11]:
#Combine the two datasets
df_combined_no_mi = pd.concat([df_no_mi_detected, df_no_mi_detected_neg], ignore_index=True)

#Drop any prediction columns or non-feature columns
df_combined_no_mi = df_combined_no_mi.drop(columns=["MI_Predicted"], errors="ignore")

#Prepare features for prediction
X_combined = df_combined_no_mi.drop(columns=["PatientID", "12SL_Codes", "Phys_Codes", "TestID", "Source",
                                             "Gender", "PatientAge", "AcquisitionDateTime_DT", "MI_Phys",
                                             "POffset", "PAxis", "POnset"], errors='ignore')

X_combined = X_combined.loc[:, ~X_combined.columns.str.contains('P_')]
X_combined = X_combined.loc[:, ~X_combined.columns.str.contains('Full')]
X_combined = X_combined.loc[:, ~X_combined.columns.str.contains('Rate')]
X_combined = X_combined.drop(columns=["MI_12SL"], errors='ignore')  # also drop MI_12SL if it's still there

# Confirm shape matches the model's training input
assert X_combined.shape[1] == model_neg.n_features_in_, f"Expected {model_neg.n_features_in_} features, got {X_combined.shape[1]}"

# Step 4: Predict again
y_combined_pred = model_neg.predict(X_combined)

# Step 5: Reattach predictions
df_combined_no_mi["MI_Predicted_Again"] = y_combined_pred

# Separate based on predictions
df_still_no_mi = df_combined_no_mi[df_combined_no_mi["MI_Predicted_Again"] == 0]
df_new_mi_detected = df_combined_no_mi[df_combined_no_mi["MI_Predicted_Again"] == 1]


In [15]:
# Drop duplicate prediction columns if they exist
df_mi_detected = df_mi_detected.drop(columns=["MI_Predicted"], errors="ignore")
df_mi_detected_neg = df_mi_detected_neg.drop(columns=["MI_Predicted"], errors="ignore")
df_new_mi_detected = df_new_mi_detected.drop(columns=["MI_Predicted_Again"], errors="ignore")

# Combine all MI-detected datasets
df_all_mi_detected = pd.concat(
    [df_mi_detected, df_mi_detected_neg, df_new_mi_detected],
    ignore_index=True
)

# drop duplicates by PatientID
df_all_mi_detected = df_all_mi_detected.drop_duplicates(subset=["PatientID"])

In [17]:
#Tag predictions based on ground truth
df_all_mi_detected["Prediction"] = "Positive"
df_still_no_mi["Prediction"] = "Negative"

#Combine both datasets for evaluation
df_eval = pd.concat([df_all_mi_detected, df_still_no_mi], ignore_index=True)

#Classify each row based on actual vs. predicted
def classify_row(row):
    if row["MI_Phys"] == 1 and row["Prediction"] == "Positive":
        return "True Positive"
    elif row["MI_Phys"] == 0 and row["Prediction"] == "Positive":
        return "False Positive"
    elif row["MI_Phys"] == 0 and row["Prediction"] == "Negative":
        return "True Negative"
    elif row["MI_Phys"] == 1 and row["Prediction"] == "Negative":
        return "False Negative"
    else:
        return "Unknown"

df_eval["PredictionType"] = df_eval.apply(classify_row, axis=1)

#Count results
summary = df_eval["PredictionType"].value_counts()
print("Performance Summary:")
print(summary)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_still_no_mi["Prediction"] = "Negative"


Performance Summary:
PredictionType
True Negative     13399
True Positive      1007
False Negative      587
False Positive      471
Name: count, dtype: int64


In [18]:
# Map prediction types to binary labels for evaluation
df_eval["MI_Predicted_Final"] = df_eval["PredictionType"].map({
    "True Positive": 1,
    "False Positive": 1,
    "True Negative": 0,
    "False Negative": 0
})

y_true = df_eval["MI_Phys"]
y_pred = df_eval["MI_Predicted_Final"]

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))
print("F1 Score:", f1_score(y_true, y_pred))

Confusion Matrix:
 [[13399   471]
 [  587  1007]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96     13870
           1       0.68      0.63      0.66      1594

    accuracy                           0.93     15464
   macro avg       0.82      0.80      0.81     15464
weighted avg       0.93      0.93      0.93     15464

Accuracy: 0.931583031557165
F1 Score: 0.6555989583333334
