In [None]:
import pandas as pd

# Load the dataset; the literal string 'NULL' is read as a missing value.
df = pd.read_csv('negative.csv', na_values=['NULL'])

# Parse the acquisition timestamp column into datetime values.
df['AcquisitionDateTime_DT'] = pd.to_datetime(df['AcquisitionDateTime_DT'])

print(df.head())

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
# Number of missing values per column.
print(df.isnull().sum())

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


In [None]:
# Columns that are identifiers, free-text codes, metadata or the target itself.
# 'MI_12SL' is intentionally kept for now so it can be captured for the test rows.
non_feature_columns = [
    'PatientID', '12SL_Codes', 'Phys_Codes', 'TestID', 'Source',
    'Gender', 'PatientAge', 'AcquisitionDateTime_DT', 'MI_Phys',
]
features = df.drop(columns=non_feature_columns)
y = df['MI_Phys']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42
)

# Keep the 12SL column of the test rows before it is removed from the features.
y_12SL = X_test['MI_12SL']

#drop the 12SL column
X_train = X_train.drop(columns=['MI_12SL'])
X_test = X_test.drop(columns=['MI_12SL'])


In [None]:
# Count the number of samples per MI_Phys class.
mi_phys_counts = df['MI_Phys'].value_counts()
print(mi_phys_counts)

In [None]:
# Hyper-parameter sweep over learning rate x number of leaves.
# NOTE(review): configurations are compared on X_test, the same split that is
# used for the final score further down - consider a separate validation split.
learning_rates = [0.15, 0.11, 0.1, 0.09, 0.08, 0.07]
leaf_counts = [15, 31, 63, 127]

# Settings shared by every configuration (hoisted out of the loops).
base_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'n_estimators': 100,
    'random_state': 42,
    'verbose': -1,
}

scores = {}  # (learning_rate, num_leaves) -> micro-averaged F1 on X_test
for learning_rate in learning_rates:
    for num_leaves in leaf_counts:
        lgb_params = {**base_params, 'learning_rate': learning_rate, 'num_leaves': num_leaves}
        model = lgb.LGBMClassifier(**lgb_params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # NOTE: for a single-label binary target, micro-averaged F1 is the same
        # number as plain accuracy.
        score = f1_score(y_test, y_pred, average='micro')
        scores[(learning_rate, num_leaves)] = score
        print(f"Learning rate: {learning_rate}, Number of Leaves: {num_leaves} F1 score: {score:.4f}")

# Report the winner so the collected scores are actually used.
best_learning_rate, best_num_leaves = max(scores, key=scores.get)
print(
    f"Best configuration -> Learning rate: {best_learning_rate}, "
    f"Number of Leaves: {best_num_leaves} "
    f"F1 score: {scores[(best_learning_rate, best_num_leaves)]:.4f}"
)

In [None]:
# Final LightGBM model on the raw features with the positive class up-weighted.
# The weight is derived from the training labels instead of the hard-coded
# counts used before (63927/3968), so it follows the data if the file or the
# split changes.
# Assumes a two-class target whose positive label sorts last (e.g. 0/1) - TODO confirm.
class_counts = y_train.value_counts().sort_index()
n_negative, n_positive = class_counts.iloc[0], class_counts.iloc[-1]

lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'n_estimators': 200,
    'learning_rate': 0.15,
    'random_state': 42,
    'verbose': -1,
    'num_leaves': 127,
    'scale_pos_weight': float(n_negative / n_positive),
}
model = lgb.LGBMClassifier(**lgb_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# NOTE: for a single-label binary target, micro-averaged F1 is the same number as accuracy.
score = f1_score(y_test, y_pred, average='micro')
print("Final F1 score: ", score)





In [None]:
#confusion matrix 12SL (no model)
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


# Draw one labelled heatmap per prediction source so the two figures can be
# told apart: first the 12SL column of the test rows, then the trained model.
# confusion_matrix puts the true labels on the rows and predictions on the columns.
for cm_title, predictions in (
    ("Confusion matrix: 12SL (no model)", y_12SL),
    ("Confusion matrix: LightGBM model", y_pred),
):
    cm = confusion_matrix(y_test, predictions)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title(cm_title)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Bar chart of the per-feature importances reported by the fitted `model`.
importances = model.feature_importances_
if hasattr(X_train, 'columns'):
    feature_names = X_train.columns
else:
    # No column labels available: fall back to positional indices.
    feature_names = np.arange(len(importances))

# One bar (and one tick) per feature.
tick_positions = range(len(importances))

plt.figure(figsize=(10, 6))
plt.bar(tick_positions, importances)
plt.xticks(tick_positions, feature_names, rotation='vertical')
plt.title("Feature Importances")
plt.xlabel("Features")
plt.ylabel("Importance Score")
plt.tight_layout()
plt.show()


In [None]:
# `model` is the fitted LGBMClassifier; its `booster_` attribute gives the
# underlying Booster used below.
booster = model.booster_
feature_names = booster.feature_name()
# Split-count importances are stored here; note the plot below uses 'gain'.
importances = booster.feature_importance(importance_type='split')

# Plot at most 20 features, ranked by gain.
lgb.plot_importance(
    booster,
    importance_type='gain',
    max_num_features=20,
)
plt.show()


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
# Numeric feature matrix for PCA: identifiers, codes, metadata, the 12SL column
# and the target are removed, then every row containing a NaN is dropped.
df_cols = df.drop(
    columns=['PatientID', '12SL_Codes', 'Phys_Codes', 'TestID', 'Source', 'MI_12SL',
             'Gender', 'PatientAge', 'AcquisitionDateTime_DT', 'MI_Phys']
).dropna()

# NOTE(review): the scaler and the PCA are fitted on every remaining row, i.e.
# before the train/test split that follows, so test rows influence the
# transform - consider fitting on the training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_cols)

pca = PCA(n_components=0.95, random_state=42)  # keep 95% of variance
principal_components = pca.fit_transform(X_scaled)

print("Number of components selected:", pca.n_components_)
print("Total variance explained:", pca.explained_variance_ratio_.sum())
print("Variance explained by each component:", pca.explained_variance_ratio_)
print("Principal components shape:", principal_components.shape)

# Column names from the original data.
feature_names = df_cols.columns

# 1-based component numbers, reused for the column names and the plot.
component_ids = range(1, pca.n_components_ + 1)

# DataFrame of the principal components. The row index of df_cols is carried
# over so each projected row keeps the label of the row it came from in df;
# a fresh 0..n-1 index would no longer line up with labels taken from df.
df_pca = pd.DataFrame(
    data=principal_components,
    columns=[f"PC{i}" for i in component_ids],
    index=df_cols.index,
)

# Plot the explained variance ratio.
plt.figure(figsize=(10, 6))
plt.bar(component_ids, pca.explained_variance_ratio_)
plt.title("Explained Variance Ratio of Principal Components")
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance Ratio")
plt.xticks(component_ids)
plt.show()

In [None]:
# Same column selection as the PCA input but with the target kept, so the rows
# that survive dropna() provide the labels for the PCA features.
df_cols_Y = df.drop(
    columns=['PatientID', '12SL_Codes', 'Phys_Codes', 'TestID', 'Source', 'MI_12SL',
             'Gender', 'PatientAge', 'AcquisitionDateTime_DT']
).dropna()
y = df_cols_Y['MI_Phys']

# 80/20 split of the principal components with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_pca, y, test_size=0.2, random_state=42
)

# Count the number of samples per MI_Phys class after dropping NaN rows.
print(y.value_counts())



In [None]:
# Hyper-parameter sweep over learning rate x number of leaves on the PCA features.
# NOTE(review): configurations are compared on X_test, the same split that is
# used for the final score further down - consider a separate validation split.
learning_rates = [0.25, 0.2, 0.15, 0.1, 0.09]
leaf_counts = [15, 31, 63, 127]

# Settings shared by every configuration (hoisted out of the loops).
base_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'n_estimators': 100,
    'random_state': 42,
    'verbose': -1,
}

scores = {}  # (learning_rate, num_leaves) -> micro-averaged F1 on X_test
for learning_rate in learning_rates:
    for num_leaves in leaf_counts:
        lgb_params = {**base_params, 'learning_rate': learning_rate, 'num_leaves': num_leaves}
        model = lgb.LGBMClassifier(**lgb_params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # NOTE: for a single-label binary target, micro-averaged F1 is the same
        # number as plain accuracy.
        score = f1_score(y_test, y_pred, average='micro')
        scores[(learning_rate, num_leaves)] = score
        print(f"Learning rate: {learning_rate}, Number of Leaves: {num_leaves} F1 score: {score:.4f}")

# Report the winner so the collected scores are actually used.
best_learning_rate, best_num_leaves = max(scores, key=scores.get)
print(
    f"Best configuration -> Learning rate: {best_learning_rate}, "
    f"Number of Leaves: {best_num_leaves} "
    f"F1 score: {scores[(best_learning_rate, best_num_leaves)]:.4f}"
)

In [None]:
# Final LightGBM model on the PCA features with the positive class up-weighted.
# The weight is derived from the training labels instead of the hard-coded
# counts used before (50935/3239), so it follows the data if the file or the
# split changes.
# Assumes a two-class target whose positive label sorts last (e.g. 0/1) - TODO confirm.
class_counts = y_train.value_counts().sort_index()
n_negative, n_positive = class_counts.iloc[0], class_counts.iloc[-1]

lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'n_estimators': 200,
    'learning_rate': 0.15,
    'random_state': 42,
    'verbose': -1,
    'num_leaves': 127,
    'scale_pos_weight': float(n_negative / n_positive),
}
model = lgb.LGBMClassifier(**lgb_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# NOTE: for a single-label binary target, micro-averaged F1 is the same number as accuracy.
score = f1_score(y_test, y_pred, average='micro')
print("Final F1 score: ", score)


In [None]:
#confusion matrix 12SL (no model)
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


# The 12SL comparison is not drawn here: y_12SL was captured from the earlier
# split of the full frame, so it presumably does not line up row-for-row with
# this y_test (built after dropping NaN rows) - verify before re-enabling.

# Confusion matrix for the model trained on the PCA features.
# confusion_matrix puts the true labels on the rows and predictions on the columns.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_title("Confusion matrix: LightGBM model (PCA features)")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE (fixed seed).
# Do not train on the result - it is produced only to inspect the class
# distribution before and after resampling.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

for heading, labels in (
    ("Before SMOTE class distribution:\n", y_train),
    ("After SMOTE class distribution:\n", y_train_bal),
):
    print(heading, labels.value_counts())

In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, classification_report, fbeta_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd

# Base learners: two tree ensembles plus one linear model for diversity.
rf_base = RandomForestClassifier(n_estimators=100, random_state=42)
lgbm_base = LGBMClassifier(n_estimators=200, learning_rate=0.15, num_leaves=127, random_state=42)
logreg_base = LogisticRegression(max_iter=1000, random_state=42)
base_models = [
    ('rf', rf_base),
    ('lgbm', lgbm_base),
    ('logreg', logreg_base),
]

# Meta model that combines the base learners' outputs.
meta_model = LGBMClassifier(n_estimators=50, learning_rate=0.05, random_state=42)

# Stacking ensemble with 5-fold CV; passthrough is disabled.
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    passthrough=False,
    n_jobs=-1,
)

# Fit on the training split.
stacked_model.fit(X_train, y_train)

# Re-wrap the test features with the training column names so the feature
# names stay consistent at prediction time.
X_test_named = pd.DataFrame(X_test, columns=X_train.columns)
y_pred_stack = stacked_model.predict(X_test_named)

# Score the ensemble with both micro- and macro-averaged F1.
f1_stack_micro = f1_score(y_test, y_pred_stack, average='micro')
f1_stack_macro = f1_score(y_test, y_pred_stack, average='macro')

print("Stacked Ensemble (RF + LGBM + Logistic) F1 Score Micro:", f1_stack_micro)
print("Stacked Ensemble (RF + LGBM + Logistic) F1 Score Macro:", f1_stack_macro)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for the single LightGBM model and the stacked
# ensemble, both scored against the same y_test.
for banner, predictions in (
    ("=== LightGBM ===", y_pred),
    ("=== Stacking (RF + LGBM + Logistic) ===", y_pred_stack),
):
    print(banner)
    print(classification_report(y_test, predictions))
