In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.metrics import confusion_matrix

import pandas as pd

# Load dataset
dataset = pd.read_csv("NHANES_data_train.csv")
MI_positive = dataset[dataset['MI'] == 1]
MI_negitive = dataset[dataset['MI'] == 2]

# Limit number of MI positive samples
MI_negitive = MI_negitive.sample(frac=.100)
dataset = pd.concat([MI_positive, MI_negitive])

# Impute columns with nearest co-value
dataset_for_imp = dataset.drop(columns=['ParticipantID']) # ignore the ID column
imputed_data = pd.DataFrame(KNNImputer(n_neighbors=3).fit_transform(dataset_for_imp), columns=dataset_for_imp.columns)

# Add some advanced metrics
imputed_data['bloodPressure'] = imputed_data['Systolic']/imputed_data['Diastolic']
imputed_data['kidneyAge'] = (1/imputed_data['kidneys_eGFR']) * imputed_data['Age']
imputed_data['ageAge'] = imputed_data['Age']*imputed_data['Age']

# Scale the data in the dataset, using values most relevent for prediction of MI
X = imputed_data[['Income', 'Age', 'Diastolic', 'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL', 'kidneys_eGFR', 'Diabetes', 'bloodPressure', 'kidneyAge', 'ageAge']]
Y = imputed_data['MI']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,stratify=Y, random_state=10)

# Train the model
clf = Pipeline(
    steps=[("scaler", StandardScaler()), 
           ("knn", KNeighborsClassifier(n_neighbors=11))]
)
clf.fit(X_train, Y_train)

# Make prediction
probabilities = clf.predict_proba(X_test)[:, 1] # for output
y_pred = clf.predict(X_test) # for confusion matrix

# cof mat
conf_mat = confusion_matrix(Y_test, y_pred)
print(conf_mat)

# Get each sample's ID and probability, and write them to the output csv
dataset.reset_index(drop=True, inplace=True) # reset index
participant_ids = dataset.loc[X_test.index, 'ParticipantID']
output_df = pd.DataFrame({'ParticipantID': participant_ids, 'Probability_MI_Correct': probabilities})
output_df.to_csv('kNN_pred.csv', index=False)

# Merge the dataset with prediction set for accuracy evaluation, and send it to the csv file
orig_testing_data = pd.read_csv("NHANES_data_train.csv")
kNN_prediction = pd.read_csv("kNN_pred.csv")
merged_data = pd.merge(orig_testing_data, kNN_prediction, on='ParticipantID', how='inner')
kNN_true_MI_Label = merged_data[['ParticipantID', 'MI']]
kNN_true_MI_Label.to_csv('kNN_true_MI_Label.csv', index=False)


[[10 26]
 [ 9 90]]


In [2]:
from sklearn.linear_model import LogisticRegression

# Train the model using logistic regression
log_reg_model = Pipeline(
    steps=[("scaler", StandardScaler()), 
           ("log_reg", LogisticRegression())]
)
log_reg_model.fit(X_train, Y_train)

# Make predictions using logistic regression
reg_probabilities = log_reg_model.predict_proba(X_test)[:, 1]
reg_y_pred = clf.predict(X_test) # for confusion matrix

# cof mat
conf_mat = confusion_matrix(Y_test, reg_y_pred)
print(conf_mat)

# Get each sample's ID and probability, and write them to the output csv
dataset.reset_index(drop=True, inplace=True) # reset index
participant_ids = dataset.loc[X_test.index, 'ParticipantID']
output_df = pd.DataFrame({'ParticipantID': participant_ids, 'Probability_MI_Correct': reg_probabilities})
output_df.to_csv('regression_pred.csv', index=False)

# Merge the dataset with prediction set for accuracy evaluation, and send it to the csv file
orig_testing_data = pd.read_csv("NHANES_data_train.csv")
regression_prediction = pd.read_csv("regression_pred.csv")
merged_data = pd.merge(orig_testing_data, regression_prediction, on='ParticipantID', how='inner')
regression_true_MI_Label = merged_data[['ParticipantID', 'MI']]
regression_true_MI_Label.to_csv('regression_true_MI_Label.csv', index=False)

[[10 26]
 [ 9 90]]
