### Part I: kNN
---

Train kNN model

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

data_df = pd.read_csv("NHANES_data_train.csv")

# Split the data into rows with MIs (MI == 1) and those without (MI == 2)
MIs = data_df[data_df["MI"] == 1]
noMIs = data_df[data_df["MI"] == 2]

# What variables make sense to include?
feature_cols = ['Sex', 'Age', 'Race', 'Diastolic',
    'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL',
    'kidneys_eGFR', 'Diabetes', 'CurrentSmoker', 'isActive']

# --- Preprocessing that does not depend on the iteration: done once, up front ---

# Data with no MIs is a larger set than that with MIs, so okay to drop NaNs
noMIs = noMIs.dropna()

# Fill missing data in MIs using the imputer method considering 5 neighbors
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(MIs)
imputed_df = pd.DataFrame(imputed_data, columns=MIs.columns)

iterations = 100
total = 0

# Each iteration draws a different undersample of the no-MI rows; the
# train/test split seed and n_neighbors stay fixed. The reported number is the
# accuracy averaged over those draws.
for i in range(iterations):

    # Undersample the no-MI rows (11% of them). Seeding with the iteration
    # number gives a different draw per iteration but the same sequence on
    # every run, so the average and the final clf2 are reproducible.
    undersample_noMIs = noMIs.sample(frac=0.11, random_state=i)

    # combine datasets
    # Ignore index so the combined frame gets a fresh 0..n-1 row index
    data = pd.concat([imputed_df, undersample_noMIs], ignore_index=True)

    X = data[feature_cols]
    y = data['MI']

    # Split the dataset into random training and testing sets
    # random_state and n_neighbors were chosen by a separate search over
    # combinations (two nested for loops) that picked the highest accuracy
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=43)

    # Scale the data, and choose how many neighbors to consider
    clf2 = Pipeline(
        steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=58))]
    )

    # set model parameters from training set
    clf2.fit(X_train, y_train)

    # add the accuracy to total.. used to calculate the average accuracy
    accuracy = clf2.score(X_test, y_test)
    total += accuracy

# clf2 now holds the model fitted in the last iteration; the scoring cell uses it.
print("Average accuracy over ", iterations, "iterations:", total / iterations)


Average accuracy over  100 iterations: 0.7960810810810811


Run "NHANES_test_data_4_students.csv" through the trained kNN model

In [22]:
# read the dataset into a dataframe
data = pd.read_csv("NHANES_test_data_4_students.csv")

# Save participant IDS in a list for later
ParticipantID = data['ParticipantID'].tolist()

# choose the features we want to use from the dataset
data = data[['Sex', 'Age', 'Race', 'Diastolic',
        'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL',
        'kidneys_eGFR', 'Diabetes', 'CurrentSmoker', 'isActive']]

# Fill missing data in using the imputer method considering 5 neighbors
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(data)
imputed_df = pd.DataFrame(imputed_data, columns=data.columns)

# run the model with this data
new_pred = clf2.predict_proba(imputed_df)

# predict_proba returns one column per class, in the order of clf2.classes_
# (the sorted labels). The training data codes MI == 1 as "had an MI" and
# MI == 2 as "no MI", so the MI probability is the column for label 1.
# Column index 1 would be label 2, i.e. the probability of NO MI.
mi_column = list(clf2.classes_).index(1)

# probability of a participant suffering an MI in the near future, as a list
Pred_Probability = new_pred[:, mi_column].tolist()

results = pd.DataFrame({'ParticipantID': ParticipantID, 'Pred_Probability': Pred_Probability})
results.to_csv("kNN_pred.csv", index=False)


### Part II: Logistic Regression
---

Train Logistic Regression model

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

data_df = pd.read_csv("NHANES_data_train.csv")
total = 0
# Was misspelled "interations", so the loop silently relied on the value of
# `iterations` left behind by the kNN cell.
iterations = 100

# Split the data into rows with MIs (MI == 1) and those without (MI == 2)
MIs = data_df[data_df["MI"] == 1]
noMIs = data_df[data_df["MI"] == 2]

# What variables make sense to include?
feature_cols = ['Sex', 'Age', 'Race', 'Diastolic',
    'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL',
    'kidneys_eGFR', 'Diabetes', 'CurrentSmoker', 'isActive']

# --- Preprocessing that does not depend on the iteration: done once, up front ---

# Data with no MIs is a larger set than that with MIs, so okay to drop NaNs
noMIs = noMIs.dropna()

# Fill missing data in MIs using the imputer method considering 5 neighbors
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(MIs)
imputed_df = pd.DataFrame(imputed_data, columns=MIs.columns)

# Each iteration draws a different undersample of the no-MI rows; the
# train/test split seed stays fixed. The reported number is the accuracy
# averaged over those draws.
for i in range(iterations):

    # Undersample the no-MI rows (11% of them). Seeding with the iteration
    # number gives a different draw per iteration but the same sequence on
    # every run, so the average and the final logreg are reproducible.
    undersample_noMIs = noMIs.sample(frac=0.11, random_state=i)

    # combine datasets
    # Ignore index so the combined frame gets a fresh 0..n-1 row index
    data = pd.concat([imputed_df, undersample_noMIs], ignore_index=True)

    X = data[feature_cols]
    y = data['MI']

    # Split the dataset into random training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=5)

    logreg = LogisticRegression(C=1000, max_iter=100000).fit(X_train, y_train)

    total += logreg.score(X_test, y_test)

# Optional diagnostics for the model fitted in the last iteration:
# from sklearn.metrics import confusion_matrix
# y_predicted = logreg.predict(X_test)
# print("confusion matrix:\n", confusion_matrix(y_test, y_predicted))

# print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
# print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))

# logreg now holds the model fitted in the last iteration; the scoring cell uses it.
print("Average accuracy over ", iterations, "iterations:", total / iterations)

Average accuracy over  100 iterations: 0.7571621621621623


Run "NHANES_test_data_4_students.csv" through the trained logistic regression model

In [40]:
# read the dataset into a dataframe
data = pd.read_csv("NHANES_test_data_4_students.csv")

# Save participant IDS in a list for later
ParticipantID = data['ParticipantID'].tolist()

# choose the features we want to use from the dataset
data = data[['Sex', 'Age', 'Race', 'Diastolic',
        'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL',
        'kidneys_eGFR', 'Diabetes', 'CurrentSmoker', 'isActive']]

# Fill missing data in using the imputer method considering 5 neighbors
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(data)
imputed_df = pd.DataFrame(imputed_data, columns=data.columns)

# run the model with this data
new_pred = logreg.predict_proba(imputed_df)

# predict_proba returns one column per class, in the order of logreg.classes_
# (the sorted labels). The training data codes MI == 1 as "had an MI" and
# MI == 2 as "no MI", so the MI probability is the column for label 1.
# Column index 1 would be label 2, i.e. the probability of NO MI.
mi_column = list(logreg.classes_).index(1)

# probability of a participant suffering an MI in the near future, as a list
Pred_Probability = new_pred[:, mi_column].tolist()

results = pd.DataFrame({'ParticipantID': ParticipantID, 'Pred_Probability': Pred_Probability})
results.to_csv("regression_pred.csv", index=False)
