In [11]:
### Packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [12]:
### Loading and cleaning dataset 1
nhanes = pd.read_csv("dataset1/NHANES_age_prediction.csv")

## Rename columns
# Map the raw NHANES variable codes to readable column names.
# Columns not listed here (e.g. "age_group") keep their original name.
nhanes = nhanes.rename(columns = {
    "SEQN":"Index",
    "RIDAGEYR":"Age",
    "RIAGENDR":"Gender",
    "PAQ605":"Fitness",
    "BMXBMI":"BMI",
    "LBXGLU":"Blood_glucose",
    "DIQ010":"Diabetic",
    "LBXGLT": "Oral",
    "LBXIN": "Insulin"
})

## Checking for missing values
# A notebook cell only renders its LAST bare expression, so intermediate
# checks are asserted / printed explicitly instead of being silently dropped.
assert not nhanes.isnull().any().any(), "unexpected missing values in NHANES data"
print(nhanes.describe())

# Wonky value of 7 for 1 row in "Fitness"
# (NOTE(review): 7 appears to be the NHANES "Refused" code for PAQ605 -- confirm in the codebook)
print(nhanes["Fitness"].value_counts())
# Dropping the row:
nhanes = nhanes.drop(nhanes[nhanes["Fitness"] == 7].index)
# Verify: only the two valid levels (1 and 2) should remain
assert nhanes["Fitness"].isin([1, 2]).all(), "unexpected Fitness codes remain after cleaning"
nhanes["Fitness"].value_counts()

Fitness
2.0    1868
1.0     409
Name: count, dtype: int64

In [13]:
### Loading and cleaning dataset 2
# NOTE(review): read_csv treats the first line of the file as a header; if the
# CSV has no header row, the first sample is consumed here -- confirm.
bcw = pd.read_csv("dataset2/breast-cancer-wisconsin.csv")

# The leading column only holds sample ids, which carry no predictive signal
bcw = bcw.drop(columns=bcw.columns[0])

# Human-readable names for the nine features plus the target column
column_names = [
    "clump_thickness",
    "cell_uniformity",
    "cell_shape",
    "marginal_adhesion",
    "epithereal_cell_size",
    "bare_nuclei",
    "bland_chromatin",
    "normal_nucleoli",
    "mitoses",
    "class",
]
bcw.columns = column_names

# Cleaning pipeline:
#   1. '?' placeholders become NaN
#   2. every column is converted to numeric (anything unparseable becomes NaN)
#   3. rows containing any NaN are discarded
bcw = (
    bcw.replace('?', np.nan)
       .apply(pd.to_numeric, errors='coerce')
       .dropna()
)

In [24]:
# Quick look at the BMI distribution (count, mean, spread, quartiles)
nhanes.loc[:, "BMI"].describe()

count    2277.000000
mean       27.958191
std         7.249129
min        14.500000
25%        22.800000
50%        26.800000
75%        31.200000
max        70.100000
Name: BMI, dtype: float64

In [None]:
### ----- Dataset 1 Summary stats ----- ###
# A notebook cell only renders its LAST bare expression, so every table below
# is printed explicitly; otherwise the observations in the comments would refer
# to output the reader never sees.

## Gender
print("Gender proportions by age group:")
print(nhanes.groupby("age_group")["Gender"].value_counts(normalize = True))
# No apparent impact of gender upon age group

## Fitness
print("Fitness levels by age group:")
print(nhanes.groupby("age_group")["Fitness"].value_counts(normalize = True))
# Fitness does appear to predict/depend upon age group

## BMI
print("BMI by age group:")
print(nhanes.groupby("age_group")["BMI"].describe())
# Not hugely different, but may be significant

## Blood Glucose
print("Blood glucose by age group:")
print(nhanes.groupby("age_group")["Blood_glucose"].describe())
# The seniors have noticeably higher blood glucose levels

## Diabetic
print("Diabetic value counts:")
print(nhanes["Diabetic"].value_counts())
# Values are 2, 3, 1. 2 means not diabetic.
# NOTE(review): per the NHANES DIQ010 codebook, 1 = yes and 3 = borderline -- confirm.
print("Diabetic status by age group:")
print(nhanes.groupby("age_group")["Diabetic"].value_counts(normalize = True))
# Higher proportion of 1s and 3s among seniors -- probably a strong indicator

## Oral
print("Oral by age group:")
print(nhanes.groupby("age_group")["Oral"].describe())
# Much higher among seniors than adults

## Insulin
print("Insulin by age group:")
print(nhanes.groupby("age_group")["Insulin"].describe())
# Lower among seniors vs adults


## Variables to consider in KNN:
#
# - Fitness (categorical -- 2 levels)
# - BMI (cont.)
# - Blood glucose (cont.)
# - Diabetic (categorical -- 3 levels)
# - Oral (cont.)
# - Insulin (cont.)


In [None]:
## Distance functions

def euclidean_distance(r1: pd.Series, r2: pd.Series):
    """Return the Euclidean (L2) distance between two DataFrame rows.

    Each row is read as a point in space whose coordinates are the values held
    in the row's columns; the result is the straight-line distance between the
    two points.
    """
    coordinate_gap = np.array(r1) - np.array(r2)
    return np.linalg.norm(coordinate_gap)


## TODO: add a second distance function here so the two can be compared


In [None]:
### Implementation of KNN

# This class will represent an instance of the KNN model with a static K
class KNN:
    """Brute-force k-nearest-neighbours classifier with a fixed K.

    ``fit`` only stores the training data. All of the work happens in
    ``predict``, which measures the distance from every test row to every
    training row with ``dist_fn`` and lets the K closest training rows vote.
    """

    def __init__(self, K = 1, dist_fn = euclidean_distance):
        # dist_fn is called as dist_fn(test_row, train_row); its return values
        # are sorted ascending, so smaller must mean "closer".
        self.dist_fn = dist_fn
        self.K = K

    def fit(self, training_features, training_target):
        """Memorize the training data.

        training_features: row-indexable via ``.iloc`` (e.g. a DataFrame).
        training_target:   one label per training row, in the same row order.
        """
        self.x = training_features
        self.y = training_target

    def predict(self, test_data) -> tuple:
        """Predict a label for every row of ``test_data``.

        Returns a ``(predictions, pred_probs)`` pair of equal-length lists:
        ``predictions[i]`` is the winning label for test row ``i`` and
        ``pred_probs[i]`` is the fraction of the K neighbours that voted for it.

        Raises ValueError if ``self.K`` is smaller than 1.
        """
        # K == 0 would yield no voters and a negative K would silently slice
        # off the *farthest* rows instead of selecting the nearest ones.
        if self.K < 1:
            raise ValueError(f"K must be at least 1, got {self.K}")

        # Neighbours are identified by POSITION in the training data, so the
        # labels must be looked up by position as well. A plain array view
        # makes that independent of whatever index labels the target carries
        # (and keeps plain arrays / lists working as targets).
        labels = np.asarray(self.y)

        # Extracting a row is loop-invariant with respect to the test rows,
        # so do it once up front instead of once per (test row, train row) pair.
        train_rows = [self.x.iloc[j] for j in range(len(self.x))]

        predictions = []
        pred_probs = []

        # Looping through each point in the test dataset
        for i in range(len(test_data)):
            test_row = test_data.iloc[i]

            # Distance from the current test point to every training point
            distances = [self.dist_fn(test_row, train_row) for train_row in train_rows]

            # Positions of the K closest training rows. sorted() is stable, so
            # equal distances keep the earlier training row first.
            nearest_neighbors = sorted(range(len(distances)), key=distances.__getitem__)[:self.K]

            pred_classes = [labels[j] for j in nearest_neighbors]

            # Share of the neighbour votes won by each class, most frequent first
            class_probs = pd.Series(pred_classes).value_counts(normalize = True)

            # The first entry is the majority class and its vote share
            predictions.append(class_probs.index.to_list()[0])
            pred_probs.append(class_probs.iloc[0])

        return predictions, pred_probs

In [None]:
### KNN model 1: Dataset 1 with certain variables

# Model 1: continuous variables only for simplicity

nhanes_m1 = nhanes[["BMI", "Blood_glucose", "Oral", "Insulin"]]
nhanes_target = nhanes["age_group"]

print(nhanes_m1.head(), "\n")

## Step 1: splitting data into train, validation, test, roughly 50%, 25%, 25%

# First hold out 25% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    nhanes_m1, nhanes_target, test_size = 0.25, random_state = 21
)

# ... then carve 33% of the remaining 75% (about 25% overall) off as validation
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size = 0.33, random_state=22
)

# The three splits must partition the data exactly
assert len(X_train) + len(X_valid) + len(X_test) == len(nhanes_m1)

print("Training features array dimensions:", X_train.shape)
print("Training target array dimensions:", y_train.shape, "\n")

print("Validation features array dimensions:", X_valid.shape)
print("Validation target array dimensions:", y_valid.shape, "\n")

print("Test features array dimensions:", X_test.shape)
print("Test target array dimensions:", y_test.shape, "\n")

# Give every split a fresh 0..n-1 index so features and targets can be
# addressed by position. The validation split is reset as well, keeping all
# three splits consistent.
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

X_valid = X_valid.reset_index(drop=True)
y_valid = y_valid.reset_index(drop=True)

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [None]:
## Running KNN on dataset 1
# Fit a 5-nearest-neighbour model on the training split
model_1 = KNN(K=5)
model_1.fit(X_train, y_train)

print(y_train.shape)

# Predicted label and winning vote share for every test row
m1_preds, m1_probs = model_1.predict(X_test)

# Accuracy = share of test rows whose prediction matches the true label
accuracy = (m1_preds == y_test).sum() / len(y_test)
print(accuracy)

# Peek at the first ten vote shares, predictions and true labels
print(m1_probs[:10])
print(m1_preds[:10])
print(y_test.iloc[:10])