In [15]:
### Packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [16]:
### Loading and cleaning dataset 1
nhanes = pd.read_csv("dataset1/NHANES_age_prediction.csv")

## Rename columns
nhanes = nhanes.rename(columns = {
    "SEQN":"Index",
    "RIDAGEYR":"Age",
    "RIAGENDR":"Gender",
    "PAQ605":"Fitness",
    "BMXBMI":"BMI",
    "LBXGLU":"Blood_glucose",
    "DIQ010":"Diabetic",
    "LBXGLT": "Oral",
    "LBXIN": "Insulin"
})

## Checking for missing values
# no missing values!
nhanes.isnull().sum()
nhanes.describe()

# Wonky value of 7 for 1 row in "Fitness"
nhanes["Fitness"].value_counts()
# Dropping the row:
nhanes = nhanes.drop(nhanes[nhanes["Fitness"] == 7].index)
# Verify:
nhanes["Fitness"].value_counts()

2.0    1868
1.0     409
Name: Fitness, dtype: int64

In [29]:
### Loading and cleaning dataset 2
bcw = pd.read_csv("dataset2/breast-cancer-wisconsin.csv")

# Removing the first column as it contains ids that we don't need
bcw = bcw.drop(bcw.columns[0], axis=1)

# Creating column names
column_names = ["clump_thickness","cell_uniformity","cell_shape",
                "marginal_adhesion","epithereal_cell_size","bare_nuclei",
                "bland_chromatin","normal_nucleoli","mitoses","class"]
bcw.columns = column_names

# Replacing all '?' characters with NaN
bcw.replace('?', np.nan, inplace=True)

# Converting all rows to numeric values, setting any rows that can't be converted to NaN
bcw = bcw.apply(pd.to_numeric, errors='coerce')

# Dropping all rows with NaN
bcw = bcw.dropna()

bcw["class"] = [0 if x == 2 else 1 for x in bcw["class"]]

bcw = bcw.rename(columns = {"class": "malignant"})

bcw.head()

Unnamed: 0,clump_thickness,cell_uniformity,cell_shape,marginal_adhesion,epithereal_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,malignant
0,5,4,4,5,7,10.0,3,2,1,0
1,3,1,1,1,2,2.0,3,1,1,0
2,6,8,8,1,3,4.0,3,7,1,0
3,4,1,1,3,2,1.0,3,1,1,0
4,8,10,10,8,7,10.0,9,7,1,1


In [18]:
### ----- Dataset 1 Summary stats ----- ###

## Gender
print("Gender proportions by age group:")
print(nhanes.groupby("age_group")["Gender"].value_counts(normalize = True),
      "\n\n")
# No apparent impact of gender upon age group

## Fitness
nhanes["Fitness"] = [0 if x == 2 else 1 for x in nhanes["Fitness"]]

print("Fitness levels by age group:")
print(nhanes.groupby("age_group")["Fitness"].value_counts(normalize = True),
      "\n\n")
# Fitness does appear to predict/depend upon age group

## BMI
print("BMI summary by age group:\n",
      nhanes.groupby("age_group")["BMI"].describe(),
      "\n\n")
# not crazy different, but may be signifcant

## Blood Glucose
print("Blood glucose summary by age group:\n",
      nhanes.groupby("age_group")["Blood_glucose"].describe(),
      "\n\n")
# the seniors have noticeably higher blood glucose levels

## Diabetic
# 1: Yes diabetes
# 2: No diabetes (to be -> 0)
# 3: Borderline (to be -> 1)

nhanes["Diabetic"] = [0 if x == 2 else 1 for x in nhanes["Diabetic"]]


print("Diabetic value counts by age group:\n",
      nhanes["Diabetic"].value_counts(),
      "\n\n")
# Values are 2, 3, 1. 2 means not-diabetic, don't know what 1 and 3 mean
print(nhanes.groupby("age_group")["Diabetic"].value_counts(normalize = True))
# Higher proportion of 1s and 3s among seniors -- prolly big indicator

## Oral
nhanes.groupby("age_group")["Oral"].describe()
# Much higher among seniors rather than adults

## Insulin
nhanes.groupby("age_group")["Insulin"].describe()
# Lower among seniors vs adults


## Variables to consider in KNN:
#
# - Fitness (categorical -- 2 levels)
# - BMI (cont.)
# - Blood glucose (cont.)
# - Diabetic (categorical -- 3 levels)
# - Oral (cont.)
# - Insulin (cont.)


Gender proportions by age group:
age_group  Gender
Adult      2.0       0.512284
           1.0       0.487716
Senior     2.0       0.508242
           1.0       0.491758
Name: Gender, dtype: float64 


Fitness levels by age group:
age_group  Fitness
Adult      0          0.803450
           1          0.196550
Senior     0          0.909341
           1          0.090659
Name: Fitness, dtype: float64 


BMI summary by age group:
             count       mean       std   min   25%   50%   75%   max
age_group                                                           
Adult      1913.0  27.971877  7.526883  14.5  22.6  26.8  31.4  70.1
Senior      364.0  27.886264  5.574166  16.8  24.2  27.2  30.6  52.2 


Blood glucose summary by age group:
             count        mean        std   min   25%    50%    75%    max
age_group                                                                
Adult      1913.0   98.638787  18.258651  63.0  91.0   96.0  103.0  405.0
Senior      364.0  104.3296

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adult,1913.0,12.110774,10.061061,0.14,5.99,9.2,14.8,102.29
Senior,364.0,10.405247,7.530538,1.02,5.2475,8.465,13.2125,52.89


In [19]:
## Distance functions

euclidean = lambda x1, x2: np.sqrt(np.sum((x1 - x2)**2, axis=-1))
manhattan = lambda x1, x2: np.sum(np.abs(x1 - x2), axis=-1)


In [20]:
### Implementation of KNN

# This class will represent an instance of the KNN model with a static K
## Distance functions:
euclidean = lambda x1, x2: np.sqrt(np.sum((x1 - x2)**2, axis=-1))
manhattan = lambda x1, x2: np.sum(np.abs(x1 - x2), axis=-1)

## Defining the KNN class
class KNN:

    def __init__(self, K = 1, dist_fn = euclidean):
        self.dist_fn = dist_fn
        self.K = K
        return
    
    def fit(self, x, y):
        self.x = x
        self.y = y
        self.C = np.max(y) + 1
        return self
    
    def predict(self, x_test):
        num_test = x_test.shape[0]

        distances = self.dist_fn(self.x[None,:,:], x_test[:,None,:])
        #ith-row of knns stores the indices of k closest training samples to the ith-test sample 
        knns = np.zeros((num_test, self.K), dtype=int)
        #ith-row of y_prob has the probability distribution over C classes
        y_prob = np.zeros((num_test, self.C))
        for i in range(num_test):
            # print(i)
            knns[i,:] = np.argsort(distances[i])[:self.K]
            # print(knns[i,:])
            y_prob[i,:] = np.bincount(self.y[knns[i,:]], minlength=self.C) #counts the number of instances of each class in the K-closest training samples
        #y_prob /= np.sum(y_prob, axis=-1, keepdims=True)
        #simply divide by K to get a probability distribution
        y_prob /= self.K
        return y_prob, knns


In [21]:
## Scaling for Dataset 1

def feature_normalization(dataset):
    for c in dataset.columns:
        mean = dataset[c].mean()
        st_dev = dataset[c].std()

        dataset[c].apply(lambda x: (x - mean)/st_dev)

In [24]:
### KNN model 1: Dataset 1 with certain variables

# Model 1: continuous variables only for simplicity

nhanes["age_group"] = [0 if x == "Adult" else 1 for x in nhanes["age_group"]]

nhanes_m1 = nhanes[["BMI", "Blood_glucose", "Oral", "Insulin"]]
nhanes_target = nhanes["age_group"]


## Step 1: splitting data into train, validation, test, roughly 50%, 25%, 25%

X_train, X_test, y_train, y_test = train_test_split(
    nhanes_m1, nhanes_target, test_size = 0.25, random_state = 21

)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size = 0.33, random_state=22
)

print("Training features array dimensions:", X_train.shape)
print("Training target array dimensions:", y_train.shape, "\n")

print("Validation features array dimensions:", X_valid.shape)
print("Validation target array dimensions:", y_valid.shape, "\n")

print("Test features array dimensions:", X_test.shape)
print("Test target array dimensions:", y_test.shape, "\n")

X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

Training features array dimensions: (1143, 4)
Training target array dimensions: (1143,) 

Validation features array dimensions: (564, 4)
Validation target array dimensions: (564,) 

Test features array dimensions: (570, 4)
Test target array dimensions: (570,) 



In [25]:
## Running KNN on dataset 1
model_1 = KNN(K = 5)

model_1.fit(X_train, y_train)

print(y_train.shape)

m1_preds, m1_probs = model_1.predict(X_test)

accuracy = np.sum(m1_preds == y_test)/y_test.shape[0]

print(accuracy)
    
print(m1_probs[:10])
print(m1_preds[:10])
print(y_test[:10])

(1143,)


InvalidIndexError: (None, slice(None, None, None), slice(None, None, None))