In [4]:
import pandas as pd
import os
import numpy as np

THRESHOLD = 2  # minimum number of non-null values a row/column needs to survive
ROWS = 0       # pandas axis number for rows
COLUMNS = 1    # pandas axis number for columns

# Build the path with os.path.join so a single code path works on every OS.
# (Branching on os.name left `data` undefined on any platform that is neither
# 'nt' nor 'posix', and duplicated the read_excel call.)
DATA_PATH = os.path.join(os.pardir, 'data', 'nutrient-file-release2-jan22.xlsx')
SHEET_NAME = 'All solids & liquids per 100g'

data = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)

# replace 0 with NaN so that zeros count as "missing" in the threshold check below
data = data.replace({0: np.nan})

# remember the row/column labels before dropping
before_cols = data.columns
before_rows = data.index

# remove columns, then rows, with fewer than THRESHOLD non-null values
data = data.dropna(thresh=THRESHOLD, axis=COLUMNS)
data = data.dropna(thresh=THRESHOLD, axis=ROWS)

# row/column labels that survived
after_cols = data.columns
after_rows = data.index

# labels present before but not after, in their original order
removed_rows = before_rows[~before_rows.isin(after_rows)].tolist()
removed_cols = before_cols[~before_cols.isin(after_cols)].tolist()

# report what was removed
print("Removed", len(removed_rows), "rows")
print("Removed", len(removed_cols), "columns")
print("Removed rows: ", removed_rows)
print("Removed columns: ", removed_cols)

Removed 0 rows
Removed 48 columns
Removed rows:  []
Removed columns:  ['Glycerol \n(g)', 'Erythritol \n(g)', 'Maltitol \n(g)', 'Xylitol \n(g)', 'Oligosaccharides  \n(g)', 'Polydextrose \n(g)', 'Fumaric acid \n(g)', 'Shikimic acid \n(g)', '25-hydroxy ergocalciferol (25-OH D2) \n(ug)', 'Delta tocotrienol \n(mg)', 'Gamma tocotrienol \n(mg)', 'C23 (%T)', 'C10:1 (%T)', 'C18:1w5 (%T)', 'C18:1w6 (%T)', 'C18:1w7 (%T)', 'C18:1w9 (%T)', 'C20:1w9 (%T)', 'C20:1w13 (%T)', 'C20:1w11 (%T)', 'C22:1w9 (%T)', 'C22:1w11 (%T)', 'C24:1w9 (%T)', 'C24:1w11 (%T)', 'C24:1w13 (%T)', 'C12:2 (%T)', 'C16:3 (%T)', 'C20:3 (%T)', 'C20:4 (%T)', 'C23 (g)', 'C10:1 (g)', 'C12:1 (g)', 'C18:1w5 (mg)', 'C18:1w6 (mg)', 'C18:1w7 (g)', 'C18:1w9 (mg)', 'C20:1w9 (mg)', 'C20:1w13 (mg)', 'C20:1w11 (mg)', 'C22:1w9 (mg)', 'C22:1w11 (mg)', 'C24:1w9 (mg)', 'C24:1w11 (mg)', 'C24:1w13 (mg)', 'C12:2 (g)', 'C16:3 (g)', 'C20:3 (mg)', 'C20:4 (g)']


## $k$-NN

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# split data into train and test sets; the fixed random_state makes the split
# (and every metric computed from it) identical on each Restart & Run All
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# target: the second column of the frame (used as the classification label)
y_COL = data.columns[1]

# features: every column from the fourth one onwards
# (presumably the nutrient measurements - verify against the sheet layout)
X_COLS = data.columns[3:].tolist()

print(f"X_COLS: {X_COLS}")
print(f"y_COL: {y_COL}")

In [None]:
# feature matrix X and label vector y for each split
X_train, y_train = train[X_COLS], train[y_COL]
X_test, y_test = test[X_COLS], test[y_COL]

# preview the first five training rows and their labels
print(f"X_train: {X_train.head()}")
print("Labels:", y_train.head().values)

# same preview for the test split
print(f"X_test: {X_test.head()}")
print("Labels:", y_test.head().values)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# create knn with 3 neighbors
knn = KNeighborsClassifier(n_neighbors=3)

# Replace NaN values with 0 (the estimator cannot handle NaN).
# The names are rebound to new objects instead of using inplace=True: these
# objects were sliced out of `train`/`test`, and filling a slice in place
# triggers SettingWithCopyWarning and may write through to the parent frame.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
y_train = y_train.fillna(0)
y_test = y_test.fillna(0)

# train the model using the training set
knn.fit(X_train, y_train)

In [None]:
import warnings

# predict the labels of the test set
# FutureWarnings are silenced only for the duration of this call; the filter is
# restored on leaving the `with` block, so later cells still report them.
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)
    y_pred = knn.predict(X_test)

# print the predictions
print(f"First 5 predictions: {y_pred[:5]}")

In [None]:
# check the accuracy
from sklearn.metrics import accuracy_score

# y_test contains the true labels of the test set,
# y_pred contains the labels the model predicted for the same rows
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# show confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns

# Every class that occurs in the true or the predicted labels, sorted.
# Using one shared label list keeps both matrices square and identically
# ordered, and lets the axes show real class values instead of positions.
labels = np.union1d(y_test, y_pred)

# Per-cell counts become unreadable once there are many classes, so they are
# only drawn for small matrices.
MAX_ANNOTATED_CLASSES = 20
show_counts = len(labels) <= MAX_ANNOTATED_CLASSES

# create confusion matrix (rows = true labels, columns = predicted labels)
cm = confusion_matrix(y_test, y_pred, labels=labels)

# create the same matrix using crosstab; crosstab only keeps labels that occur
# on each side, so reindex to the shared label list to line up the diagonal
cm2 = pd.crosstab(
    np.asarray(y_test),
    np.asarray(y_pred),
    rownames=['True Labels'],
    colnames=['Predicted Labels'],
).reindex(index=labels, columns=labels, fill_value=0)

# plot confusion matrix with scikit-learn
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
fig, ax = plt.subplots(figsize=(10, 8))
disp.plot(ax=ax, include_values=show_counts, xticks_rotation='vertical')
ax.set_title("Confusion Matrix")
plt.show()

# plot the confusion matrix using seaborn
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm2, annot=show_counts, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()

# Linear Regression

In [None]:
import random
import seaborn as sns
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# fresh split for the regression task; the fixed random_state keeps it reproducible
train, test = train_test_split(data, test_size=0.2, random_state=42)

# sample some rows of the new training set
# (X_train still refers to the k-NN design matrix from the earlier split at
# this point, so sample from `train` itself)
train.sample(10, random_state=42)

In [None]:
# features: every column from the sixth onwards; target: the fifth column
X_COLS = list(data.columns[5:])
y_COL = data.columns[4]

print(f"X_COLS: {X_COLS}")
print(f"y_COL: {y_COL}")

# create design matrix X and predictions y
# explicit copies keep later NaN filling from touching `train`/`test`
X_train = train[X_COLS].copy()
y_train = train[y_COL].copy()
X_test = test[X_COLS].copy()
y_test = test[y_COL].copy()

# show some rows of the training set and their targets; only the last
# expression of a cell is rendered automatically, so display both explicitly
# (`display` is provided by the IPython/Jupyter kernel)
display(X_train.head(), y_train.head())

In [None]:
# create and fit linear model
lm = LinearRegression()

# Replace NaN values with 0 (the estimator cannot handle NaN).
# The names are rebound to new objects instead of using inplace=True: these
# objects were sliced out of `train`/`test`, and filling a slice in place
# triggers SettingWithCopyWarning and may write through to the parent frame.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
y_train = y_train.fillna(0)
y_test = y_test.fillna(0)

lm.fit(X_train, y_train)

In [None]:
# fitted parameters of the linear model
print(f"Intercept: {lm.intercept_}")
print(f"Coefficients: {lm.coef_}")

# predict the held-out rows and report the mean squared error
y_pred = lm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")

In [None]:
# true targets (x axis) against the model's predictions (y axis)
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("True Values")
ax.set_ylabel("Predictions")
plt.show()