In [1]:
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import piplite
await piplite.install('seaborn')
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Fruit classification challenge!

**Data**:
<br>
We provide a training dataset comprising 10’000 photoplethysmography (PPG) signals sampled
at 10 Hz (30-second duration each). Each of these 10’000 PPG recordings has a label
(fruit name) that can be used to train an ML model.

**Goal**:<br>
The idea is to:
1. Design an ML-based solution to classify these PPG signals into the different fruits and
train it using the training dataset.
2. Generate the outputs for the 10’000 recordings of the test dataset and upload the results to the shared drive by 12:30h.


Recall common steps to tackle a Machine Learning challenge:
1. Exploratory Data Analysis
    -  Check the type of data we have
    -  Check data and label distribution
    -  Visualizations (scatterplots, histograms, boxplots, ...)
2. Feature selection:
    - Assess which features to use from the data
    - Iterative process, usually trial and error
3. Model selection
    - Try different models and evaluate them via train/validation split or k-fold cross validation
    - Select best candidate or few best candidates
4. Model evaluation
    - Assess the performance of the best model on a test dataset/submit the results on the test dataset
    

Let's load the data. Make sure you are handling the headers of the datasets correctly!

In [2]:
# Both CSVs are read with header=None, i.e. the first line of each file is
# treated as data rather than as column names.
data_raw = pd.read_csv('quiz_train_data.csv', header=None)
fruits_raw = pd.read_csv('quiz_train_labels.csv', header=None)

data = data_raw.values                # 2-D array, one row per CSV line
fruits = fruits_raw.values.flatten()  # 1-D array of labels

# Fail fast if a header line was mis-parsed as data: that would make the
# signal columns non-numeric or leave signals and labels misaligned.
assert np.issubdtype(data.dtype, np.number), "non-numeric values in quiz_train_data.csv"
assert len(data) == len(fruits), "signals and labels have different row counts"

fruit_names = np.unique(fruits)

In [34]:
# functions for feature generation
def normalizer_scaling(X):
    """Min-max scale every row of ``X`` independently to the range [0, 1].

    Parameters
    ----------
    X : array-like, 2-D
        One signal per row.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``X``; each row has its minimum mapped
        to 0 and its maximum mapped to 1. A constant row (max == min) is
        mapped to all zeros instead of NaN.
    """
    X = np.array(X)
    row_min = X.min(axis=1, keepdims=True)
    row_range = X.max(axis=1, keepdims=True) - row_min
    # A flat row has zero range; dividing by 1 instead keeps its (all-zero)
    # numerator at 0 and avoids the 0/0 -> NaN that would poison later steps.
    safe_range = np.where(row_range == 0, 1, row_range)
    return (X - row_min) / safe_range

# fft transformation
def generate_fft(X):
    """Return the FFT magnitude spectrum of each row of ``X``.

    ``X`` must be a 2-D array with one signal per row. With ``n`` the number
    of columns, the DC bin (index 0) and every bin from ``n // 2`` upwards
    are dropped.

    Returns a tuple ``(magnitudes, freqs)``: ``magnitudes`` has shape
    ``(rows, n // 2 - 1)`` and ``freqs`` holds the matching values ``k / n``
    for ``k = 1 .. n // 2 - 1``.
    """
    n_samples = X.shape[1]
    half = n_samples // 2
    spectrum = np.fft.fft(X)
    magnitudes = np.abs(spectrum)[:, 1:half]
    freqs = np.arange(1, half) / n_samples
    return magnitudes, freqs

def generate_features(X, include_fft=False):
    """Build the per-recording feature matrix used by the classifiers.

    Parameters
    ----------
    X : array-like, 2-D
        One signal per row.
    include_fft : bool, default False
        When True, append the FFT magnitude spectrum of the row-wise min-max
        scaled signal (via ``normalizer_scaling`` and ``generate_fft``) as
        extra columns. The default keeps the original output layout.

    Returns
    -------
    numpy.ndarray
        Columns are ``[mean, std, min, max, raw samples...]`` for each row,
        followed by the FFT magnitudes when ``include_fft`` is True.
    """
    X = np.asarray(X)
    # Per-row summary statistics of the raw signal, then the raw samples.
    columns = [
        np.mean(X, axis=1),
        np.std(X, axis=1),
        np.min(X, axis=1),
        np.max(X, axis=1),
        X,
    ]
    if include_fft:
        # Only pay for scaling + FFT when the spectrum is actually requested;
        # the default feature set does not contain it.
        X_fft, _ = generate_fft(normalizer_scaling(X))
        columns.append(X_fft)
    return np.column_stack(columns)

def report_accuracy(X, y, model, model_name):
    """Print the fraction of samples in ``X`` that ``model`` labels correctly.

    ``model`` must expose ``predict(X)``; its output is compared element-wise
    with ``y`` and the mean of the matches is printed with three decimals,
    prefixed by ``model_name``. Nothing is returned.
    """
    is_correct = model.predict(X) == y
    hit_rate = np.mean(is_correct)
    message = "%s Validation accuracy is %.3f" % (model_name, hit_rate)
    print(message)
    

# Train-validation split 

In [4]:
# Feature matrix built from the raw training signals; `y` still holds the
# labels exactly as read from the labels CSV at this point.
X = generate_features(data)
y = fruits

In [5]:
# Encode the fruit labels as integers for the classifiers.
# Fit on `fruits` rather than on `y`: `y = f(y)` is not idempotent, so
# re-running this cell would re-encode the already-encoded integers and
# `label_encoder.inverse_transform` would stop returning fruit names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(fruits)

In [6]:
# Hold out 20% of the training recordings for validation; the fixed
# random_state makes the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=40)
print(X_train.shape, X_val.shape)

(8000, 304) (2000, 304)


# Model training

In [36]:
# Logistic regression
model_lr = LogisticRegression(max_iter=100).fit(X_train, y_train)
# MLP
model_mlp = MLPClassifier(hidden_layer_sizes=(20,20,20), max_iter=1000).fit(X_train, y_train)
# XGBoost
model_xgb = XGBClassifier(n_estimators=20).fit(X_train, y_train,verbose=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
# Validation accuracy of each fitted model, reported in training order.
for val_name, val_model in (
    ('Logistic Regression', model_lr),
    ('MLP', model_mlp),
    ('XGBoost', model_xgb),
):
    report_accuracy(X_val, y_val, val_model, val_name)

Logistic Regression Validation accuracy is 0.296
MLP Validation accuracy is 0.536
XGBoost Validation accuracy is 0.629


# Evaluate on the test set

In [38]:
# Test signals and their labels, read with header=None so the first line of
# each file is treated as data rather than as column names.
test_data_raw = pd.read_csv('quiz_test_data.csv',header=None)
# Labels are flattened to a 1-D array for element-wise comparison later.
test_labels = pd.read_csv('quiz_test_labels.csv',header=None).values.flatten()
test_data = test_data_raw.values

In [39]:
# Build the test features with the same function used for the training
# data. No visible earlier cell defines `X_test`, so without this line the
# loop below raises NameError on a fresh kernel.
X_test = generate_features(test_data)

# Test accuracy per model, printed in the order LR, MLP, XGBoost.
for model in [model_lr, model_mlp, model_xgb]:
    test_predictions = model.predict(X_test)
    # Map the integer predictions back to the original labels before comparing.
    test_fruit_predictions = label_encoder.inverse_transform(test_predictions)
    print(np.mean(test_fruit_predictions==test_labels))

0.2928
0.5386
0.6332
