In [1]:
# Background
# NOTE(review): the triple-quoted text below is a bare string expression, so
# the notebook echoes it back as this cell's output. A markdown cell is the
# usual home for narrative text like this.
"""I found this code on the google developer ML crash course
website and I really liked how they used the data classes
to organize the ML training so I adapted their architecture here"""

'I found this code on the google developer ML crash course\nwebsite and I really liked how they used the data classes\nto organize the ML training so I adapted their architecture here'

In [2]:
# Load the imports

import keras
import numpy as np
import pandas as pd

from dataclasses import dataclass

import matplotlib.pyplot as plt

In [3]:
# Load the dataset
# The CSV is fetched over the network each time this cell runs.
RICE_DATASET_URL = "https://download.mlcc.google.com/mledu-datasets/Rice_Cammeo_Osmancik.csv"

rice_dataset_raw = pd.read_csv(RICE_DATASET_URL)

In [4]:
# Read and provide statistics on the dataset.

# Seven numeric shape measurements plus the 'Class' label column.
selected_columns = [
    'Area',
    'Perimeter',
    'Major_Axis_Length',
    'Minor_Axis_Length',
    'Eccentricity',
    'Convex_Area',
    'Extent',
    'Class',
]
rice_dataset = rice_dataset_raw[selected_columns]

# Last expression of the cell: summary statistics of the numeric columns.
rice_dataset.describe()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent
count,3810.0,3810.0,3810.0,3810.0,3810.0,3810.0,3810.0
mean,12667.727559,454.23918,188.776222,86.31375,0.886871,12952.49685,0.661934
std,1732.367706,35.597081,17.448679,5.729817,0.020818,1776.972042,0.077239
min,7551.0,359.100006,145.264465,59.532406,0.777233,7723.0,0.497413
25%,11370.5,426.144753,174.353855,82.731695,0.872402,11626.25,0.598862
50%,12421.5,448.852493,185.810059,86.434647,0.88905,12706.5,0.645361
75%,13950.0,483.683746,203.550438,90.143677,0.902588,14284.0,0.726562
max,18913.0,548.445984,239.010498,107.54245,0.948007,19099.0,0.86105


In [5]:
# Normalize and transform the data

# Z-score every numeric column: (value - column mean) / column std.
# NOTE(review): the mean/std are computed over the full dataset, i.e. before
# the train/test/validation split performed in a later cell, so held-out rows
# contribute to the normalization constants. Confirm that this is acceptable.
cols_of_numerical_features = rice_dataset.select_dtypes('number').columns
feature_mean = rice_dataset.mean(numeric_only=True)
feature_std = rice_dataset.std(numeric_only=True)

rice_dataset_normalized = (
    (rice_dataset[cols_of_numerical_features] - feature_mean) / feature_std
).assign(
    # carry the original string label over unchanged
    Class=rice_dataset['Class'],
    # binary label: 1 when the class is 'Cammeo', 0 otherwise
    Class_Bool=lambda frame: (frame['Class'] == 'Cammeo').astype(int),
)

rice_dataset_normalized.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class,Class_Bool
0,1.479635,2.004091,2.348238,-0.212915,2.018073,1.499463,-1.15277,Cammeo,1
1,1.14772,1.125705,0.988261,0.945444,0.409964,1.192761,-0.602,Cammeo,1
2,1.13502,1.317041,1.451718,0.253854,1.212797,1.126356,0.405558,Cammeo,1
3,0.293398,0.115285,0.261405,0.198025,0.23972,0.233826,-0.275315,Cammeo,1
4,1.166191,1.486858,1.316269,0.523351,0.952096,1.299685,-0.205986,Cammeo,1


In [6]:
# Create train, test, and validation datasets

# Set the random seed through Keras. The shuffle below uses its own
# explicit random_state, so it does not depend on this seed.
keras.utils.set_random_seed(42)

# Boundaries of an 80-10-10 split, expressed as row positions.
num_samples = len(rice_dataset_normalized)
idx_80th = round(0.8 * num_samples)
idx_90th = round(0.1 * num_samples) + idx_80th

# Shuffle all rows once, then slice the shuffled frame by position.
data_shuffled = rice_dataset_normalized.sample(random_state=100, frac=1)

train_data, test_data, validation_data = (
    data_shuffled.iloc[start:stop]
    for start, stop in (
        (0, idx_80th),          # first 80% -> training
        (idx_80th, idx_90th),   # next 10% -> test
        (idx_90th, None),       # remainder -> validation
    )
)

train_data.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class,Class_Bool
2030,-2.076769,-2.223362,-1.821991,-2.109025,-0.274986,-2.093166,0.977027,Osmancik,0
2225,-0.58286,-0.308402,-0.513622,-0.425379,-0.111876,-0.522516,-0.891168,Osmancik,0
1053,0.694583,1.189419,1.363689,-0.44321,1.546616,0.783075,-1.736411,Cammeo,1
3685,-0.188602,-0.087737,-0.294041,0.066788,-0.255445,-0.1556,-0.774437,Osmancik,0
2976,-0.657324,-0.949044,-1.175949,0.379025,-1.744676,-0.677837,0.623452,Osmancik,0


In [7]:
# Extract the features and the labels from the train, test, and validation data

# Both label columns are removed from the feature frames; only the integer
# 'Class_Bool' column is used as the training target.
label_cols = ['Class', 'Class_Bool']

train_features, validation_features, test_features = (
    split.drop(columns=label_cols)
    for split in (train_data, validation_data, test_data)
)

train_labels, validation_labels, test_labels = (
    split['Class_Bool'].to_numpy()
    for split in (train_data, validation_data, test_data)
)


In [8]:
# Define data classes for the experiment and experiment settings

import keras.callbacks
import keras.layers


@dataclass
class ExperimentSettings:
    """Hyperparameters and list of input features used to train the model.

    Attributes:
        learning_rate: Learning rate handed to the Adam optimizer in
            `create_model`.
        num_epochs: Number of epochs passed to `model.fit` in `train_model`.
        batch_size: Batch size passed to `model.fit` in `train_model`.
        classification_threshold: Decision threshold. This class only stores
            it; the notebook passes it to the accuracy, precision and recall
            metrics when it builds the metric list.
        input_features: Column names used as model inputs. `create_model`
            creates one named `keras.Input` per entry and `train_model`
            selects the same columns from the datasets.
    """

    # Optimizer step size.
    learning_rate: float
    # Number of training epochs.
    num_epochs: int
    # Number of samples per batch during fitting.
    batch_size: int
    # Threshold applied to the predicted probability by thresholded metrics.
    classification_threshold: float
    # Names of the dataset columns fed to the model.
    input_features: list[str]

@dataclass
class Experiment:
    """Stores the settings used for training and the resulting model.

    Attributes:
        name: Label identifying this training run.
        settings: The `ExperimentSettings` the model was built and trained with.
        model: The Keras model that was trained.
        epochs: Epoch indices recorded during training. `train_model` stores
            `history.epoch` here, which Keras provides as a list.
        metrics_history: Table with one row per epoch and one column per
            logged metric. `train_model` builds it with
            `pd.DataFrame(history.history)`.
    """

    name: str
    settings: ExperimentSettings
    model: keras.Model
    # Annotated as a list because `History.epoch` is a list, not an ndarray.
    epochs: list[int]
    # Annotated as a DataFrame (not `keras.callbacks.History`): the method
    # below relies on `.columns` and `.iloc`, which a History object lacks.
    metrics_history: pd.DataFrame

    def get_final_metric_value(self, metric_name: str) -> float:
        """Gets the final value of the given metric for the experiment.

        Args:
            metric_name: Name of a column in `metrics_history`.

        Returns:
            The value recorded for `metric_name` in the last row, as a plain
            Python float.

        Raises:
            ValueError: If `metric_name` is not a column of `metrics_history`.
        """
        if metric_name not in self.metrics_history.columns:
            raise ValueError(
                f'Unknown metric {metric_name}: available metrics are'
                f' {list(self.metrics_history.columns)}'
            )
        # `.iloc[-1]` yields a NumPy scalar; convert it so the declared
        # `float` return type actually holds.
        return float(self.metrics_history[metric_name].iloc[-1])
    
def create_model(
        settings: ExperimentSettings,
        metrics: list[keras.metrics.Metric],
) -> keras.Model:
    """Build and compile a binary classifier with two hidden layers.

    Args:
        settings: Supplies the input feature names and the learning rate.
        metrics: Keras metrics to track while training and evaluating.

    Returns:
        A compiled `keras.Model` with one scalar input per feature name and a
        single sigmoid output.
    """
    # One named, single-value input per feature, so data can later be fed as
    # a dict keyed by feature name.
    feature_inputs = []
    for feature_name in settings.input_features:
        feature_inputs.append(keras.Input(name=feature_name, shape=(1,)))

    # Merge the separate inputs into one tensor, then stack the hidden layers.
    tensor = keras.layers.Concatenate()(feature_inputs)
    for units in (10, 10):
        tensor = keras.layers.Dense(units, activation='relu')(tensor)
    probability = keras.layers.Dense(1, activation='sigmoid')(tensor)

    classifier = keras.Model(inputs=feature_inputs, outputs=probability)
    classifier.compile(
        optimizer=keras.optimizers.Adam(learning_rate=settings.learning_rate),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics,
    )
    return classifier

def train_model(
        experiment_name: str,
        model: keras.Model,
        train_dataset: pd.DataFrame,
        train_labels: np.ndarray,
        validation_dataset: pd.DataFrame,
        validation_labels: np.ndarray,
        settings: ExperimentSettings
) -> Experiment:
    """Fit `model` and package the run as an `Experiment`.

    Args:
        experiment_name: Label stored on the returned experiment.
        model: Compiled Keras model to train.
        train_dataset: Frame containing at least the columns named in
            `settings.input_features`.
        train_labels: Target values for the training rows.
        validation_dataset: Frame with the same feature columns, evaluated
            after each epoch.
        validation_labels: Target values for the validation rows.
        settings: Supplies the feature names, batch size and epoch count.

    Returns:
        An `Experiment` holding the settings, the model, the recorded epoch
        indices and the per-epoch metrics as a DataFrame.
    """

    def as_feature_dict(dataset: pd.DataFrame) -> dict:
        # `fit` accepts a dict of arrays as `x`, one array per named input.
        return {
            name: np.array(dataset[name])
            for name in settings.input_features
        }

    fit_result = model.fit(
        x=as_feature_dict(train_dataset),
        y=train_labels,
        batch_size=settings.batch_size,
        epochs=settings.num_epochs,
        validation_data=(
            as_feature_dict(validation_dataset),
            validation_labels,
        ),
    )

    # `history` maps each metric name to its per-epoch values; tabulate it.
    metrics_frame = pd.DataFrame(fit_result.history)

    return Experiment(
        name=experiment_name,
        settings=settings,
        model=model,
        epochs=fit_result.epoch,
        metrics_history=metrics_frame,
    )


In [9]:
# Build and train the model

# Columns fed to the network (a subset of the normalized numeric features).
input_features = ['Eccentricity', 'Major_Axis_Length', 'Area']

settings = ExperimentSettings(
    input_features=input_features,
    classification_threshold=0.35,
    batch_size=100,
    num_epochs=60,
    learning_rate=0.001,
)

# Accuracy, precision and recall all use the configured decision threshold;
# AUC sweeps its own set of thresholds instead.
metrics = [
    keras.metrics.BinaryAccuracy(
        threshold=settings.classification_threshold, name='accuracy'
    ),
    keras.metrics.Precision(
        thresholds=settings.classification_threshold, name='precision'
    ),
    keras.metrics.Recall(
        thresholds=settings.classification_threshold, name='recall'
    ),
    keras.metrics.AUC(name='auc', num_thresholds=100),
]

# Build the model.
model = create_model(settings=settings, metrics=metrics)

# Train on the training split, evaluating on the validation split each epoch.
experiment = train_model(
    experiment_name='baseline',
    model=model,
    train_dataset=train_features,
    train_labels=train_labels,
    validation_dataset=validation_features,
    validation_labels=validation_labels,
    settings=settings,
)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [10]:
# Save the model and the test data
# NOTE(review): the '.h5' suffix selects the HDF5 format, which recent Keras
# releases treat as legacy in favour of '.keras'. Confirm what downstream
# loaders expect before changing the path.
model.save('rice_classifier.h5')

# Only the columns the model was trained on are written; the values are the
# already-normalized features, and no row index is stored.
# NOTE(review): feature_mean / feature_std are not saved in this cell, so new
# raw data cannot be normalized the same way from these files alone.
test_features[input_features].to_csv('test_features.csv', index=False)
# Matching labels for the saved test rows, in the same row order.
np.save('test_labels.npy', test_labels)