# B. Prepare data (build data preparation pipeline and training pipeline)

### Install packages / import namespaces

In [1]:
#r "nuget:Microsoft.ML"
using XPlot.Plotly;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

### Load data models from Models.cs file 

In [2]:
// Loads Models.cs into the notebook session; presumably it declares the row/model classes
// (ModelInput, CustomInputRow, CustomOutputRow, CustomMappings) referenced by later cells.
// NOTE(review): this is an absolute, user-specific path, so the cell only runs on a machine
// with this exact folder layout — confirm whether a path relative to the notebook would work.
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Models.cs"

### Initialize the ML context (we need it for building the data and training pipelines)  

In [3]:
// Shared ML.NET context used by every later cell; created with a fixed seed of 0.
var mlContext = new MLContext(seed: 0);

### Load data from csv file into a dataview

In [4]:
// Location of the sensor readings, relative to the notebook's working directory.
const string DATASET_PATH = "./sensors_data.csv";

// Load the comma-separated file (its first line is a header) into a data view
// whose columns are described by the ModelInput class.
var data = mlContext.Data.LoadFromTextFile<ModelInput>(DATASET_PATH, separatorChar: ',', hasHeader: true);

### Shuffle the data and split it into _trainingData_ and _testingData_, holding out a fraction of 0.2 for testing

In [5]:
// Shuffle the rows with a fixed seed, then hold out 20% of them as the test set.
IDataView shuffledData = mlContext.Data.ShuffleRows(input: data, seed: 0);
DataOperationsCatalog.TrainTestData split = mlContext.Data.TrainTestSplit(data: shuffledData, testFraction: 0.2);

// Expose the two partitions under the names the later cells use.
IDataView trainingData = split.TrainSet;
IDataView testingData = split.TestSet;


### Convert data to collection

In [6]:
// Materialize the training rows as ModelInput objects for exploration and plotting.
//
// reuseRowObject is false on purpose: with true, ML.NET returns one shared instance that is
// overwritten on every step of the enumeration, so any consumer that buffers rows
// (ToList, OrderBy, a display that collects before rendering) would end up holding
// repeated references to the same row.
// ToList() evaluates the data view once here instead of re-running the
// load/shuffle/split on every later enumeration of `features`.
//
// TODO: rename `features` to something more appropriate — each element is a whole row,
// including the Source label.
var features = mlContext.Data
    .CreateEnumerable<ModelInput>(trainingData, reuseRowObject: false)
    .ToList();

// Preview the first ten rows.
display(features.Take(10));

index,Temperature,Luminosity,Infrared,Distance,CreatedAt,Source
0,24.82,50.49,0.0,8.84,01/03/2020 18:22:56,FlashLight
1,23.96,4.1,0.0,154.09,06/03/2020 21:31:55,Day
2,32.5,72.27,0.0,66.87,05/03/2020 11:29:22,FlashLight
3,47.32,100.0,94.34,41.55,04/03/2020 9:28:24,Lighter
4,25.09,12.11,0.0,400.0,04/03/2020 10:26:18,Day
5,23.28,45.51,0.0,12.08,02/03/2020 10:23:51,FlashLight
6,53.12,100.0,93.95,400.0,06/03/2020 21:32:21,Lighter
7,27.34,70.31,0.0,194.26,03/03/2020 19:24:48,FlashLight
8,24.77,18.55,0.0,400.0,03/03/2020 18:25:00,Day
9,25.09,12.79,0.0,88.81,04/03/2020 7:27:19,Day


### The _Source_ column is the label of each observation. Let's look at the histogram of its categories.

In [7]:
// Layout that carries the chart title.
var layout = new Layout.Layout { title = "Categories histogram" };

// One Source value per training row, fed to a histogram trace as its x values.
var sources = features.Select(row => row.Source);
var categoriesHistogram = Chart.Plot(new Graph.Histogram { x = sources });

categoriesHistogram.WithLayout(layout);
display(categoriesHistogram);

### A box plot is a rich diagram showing the count, quartiles, min, max and mean of the data

In [8]:
// Per-column value sequences (one value per training row) used by the plots below.
var temperatures = features.Select(f => f.Temperature);
var luminosities = features.Select(f => f.Luminosity);
var infrareds = features.Select(f => f.Infrared);
var distances = features.Select(f => f.Distance);

// CreatedAt is text; the sample rows ("01/03/2020 18:22:56", "04/03/2020 9:28:24") look
// day/month/year ordered — NOTE(review): confirm against the full dataset.
// DateTime.Parse without a culture follows the machine's regional settings, so on a
// month-first culture it would silently swap day and month and corrupt `days`.
// Parsing with an explicit format and the invariant culture makes the result the same on
// every machine and fails loudly on a malformed value. Each timestamp is parsed once
// and shared by both projections.
var createdAtTimestamps = features
    .Select(f => DateTime.ParseExact(
        f.CreatedAt,
        "dd/MM/yyyy H:mm:ss",
        System.Globalization.CultureInfo.InvariantCulture,
        System.Globalization.DateTimeStyles.AllowWhiteSpaces))
    .ToList();

var hours = createdAtTimestamps.Select(t => t.Hour);
var days = createdAtTimestamps.Select(t => t.DayOfYear);

In [9]:
// Layout that carries the chart title.
var layout = new Layout.Layout { title = "Segmentation box plot" };

// One box trace per feature, drawn side by side on a single chart.
var categoriesDiagram = Chart.Plot(new Graph.Box[]
{
    new Graph.Box { name = "Temperature", y = temperatures },
    new Graph.Box { name = "Luminosity", y = luminosities },
    new Graph.Box { name = "Infrared", y = infrareds },
    new Graph.Box { name = "Distance", y = distances },
    new Graph.Box { name = "Hour of Day", y = hours },
    new Graph.Box { name = "Day of Year", y = days }
});

categoriesDiagram.WithLayout(layout);
display(categoriesDiagram);

---
### The correlation matrix shows how correlated the features are

In [10]:
#r "nuget:MathNet.Numerics"

In [11]:
// Loads Helpers.cs into the notebook session; it provides Helpers.GetZAxis, used for the heatmap below.
// NOTE(review): this is an absolute, user-specific path, so the cell only runs on a machine
// with this exact folder layout — confirm whether a path relative to the notebook would work.
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Helpers.cs"

In [12]:
// Axis labels for the heatmap, in the same order as the value lists below.
var featureColumns = new[] { "Temperature", "Luminosity", "Infrared", "Distance", "Hour", "Day" };

// One list of doubles per feature. These are the raw column values; Helpers.GetZAxis
// presumably derives the pairwise correlations from them (see Helpers.cs).
var correlationMatrix = new List<List<double>>
{
    temperatures.Select(v => Convert.ToDouble(v)).ToList(),
    luminosities.Select(v => Convert.ToDouble(v)).ToList(),
    infrareds.Select(v => Convert.ToDouble(v)).ToList(),
    distances.Select(v => Convert.ToDouble(v)).ToList(),
    hours.Select(v => Convert.ToDouble(v)).ToList(),
    days.Select(v => Convert.ToDouble(v)).ToList()
};

var layout = new Layout.Layout
{
    title = "Correlation Matrix",
    margin = new Graph.Margin { l = 90 }, // wider left margin to accommodate longer labels
    autosize = "true"
};

// The y axis uses the labels in reverse order; the colour scale is pinned to [-1, 1].
var correlationMatrixHeatmap = Chart.Plot(new Graph.Heatmap
{
    x = featureColumns,
    y = featureColumns.Reverse(),
    z = Helpers.GetZAxis(correlationMatrix),
    zmin = -1,
    zmax = 1
});

correlationMatrixHeatmap.WithLayout(layout);
display(correlationMatrixHeatmap);

### Pre-processing pipeline
> Map value (string) to key (number)   
> Custom mapping  
> Concatenate features  
> Normalize features  
> Drop unused columns 

In [13]:
// Columns concatenated into the "Features" vector, in this order.
var featureColumns = new[] { "Temperature", "Luminosity", "Infrared", "Distance", "Hour", "Day" };

// Stage 1: convert the "Label" values to keys.
var labelToKey = mlContext.Transforms.Conversion.MapValueToKey("Label");

// Stage 2: custom row mapping declared in the loaded model code
// (presumably what produces the "Hour" and "Day" columns — confirm in Models.cs).
var customMapping = mlContext.Transforms.CustomMapping<CustomInputRow, CustomOutputRow>(
    CustomMappings.IncomeMapping,
    nameof(CustomMappings.IncomeMapping));

// Stage 3: gather the feature columns into a single "Features" vector.
var concatenateFeatures = mlContext.Transforms.Concatenate("Features", featureColumns);

// Stage 4: min-max normalize the "Features" vector.
var normalizeFeatures = mlContext.Transforms.NormalizeMinMax("Features");

var preprocessingPipeline = labelToKey
    .Append(customMapping)
    .Append(concatenateFeatures)
    .Append(normalizeFeatures);

### Model builder pipeline

In [14]:
// Multiclass SDCA (non-calibrated) trainer reading the "Label" and "Features" columns.
var sdcaTrainer = mlContext.MulticlassClassification.Trainers.SdcaNonCalibrated(
    labelColumnName: "Label",
    featureColumnName: "Features");

// Pre-processing followed by the trainer.
var modelPipeline = preprocessingPipeline.Append(sdcaTrainer);

### Post-processing pipeline

In [15]:
// Map the predicted key back to its original value in the "PredictedLabel" column.
var predictedLabelToValue = mlContext.Transforms.Conversion.MapKeyToValue(outputColumnName: "PredictedLabel");

var postprocessingPipeline = modelPipeline.Append(predictedLabelToValue);

### Train the model

In [16]:
// Fit the whole chain (pre-processing, SDCA trainer, key-to-value mapping) on the training split.
var model = postprocessingPipeline.Fit(trainingData);

### Plot segmentation for normalized data

In [17]:
// Fit only the pre-processing stages and run the training rows through them,
// then pull the "Features" vectors out as plain arrays.
var fittedPreprocessing = preprocessingPipeline.Fit(trainingData);
var normalizedData = fittedPreprocessing.Transform(trainingData);
var normalizedFeatures = normalizedData.GetColumn<float[]>("Features").ToArray();

// Slot i of each vector corresponds to featureColumns[i] (the concatenation order).
Func<int, IEnumerable<float>> slotValues = slot => normalizedFeatures.Select(vector => vector[slot]);

var normalizedTemperatures = slotValues(0);
var normalizedLuminosities = slotValues(1);
var normalizedInfrareds = slotValues(2);
var normalizedDistances = slotValues(3);
var normalizedHours = slotValues(4);
var normalizedDays = slotValues(5);

// Despite the variable name, this chart is a set of box traces, one per normalized feature.
var histogramNormalizedFeatures = Chart.Plot(new Graph.Box[]
{
    new Graph.Box { name = "Temperature", y = normalizedTemperatures },
    new Graph.Box { name = "Luminosity", y = normalizedLuminosities },
    new Graph.Box { name = "Infrared", y = normalizedInfrareds },
    new Graph.Box { name = "Distance", y = normalizedDistances },
    new Graph.Box { name = "Hour of Day", y = normalizedHours },
    new Graph.Box { name = "Day of Year", y = normalizedDays }
});

display(histogramNormalizedFeatures);

### Cross validate

In [18]:
#r "nuget:Microsoft.Data.Analysis"
using Microsoft.AspNetCore.Html;
using Microsoft.Data.Analysis;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using System.Collections.Generic;
using static Microsoft.ML.TrainCatalogBase;

### Load the confusion matrix formatter

In [19]:
// NOTE(review): absolute, user-specific path — this cell only runs on a machine with this folder layout.
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Formatters.cs"

// Category names handed to the formatters defined in Formatters.cs.
string[] categories = { "FlashLight", "Infrared", "Day", "Lighter" };
Formatters.Load(categories);

In [20]:
// 5-fold cross-validation of the complete pipeline over the training split.
var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(
    data: trainingData,
    estimator: postprocessingPipeline,
    numberOfFolds: 5,
    labelColumnName: "Label");

// Trailing expression: the per-fold results become the cell output.
crossValidationResults.ToList()

CROSS-VALIDATION: multi-class classification,Average,Standard deviation,Confidence interval (95%)
MacroAccuracy,0.961,0.028,0.027
MicroAccuracy,0.966,0.022,0.022
LogLoss,8.244,10.131,9.929
LogLossReduction,-5.248,7.671,7.517


### Evaluate the model against the testing data (measure the model performance)

In [21]:
// Score the held-out test split with the trained model.
var predictions = model.Transform(testingData);

// Compute multiclass metrics from the "Label", "Score" and "PredictedLabel" columns.
var metrics = mlContext.MulticlassClassification.Evaluate(
    data: predictions,
    labelColumnName: "Label",
    scoreColumnName: "Score",
    predictedLabelColumnName: "PredictedLabel");

In [22]:
// Trailing expression: the notebook renders the evaluation metrics as the cell output.
metrics

EVALUATION: multi-class classification,Class,Value,Note
MacroAccuracy,,0.968,"the closer to 1, the better"
MicroAccuracy,,0.96,"the closer to 1, the better"
LogLoss,,2.393,"the closer to 0, the better"
LogLoss per Class,,,
LogLoss per Class,FlashLight,5.099,"the closer to 0, the better"
LogLoss per Class,Infrared,0.07,"the closer to 0, the better"
LogLoss per Class,Day,0.0,"the closer to 0, the better"
LogLoss per Class,Lighter,2.616,"the closer to 0, the better"


### Save Trained Model

In [23]:
// Persist the trained model together with the training data's input schema
// to "model.zip" in the current working directory.
mlContext.Model.Save(
    model: model,
    inputSchema: trainingData.Schema,
    filePath: "model.zip");