# B. Prepare data (build data preparation pipeline and training pipeline)

### Install packages / import namespaces

In [1]:
#r "nuget:Microsoft.ML"
using XPlot.Plotly;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

### Load data models from Models.cs file 

In [None]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Models.cs"

### Initialize the ML context (we need it for building the data and training pipelines)  

In [None]:
// A fixed seed keeps the randomized steps driven by this context repeatable between runs.
var mlContext = new MLContext(seed: 0);

### Load data from csv file into a dataview

In [None]:
// Location of the CSV dataset, relative to the notebook's working directory.
const string DATASET_PATH = "./sensors_data.csv";

// The file is comma-separated with a header row; each row is bound to ModelInput.
var data = mlContext.Data.LoadFromTextFile<ModelInput>(DATASET_PATH, separatorChar: ',', hasHeader: true);

### Shuffle the data and split it into _trainingData_ and _testingData_, holding out a test fraction of 0.2

In [None]:
// Shuffle with a fixed seed, then hold out 20% of the rows as the test set.
IDataView shuffledData = mlContext.Data.ShuffleRows(input: data, seed: 0);
DataOperationsCatalog.TrainTestData split = mlContext.Data.TrainTestSplit(data: shuffledData, testFraction: 0.2);

IDataView trainingData = split.TrainSet;
IDataView testingData = split.TestSet;


### Convert data to collection

In [None]:
// Materialize the training rows once so the cells below can inspect and plot them
// repeatedly without re-reading the data view on every enumeration.
//
// reuseRowObject must be false: with true, the same ModelInput instance is handed back
// (and overwritten) for every row, so any consumer that buffers rows before reading
// them would see one row repeated.
//
// NOTE(review): "features" actually holds whole rows; the name is kept because the
// following cells reference it.
var features = mlContext.Data
    .CreateEnumerable<ModelInput>(trainingData, reuseRowObject: false)
    .ToList();

display(features.Take(10));

### The _Source_ feature is the label of each observation. Let's look at the histogram of the categories.

In [None]:
// Label value (Source) of every training row, used as the histogram's x data.
var sources = features.Select(row => row.Source);

// Histogram with one bar per distinct label value.
var categoriesHistogram = Chart.Plot(new Graph.Histogram { x = sources });

var layout = new Layout.Layout { title = "Categories histogram" };
categoriesHistogram.WithLayout(layout);

display(categoriesHistogram);

### A box plot is a rich diagram showing the quartiles, median, min and max of the data

In [None]:
// One lazily evaluated sequence per sensor reading.
var temperatures = features.Select(row => row.Temperature);
var luminosities = features.Select(row => row.Luminosity);
var infrareds = features.Select(row => row.Infrared);
var distances = features.Select(row => row.Distance);

// Time-based series derived from the CreatedAt value of each row.
// NOTE(review): DateTime.Parse uses the current culture; confirm the CSV date format parses the same on every machine.
var hours = features
    .Select(row => DateTime.Parse(row.CreatedAt))
    .Select(timestamp => timestamp.Hour);
var days = features
    .Select(row => DateTime.Parse(row.CreatedAt))
    .Select(timestamp => timestamp.DayOfYear);

In [None]:
// One box trace per feature, so the raw value ranges can be compared side by side.
var categoriesDiagram = Chart.Plot(new[]
{
    new Graph.Box { name = "Temperature", y = temperatures },
    new Graph.Box { name = "Luminosity", y = luminosities },
    new Graph.Box { name = "Infrared", y = infrareds },
    new Graph.Box { name = "Distance", y = distances },
    new Graph.Box { name = "Hour of Day", y = hours },
    new Graph.Box { name = "Day of Year", y = days }
});

var layout = new Layout.Layout { title = "Segmentation box plot" };
categoriesDiagram.WithLayout(layout);

display(categoriesDiagram);

---
### The correlation matrix shows how correlated the features are

In [None]:
#r "nuget:MathNet.Numerics"

In [None]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Helpers.cs"

In [None]:
// Axis labels, listed in the same order as the series added to correlationMatrix below.
string[] featureColumns = { "Temperature", "Luminosity", "Infrared", "Distance", "Hour", "Day" };

// One list of doubles per feature, in featureColumns order.
// NOTE(review): despite its name this holds the raw series; Helpers.GetZAxis presumably
// computes the pairwise correlations from it (confirm in Helpers.cs).
var correlationMatrix = new List<List<double>>
{
    temperatures.Select(Convert.ToDouble).ToList(),
    luminosities.Select(Convert.ToDouble).ToList(),
    infrareds.Select(Convert.ToDouble).ToList(),
    distances.Select(Convert.ToDouble).ToList(),
    hours.Select(Convert.ToDouble).ToList(),
    days.Select(Convert.ToDouble).ToList()
};

// The y axis lists the features in reverse order; z is clamped to the [-1, 1] range.
var heatmapTrace = new Graph.Heatmap
{
    x = featureColumns,
    y = Enumerable.Reverse(featureColumns),
    z = Helpers.GetZAxis(correlationMatrix),
    zmin = -1,
    zmax = 1
};
var correlationMatrixHeatmap = Chart.Plot(heatmapTrace);

var layout = new Layout.Layout
{
    title = "Correlation Matrix",
    autosize = "true",
    margin = new Graph.Margin { l = 90 } // widen the left margin to accommodate longer labels
};
correlationMatrixHeatmap.WithLayout(layout);

display(correlationMatrixHeatmap);

### Pre-processing pipeline
> Map value (string) to key (number)   
> Custom mapping  
> Concatenate features  
> Normalize features  
> Drop unused columns 

In [None]:
// Columns that are concatenated into the single "Features" vector.
string[] featureColumns = { "Temperature", "Luminosity", "Infrared", "Distance", "Hour", "Day" };

// Stage 1: map the "Label" values to keys.
var labelToKey = mlContext.Transforms.Conversion.MapValueToKey("Label");

// Stage 2: custom row mapping defined in the loaded helper files.
// NOTE(review): presumably this produces the "Hour" and "Day" columns used below; confirm in CustomMappings.
var customMapping = mlContext.Transforms.CustomMapping<CustomInputRow, CustomOutputRow>(
    CustomMappings.IncomeMapping,
    nameof(CustomMappings.IncomeMapping));

// Stage 3: gather the feature columns into one vector column.
var concatenateFeatures = mlContext.Transforms.Concatenate("Features", featureColumns);

// Stage 4: min-max normalize the feature vector.
var normalizeFeatures = mlContext.Transforms.NormalizeMinMax("Features");

var preprocessingPipeline = labelToKey
    .Append(customMapping)
    .Append(concatenateFeatures)
    .Append(normalizeFeatures);

### Model builder pipeline

In [None]:
// Multiclass SDCA (non-calibrated) trainer, reading the "Label" and "Features" columns.
var sdcaTrainer = mlContext.MulticlassClassification.Trainers.SdcaNonCalibrated(
    labelColumnName: "Label",
    featureColumnName: "Features");

var modelPipeline = preprocessingPipeline.Append(sdcaTrainer);

### Post-processing pipeline

In [None]:
// Convert the predicted key back to its original label value.
var predictedKeyToValue = mlContext.Transforms.Conversion.MapKeyToValue(outputColumnName: "PredictedLabel");

var postprocessingPipeline = modelPipeline.Append(predictedKeyToValue);

### Train the model

In [None]:
// Fit every stage (pre-processing, trainer, post-processing) on the training split.
var model = postprocessingPipeline.Fit(input: trainingData);

### Plot segmentation for normalized data

In [None]:
// Fit only the pre-processing stages and apply them to the training rows.
var fittedPreprocessing = preprocessingPipeline.Fit(trainingData);
var normalizedData = fittedPreprocessing.Transform(trainingData);

// Each element is one row's normalized feature vector, ordered as in featureColumns.
var normalizedFeatures = normalizedData.GetColumn<float[]>("Features").ToArray();

// Lazily picks the value at a fixed position out of every feature vector.
Func<int, IEnumerable<float>> featureAt = index => normalizedFeatures.Select(vector => vector[index]);

var normalizedTemperatures = featureAt(0);
var normalizedLuminosities = featureAt(1);
var normalizedInfrareds = featureAt(2);
var normalizedDistances = featureAt(3);
var normalizedHours = featureAt(4);
var normalizedDays = featureAt(5);

// Same box plot as for the raw data, now over the normalized values.
var histogramNormalizedFeatures = Chart.Plot(new[]
{
    new Graph.Box { name = "Temperature", y = normalizedTemperatures },
    new Graph.Box { name = "Luminosity", y = normalizedLuminosities },
    new Graph.Box { name = "Infrared", y = normalizedInfrareds },
    new Graph.Box { name = "Distance", y = normalizedDistances },
    new Graph.Box { name = "Hour of Day", y = normalizedHours },
    new Graph.Box { name = "Day of Year", y = normalizedDays }
});

display(histogramNormalizedFeatures);

### Cross validate

In [None]:
#r "nuget:Microsoft.Data.Analysis"
using Microsoft.AspNetCore.Html;
using Microsoft.Data.Analysis;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using System.Collections.Generic;
using static Microsoft.ML.TrainCatalogBase;

### Load the confusion matrix formatter

In [None]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Formatters.cs"
// Category names handed to the formatter loaded above.
string[] categories = { "FlashLight", "Infrared", "Day", "Lighter" };

Formatters.Load(categories);

In [None]:
// 5-fold cross-validation of the full pipeline over the training split.
var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(
    data: trainingData,
    estimator: postprocessingPipeline,
    numberOfFolds: 5,
    labelColumnName: "Label");

// Trailing expression (no semicolon) so the notebook shows the per-fold results.
crossValidationResults.ToList()

### Evaluate the model against the testing data (measure the model performance)

In [None]:
// Score the held-out test rows with the trained model.
var predictions = model.Transform(testingData);

// Compute the multiclass metrics from the label, score and predicted-label columns.
var metrics = mlContext.MulticlassClassification.Evaluate(
    data: predictions,
    labelColumnName: "Label",
    scoreColumnName: "Score",
    predictedLabelColumnName: "PredictedLabel");

In [None]:
// Trailing expression (no semicolon): the notebook renders the evaluation metrics as the cell output.
metrics

### Save Trained Model

In [None]:
// Persist the fitted transformer chain together with the schema of the data it was trained on.
mlContext.Model.Save(
    model: model,
    inputSchema: trainingData.Schema,
    filePath: "model.zip");