# PREPARE DATA
(build data preparation pipeline and training pipeline)

In [1]:
#r "nuget:Microsoft.ML,1.5.2"
using XPlot.Plotly;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

// Single ML.NET context shared by every cell below; created with a fixed seed (123).
var mlContext = new MLContext(seed: 123);

Installed package Microsoft.ML version 1.5.2

### Load data models from Models.csx file 

In [1]:
// Loads the script that presumably declares the data model types used below
// (ModelInput, CustomInputRow, CustomOutputRow, CustomMappings) — verify against Models.csx.
// NOTE(review): this is an absolute, machine-specific path; consider a path relative to the
// notebook folder so the notebook runs on other machines — confirm the working directory first.
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Models.csx"

### Load data from csv file into a dataview

In [1]:
// CSV file holding the sensor readings (relative path).
const string DATASET_PATH = "./sensors_data.csv";

// Read the comma-separated file, whose first row is a header, into a data view
// whose columns are described by ModelInput.
var data = mlContext.Data.LoadFromTextFile<ModelInput>(
    DATASET_PATH,
    separatorChar: ',',
    hasHeader: true);

### Shuffle the data and split it into _trainingData_ and _testingData_ (test fraction 0.2)

In [1]:
// Shuffle the rows with a fixed seed, then hold out a 0.2 fraction of them for testing.
IDataView shuffledData = mlContext.Data.ShuffleRows(data, seed: 1);
DataOperationsCatalog.TrainTestData split =
    mlContext.Data.TrainTestSplit(shuffledData, testFraction: 0.2);

IDataView trainingData = split.TrainSet;
IDataView testingData = split.TestSet;

### Selected features for building the model 

In [1]:
// Input columns that the pre-processing pipeline concatenates into "Features", in this order.
string[] featureColumns =
{
    "Temperature",
    "Luminosity",
    "Infrared",
    "Distance",
    "Hour",
    "Day"
};

### Build the pre-processing pipeline
> Map value (string) to key (number)  
> Custom mapping (extract HourOfDay and DayOfYear from the CreatedAt feature)  
> Concatenate features  
> Normalize features  

In [1]:
// Step 1: map the "Label" values to keys.
var labelKeyMapper = mlContext.Transforms.Conversion.MapValueToKey("Label");

// Step 2: custom row mapping from CustomInputRow to CustomOutputRow; the mapping's own
// name is used as its contract name.
var customRowMapper = mlContext.Transforms.CustomMapping<CustomInputRow, CustomOutputRow>(
    CustomMappings.IncomeMapping,
    nameof(CustomMappings.IncomeMapping));

// Step 3: gather the selected feature columns into a single "Features" column.
var featureConcatenator = mlContext.Transforms.Concatenate("Features", featureColumns);

// Step 4: min-max normalize the "Features" column.
var featureNormalizer = mlContext.Transforms.NormalizeMinMax("Features");

// Chain the four steps in the order listed above.
var preprocessingPipeline = labelKeyMapper
    .Append(customRowMapper)
    .Append(featureConcatenator)
    .Append(featureNormalizer);

### Box plot segmentation (for normalized data!)

In [1]:
// Fit the pre-processing steps on the training set and apply them to that same set.
var fittedPreprocessing = preprocessingPipeline.Fit(trainingData);
var normalizedData = fittedPreprocessing.Transform(trainingData);

// Materialize the "Features" vectors once; the per-feature sequences below read from this array.
var normalizedFeatures = normalizedData.GetColumn<float[]>("Features").ToArray();

// One sequence per feature, picked by its position in the feature vector.
var normalizedTemperatures = from vector in normalizedFeatures select vector[0];
var normalizedLuminosities = from vector in normalizedFeatures select vector[1];
var normalizedInfrareds = from vector in normalizedFeatures select vector[2];
var normalizedDistances = from vector in normalizedFeatures select vector[3];
var normalizedHours = from vector in normalizedFeatures select vector[4];
var normalizedDays = from vector in normalizedFeatures select vector[5];

In [1]:
// One box trace per normalized feature.
var boxTraces = new[]
{
    new Graph.Box { name = "Temperature", y = normalizedTemperatures },
    new Graph.Box { name = "Luminosity", y = normalizedLuminosities },
    new Graph.Box { name = "Infrared", y = normalizedInfrareds },
    new Graph.Box { name = "Distance", y = normalizedDistances },
    new Graph.Box { name = "Hour of Day", y = normalizedHours },
    new Graph.Box { name = "Day of Year", y = normalizedDays }
};
var segmentationNormalizedFeatures = Chart.Plot(boxTraces);

// Attach the chart title, then render the chart in the notebook.
var layout = new Layout.Layout { title = "Box plot segmentation" };
segmentationNormalizedFeatures.WithLayout(layout);

display(segmentationNormalizedFeatures);

### Build the training pipeline

In [1]:
// Multiclass SDCA (non-calibrated) trainer reading "Label" and "Features".
// NOTE(review): with a non-calibrated trainer the LogLoss figures reported by the
// evaluation cells may not be meaningful — confirm before relying on them.
var sdcaTrainer = mlContext.MulticlassClassification.Trainers.SdcaNonCalibrated(
    labelColumnName: "Label",
    featureColumnName: "Features");

// Pre-processing followed by the trainer.
var trainingPipeline = preprocessingPipeline.Append(sdcaTrainer);

### Build the post-processing pipeline
> Map key (number) to value (string)  

In [1]:
// Convert the "PredictedLabel" key back to its original value.
var predictedLabelDecoder =
    mlContext.Transforms.Conversion.MapKeyToValue(outputColumnName: "PredictedLabel");

// Training pipeline followed by the key-to-value step.
var postprocessingPipeline = trainingPipeline.Append(predictedLabelDecoder);

## 1. Validate Model

In [1]:
#r "nuget:Microsoft.Data.Analysis"
using Microsoft.AspNetCore.Html;
using Microsoft.Data.Analysis;

Installed package Microsoft.Data.Analysis version 0.4.0

### Load the confusion matrix formatter (from a .csx script, a library or a NuGet package; see Formatters.csx)

In [1]:
// Alternative (disabled): take the formatters from the NuGet package instead of the script.
//#r "nuget:ApexCode.Interactive.Formatting,0.0.1-beta.5"
//using ApexCode.Interactive.Formatting;

// NOTE(review): absolute, machine-specific path; consider a path relative to the notebook
// folder — confirm the notebook's working directory first.
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Formatters.csx"

// Class names handed to the formatters.
Formatters.Categories = new[] { "FlashLight", "Infrared", "Day", "Lighter" };

// Register a formatter for each type displayed by the validation and evaluation cells.
Formatters.Register<DataFrame>();
Formatters.Register<List<TrainCatalogBase.CrossValidationResult<MulticlassClassificationMetrics>>>();
Formatters.Register<MulticlassClassificationMetrics>();

In [1]:
// 5-fold cross-validation of the full pipeline on the training set.
var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(
    data: trainingData,
    estimator: postprocessingPipeline,
    numberOfFolds: 5,
    labelColumnName: "Label");

// Displayed as a List so the formatter registered for that list type applies.
display(crossValidationResults.ToList())

CROSS-VALIDATION: multi-class classification,Average,Standard deviation,Confidence interval (95%)
MacroAccuracy,0.965,0.017,0.017
MicroAccuracy,0.964,0.012,0.012
LogLoss,2.484,2.863,2.805
LogLossReduction,-0.878,2.163,2.12


### Permutation Feature Importance (PFI)

In [1]:
// Fit the training pipeline (the one without the key-to-value step) on the training set.
var modelForContributions = trainingPipeline.Fit(input: trainingData);

// The last transformer of the fitted chain is the predictor handed to PFI.
var linearPredictor = modelForContributions.LastTransformer;

// PFI must be computed on held-out data, never on the training data.
var transformedData = modelForContributions.Transform(input: testingData);

In [1]:
// Permutation feature importance of the predictor on the transformed test data, 5 permutations.
var pfi = mlContext.MulticlassClassification.PermutationFeatureImportance(
    predictionTransformer: linearPredictor,
    data: transformedData,
    permutationCount: 5);

// Positions within pfi, ordered from the largest to the smallest absolute mean
// micro-accuracy value.
var sortedMetrics = pfi
    .Select((stats, position) => (position, change: stats.MicroAccuracy.Mean))
    .OrderByDescending(entry => Math.Abs(entry.change))
    .Select(entry => entry.position);

In [1]:
// Horizontal bar chart of permutation feature importance.
//
// Fix: the bars used to be sized by the *positions* of the ranked features and labelled with
// the reversed featureColumns list, so neither the bar lengths nor the labels reflected the
// computed importances. Each feature name is now paired with the magnitude of its mean
// micro-accuracy change taken from pfi.
//
// Assumes every entry of featureColumns contributes exactly one slot to "Features", so that
// pfi[i] belongs to featureColumns[i] — TODO confirm against the custom mapping output types.
var rankedFeatures = pfi
    .Select((featureMetrics, index) => new
    {
        Name = featureColumns[index],
        Importance = Math.Abs(featureMetrics.MicroAccuracy.Mean)
    })
    // Ascending order: presumably Plotly lists categories bottom-up on a horizontal bar
    // chart, which would put the most important feature at the top — verify in the output.
    .OrderBy(feature => feature.Importance)
    .ToArray();

var pfiDiagram = Chart.Plot(new Graph.Bar
    {
        x = rankedFeatures.Select(feature => feature.Importance).ToArray(),
        y = rankedFeatures.Select(feature => feature.Name).ToArray(),
        orientation = "h"
    });
var layout = new Layout.Layout()
{
    title = "Permutation Feature Importance (PFI)"
};
pfiDiagram.WithLayout(layout);
display(pfiDiagram);

## 2. Evaluate Model

### Train the model

In [1]:
// Fit the complete pipeline (pre-processing, trainer, key-to-value step) on the training set.
var model = postprocessingPipeline.Fit(input: trainingData);

### Evaluate the model

In [1]:
// Score the held-out test set with the fitted model.
var predictions = model.Transform(input: testingData);

// Compute the multiclass metrics from the label, score and predicted-label columns.
var metrics = mlContext.MulticlassClassification.Evaluate(
    data: predictions,
    labelColumnName: "Label",
    scoreColumnName: "Score",
    predictedLabelColumnName: "PredictedLabel");

In [1]:
// Register the metrics formatter (also registered in the formatter-loading cell above).
Formatters.Register<MulticlassClassificationMetrics>();
// Trailing expression without a semicolon: the notebook renders its value as the cell output.
metrics

EVALUATION: multi-class classification,Class,Value,Note
MacroAccuracy,,0.974,"the closer to 1, the better"
MicroAccuracy,,0.982,"the closer to 1, the better"
LogLoss,,24.022,"the closer to 0, the better"
LogLoss per Class,,,
LogLoss per Class,FlashLight,30.761,"the closer to 0, the better"
LogLoss per Class,Infrared,17.907,"the closer to 0, the better"
LogLoss per Class,Day,32.94,"the closer to 0, the better"
LogLoss per Class,Lighter,0.0,"the closer to 0, the better"


### Confusion matrix

In [1]:
// Register a formatter for ConfusionMatrix, then render the matrix from the evaluation metrics.
Formatters.Register<ConfusionMatrix>();
display(metrics.ConfusionMatrix);


0,1,2,3,4,5,6
Confusion Matrix,Confusion Matrix,Predicted,Predicted,Predicted,Predicted,
Confusion Matrix,Confusion Matrix,FlashLight,Infrared,Day,Lighter,Recall
Truth,FlashLight,45,0,0,0,1
Truth,Infrared,3,26,0,0,0.8966
Truth,Day,0,0,64,0,1
Truth,Lighter,0,0,0,29,1
Precision,Precision,0.9375,1,1,1,total = 167


### Save the trained model

In [1]:
// Persist the fitted model to "model.zip", passing the training data's schema as input schema.
mlContext.Model.Save(
    model: model,
    inputSchema: trainingData.Schema,
    filePath: "model.zip");