# PREPARE DATA
### (build data preparation pipeline and training pipeline)

In [1]:
#r "nuget:Microsoft.ML,1.5.1"
using XPlot.Plotly;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

Installed package Microsoft.ML version 1.5.1

#### Load data models from Models.csx file 

In [2]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Models.csx"

#### Initialize the ML context (we need it for building the data and training pipelines)  

In [3]:
MLContext mlContext = new MLContext(seed: 1);

#### Load data from csv file into a dataview

In [4]:
const string DATASET_PATH = "./sensors_data.csv";
IDataView data = mlContext.Data.LoadFromTextFile<ModelInput>(
    path: DATASET_PATH,
    hasHeader: true,
    separatorChar: ',');

#### Shuffle and split data to _trainingData_ and _testingData_ by a fraction of 0.2

In [5]:
var shuffledData = mlContext.Data.ShuffleRows(data, seed: 1);
var split = mlContext.Data.TrainTestSplit(shuffledData, testFraction: 0.2);
var trainingData = split.TrainSet;
var testingData = split.TestSet;

#### Selected features for building the model 

In [6]:
var featureColumns = new[] { "Temperature", "Luminosity", "Infrared", "Distance", "Hour", "Day" };

#### Build the pre-processing pipeline
> Map value (string) to key (number)  
> Custom mapping (extract HourOfDay and DayofYear from CreatedAt feature)  
> Concatenate features  
> Normalize features  

In [7]:
var preprocessingPipeline = mlContext.Transforms.Conversion.MapValueToKey("Label")
    .Append(mlContext.Transforms.CustomMapping<CustomInputRow, CustomOutputRow>
        (CustomMappings.IncomeMapping, nameof(CustomMappings.IncomeMapping)))
    .Append(mlContext.Transforms.Concatenate("Features", featureColumns))
    .Append(mlContext.Transforms.NormalizeMinMax("Features"));





#### Box plot segmentation (for normalized data!)

In [8]:
var normalizedData = preprocessingPipeline.Fit(trainingData).Transform(trainingData);
var normalizedFeatures = normalizedData.GetColumn<float[]>("Features").ToArray();

var normalizedTemperatures = normalizedFeatures.Select(f => f[0]);
var normalizedLuminosities = normalizedFeatures.Select(f => f[1]);
var normalizedInfrareds = normalizedFeatures.Select(f => f[2]);
var normalizedDistances = normalizedFeatures.Select(f => f[3]);
var normalizedHours = normalizedFeatures.Select(f => f[4]);
var normalizedDays = normalizedFeatures.Select(f => f[5]);

In [9]:
var segmentationNormalizedFeatures = Chart.Plot(new[] {
    new Graph.Box { y = normalizedTemperatures, name = "Temperature" },
    new Graph.Box { y = normalizedLuminosities, name = "Luminosity" },
    new Graph.Box { y = normalizedInfrareds, name = "Infrared" },
    new Graph.Box { y = normalizedDistances, name = "Distance" },
    new Graph.Box { y = normalizedHours, name = "Hour of Day" },
    new Graph.Box { y = normalizedDays, name = "Day of Year" }
});

var layout = new Layout.Layout()
{
    title = "Box plot segmentation"
};
segmentationNormalizedFeatures.WithLayout(layout);

display(segmentationNormalizedFeatures);

#### Build the training pipeline

In [10]:
var trainingPipeline = preprocessingPipeline
    .Append(mlContext.MulticlassClassification.Trainers.SdcaNonCalibrated("Label", "Features"));

#### Build the post-processing pipeline
> Map key (number) to value (string)  

In [11]:
var postprocessingPipeline = trainingPipeline
    .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

# 1. Validate Model

In [12]:
#r "nuget:Microsoft.Data.Analysis"
using Microsoft.AspNetCore.Html;
using Microsoft.Data.Analysis;


Installed package Microsoft.Data.Analysis version 0.4.0

#### Loads Confusion Matrix Formatter (from csx, library or nuget, see Formatters.csx)

In [13]:
#r "nuget:ApexCode.Interactive.Formatting,0.0.1-beta.5"
using ApexCode.Interactive.Formatting;

Installed package ApexCode.Interactive.Formatting version 0.0.1-beta.5

In [14]:
Formatters.Categories = new string[] { "FlashLight", "Infrared", "Day", "Lighter" };

Formatters.Register<List<TrainCatalogBase.CrossValidationResult<MulticlassClassificationMetrics>>>();
Formatters.Register<MulticlassClassificationMetrics>();

List<TrainCatalogBase.CrossValidationResult<MulticlassClassificationMetrics>> formatter loaded.
MulticlassClassificationMetrics formatter loaded.


In [15]:
Formatters.Categories

index,value
0,FlashLight
1,Infrared
2,Day
3,Lighter


In [17]:
var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(trainingData, postprocessingPipeline, numberOfFolds: 5, labelColumnName: "Label");
display(crossValidationResults.ToList())

CROSS-VALIDATION: multi-class classification,Average,Standard deviation,Confidence interval (95%)
MacroAccuracy,0.963,0.025,0.025
MicroAccuracy,0.961,0.02,0.02
LogLoss,7.515,8.635,8.462
LogLossReduction,-4.702,6.498,6.368


#### Permuation Feature Importance (PFI)

In [18]:
var modelForContributions = trainingPipeline.Fit(trainingData);
var transformedData = modelForContributions.Transform(testingData); // never do the PFI on training data!
var linearPredictor = modelForContributions.LastTransformer;

In [19]:
var pfi = mlContext.MulticlassClassification.PermutationFeatureImportance(predictionTransformer: linearPredictor, data: transformedData, permutationCount: 3);
var sortedMetrics = pfi.Select((metrics, index) => new { index, metrics.MacroAccuracy })
    .OrderBy(feature => Math.Abs(feature.MacroAccuracy.Mean)).Select(feature => feature.MacroAccuracy.Mean);


(1,46): error CS0411: The type arguments for method 'PermutationFeatureImportanceExtensions.PermutationFeatureImportance<TModel>(RegressionCatalog, ISingleFeaturePredictionTransformer<TModel>, IDataView, string, bool, int?, int)' cannot be inferred from the usage. Try specifying the type arguments explicitly.



Cell not executed: compilation error

In [20]:
var pfiDiagram = Chart.Plot(new Graph.Bar
    {
        x = sortedMetrics,
        y = featureColumns.Reverse(), 
        orientation = "h"
    });
var layout = new Layout.Layout()
{
    title = "Permuation Feature Importance (PFI)"
};
pfiDiagram.WithLayout(layout);
display(pfiDiagram);


(3,13): error CS0103: The name 'sortedMetrics' does not exist in the current context



Cell not executed: compilation error

# 2. Evaluate Model

#### Train the model

In [21]:
var model = postprocessingPipeline.Fit(trainingData);

#### Evaluate the model

In [22]:
var predictions = model.Transform(testingData);
var metrics = mlContext.MulticlassClassification.Evaluate(predictions, "Label", "Score", "PredictedLabel");

In [24]:
var categories = new string[] { "FlashLight", "Infrared", "Day", "Lighter" };
Formatters.Register<MulticlassClassificationMetrics>(categories);
metrics

MulticlassClassificationMetrics formatter loaded.


EVALUATION: multi-class classification,Class,Value,Note
MacroAccuracy,,0.931,"the closer to 1, the better"
MicroAccuracy,,0.942,"the closer to 1, the better"
LogLoss,,21.489,"the closer to 0, the better"
LogLoss per Class,,,
LogLoss per Class,FlashLight,29.311,"the closer to 0, the better"
LogLoss per Class,Infrared,27.21,"the closer to 0, the better"
LogLoss per Class,Day,13.35,"the closer to 0, the better"
LogLoss per Class,Lighter,1.191,"the closer to 0, the better"


#### Confusion matrix

In [25]:
//Formatters.Register<ConfusionMatrix>(categories);
//display(metrics.ConfusionMatrix);

Formatters.Register<ConfusionMatrixDisplayView>();
display(metrics.ConfusionMatrix.AddCategories(categories));


(5,9): error CS1929: 'ConfusionMatrix' does not contain a definition for 'AddCategories' and the best extension method overload 'JupyterExtensions.AddCategories(MulticlassClassificationMetrics, string[])' requires a receiver of type 'MulticlassClassificationMetrics'



Cell not executed: compilation error

#### Save the trained model

In [None]:
mlContext.Model.Save(model, trainingData.Schema, "model.zip");