# PREPARE DATA
(build data preparation pipeline and training pipeline)

In [1]:
#r "nuget:Microsoft.ML,1.5.2"
#r "nuget:Microsoft.ML.LightGBM,1.5.2"
using XPlot.Plotly;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers.LightGbm;

// ML.NET context used by the cells below; constructed with a fixed seed of 1.
var mlContext = new MLContext(seed: 1);

In [1]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Models.csx"

// CSV file the dataset is loaded from (relative to the working directory).
const string DATASET_PATH = "./sensors_data.csv";

// Read the comma-separated file, treating its first row as a header, into ModelInput rows.
var data = mlContext.Data.LoadFromTextFile<ModelInput>(DATASET_PATH, separatorChar: ',', hasHeader: true);

// Shuffle with a fixed seed, then hold out 30% of the rows as the test set.
var shuffledData = mlContext.Data.ShuffleRows(data, seed: 1);
var split = mlContext.Data.TrainTestSplit(shuffledData, testFraction: 0.3);

IDataView trainingData = split.TrainSet;
IDataView testingData = split.TestSet;

In [1]:
// Input columns that are concatenated, in this order, into the "Features" vector.
var featureColumns = new string[]
{
    "Temperature",
    "Luminosity",
    "Infrared",
    "Distance",
    "PIR",
    "Humidity"
};

// Preprocessing steps: map "Label" to a key, build "Features" from the columns above,
// then min-max normalize "Features" in place.
var preprocessingPipeline = mlContext.Transforms.Conversion
    .MapValueToKey(outputColumnName: "Label")
    .Append(mlContext.Transforms.Concatenate(outputColumnName: "Features", inputColumnNames: featureColumns))
    .Append(mlContext.Transforms.NormalizeMinMax(outputColumnName: "Features"));

### Box plot segmentation (for normalized data!)

In [1]:
// Fit the preprocessing steps on the training set and apply them to that same set,
// then pull the resulting "Features" vectors out as one float[] per row.
var fittedPreprocessing = preprocessingPipeline.Fit(trainingData);
var normalizedData = fittedPreprocessing.Transform(trainingData);
var normalizedFeatures = normalizedData.GetColumn<float[]>("Features").ToArray();

// One series per slot of the feature vector; the slot positions follow the order in
// which the columns were concatenated into "Features".
var normalizedTemperatures = normalizedFeatures.Select(row => row[0]);
var normalizedLuminosities = normalizedFeatures.Select(row => row[1]);
var normalizedInfrareds = normalizedFeatures.Select(row => row[2]);
var normalizedDistances = normalizedFeatures.Select(row => row[3]);
var normalizedPIRs = normalizedFeatures.Select(row => row[4]);
var normalizedHumidities = normalizedFeatures.Select(row => row[5]);

// One box trace per feature, all drawn in a single chart.
var segmentationNormalizedFeatures = Chart.Plot(new[]
{
    new Graph.Box { name = "Temperature", y = normalizedTemperatures },
    new Graph.Box { name = "Luminosity", y = normalizedLuminosities },
    new Graph.Box { name = "Infrared", y = normalizedInfrareds },
    new Graph.Box { name = "Distance", y = normalizedDistances },
    new Graph.Box { name = "PIR", y = normalizedPIRs },
    new Graph.Box { name = "Humidity", y = normalizedHumidities }
});

var layout = new Layout.Layout() { title = "Box plot segmentation" };
segmentationNormalizedFeatures.WithLayout(layout);

display(segmentationNormalizedFeatures);

### Build the training pipeline

In [1]:
// Preprocessing followed by the LightGBM multiclass trainer reading "Label" and "Features".
var trainingPipeline = preprocessingPipeline.Append(
    mlContext.MulticlassClassification.Trainers.LightGbm(
        labelColumnName: "Label",
        featureColumnName: "Features"));

// Same chain plus a final step that maps the "PredictedLabel" key back to its value.
var postprocessingPipeline = trainingPipeline.Append(
    mlContext.Transforms.Conversion.MapKeyToValue(outputColumnName: "PredictedLabel"));

// Train the full chain on the training split.
var model = postprocessingPipeline.Fit(trainingData);

### Permutation Feature Importance (PFI)

In [1]:
// Fit trainingPipeline (the chain that ends with the LightGBM trainer, without the
// MapKeyToValue step) so that LastTransformer is the trained predictor passed to PFI.
var modelForContributions = trainingPipeline.Fit(trainingData);
var transformedData = modelForContributions.Transform(testingData); // never do the PFI on training data!
// NOTE: the variable name is historical; the predictor comes from the LightGBM trainer.
var linearPredictor = modelForContributions.LastTransformer;

// Metric statistics per feature, computed over 30 permutations each.
var permutationMetrics = mlContext.MulticlassClassification.PermutationFeatureImportance(
    linearPredictor,
    transformedData,
    permutationCount: 30);

// Feature indices ordered from the largest to the smallest absolute mean micro-accuracy change.
var sortedIndices = permutationMetrics
    .Select((metrics, index) => new { index, metrics.MicroAccuracy })
    .OrderByDescending(feature => Math.Abs(feature.MicroAccuracy.Mean))
    .Select(feature => feature.index);
var microAccuracy = permutationMetrics.Select(x => x.MicroAccuracy).ToArray();

// Reverse the ranking once and materialize it, so that bar values and bar labels are
// both derived from exactly the same index sequence.
var plotOrder = sortedIndices.Reverse().ToArray();

var pfiDiagram = Chart.Plot(new Graph.Bar
    {
        x = plotOrder.Select(i => microAccuracy[i].Mean),
        // Look each label up by its feature index. Using featureColumns.Reverse() here
        // would pair the ranked values with names in plain declaration order.
        y = plotOrder.Select(i => featureColumns[i]),
        orientation = "h"
    });
var layout = new Layout.Layout()
{
    title = "Permutation Feature Importance (PFI)"
};
pfiDiagram.WithLayout(layout);
display(pfiDiagram);