# C. PREPARE DATA
## (build data preparation pipeline and training pipeline)

#### Install packages / import namespaces

In [1]:
#r "nuget:Microsoft.ML,1.5.0-preview2"
using XPlot.Plotly;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

# 1. LOAD DATA

#### Load data models from Models.cs file 

In [2]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Models.cs"

#### Initialize the ML context (we need it for building the data and training pipelines)  

In [3]:
// Single entry point for all ML.NET operations in this notebook.
// A fixed seed keeps the context's random number generator deterministic between runs.
var mlContext = new MLContext(seed: 0);

#### Load data from csv file into a dataview

In [4]:
// Relative path of the CSV file holding the sensor readings.
const string DATASET_PATH = "./sensors_data.csv";

// The first row of the file is a header and fields are comma-separated;
// columns are bound to the ModelInput class loaded from Models.cs.
IDataView data = mlContext.Data.LoadFromTextFile<ModelInput>(DATASET_PATH, separatorChar: ',', hasHeader: true);

#### Shuffle the data and split it into _trainingData_ (80%) and _testingData_ (20%)

In [5]:
// Shuffle with a fixed seed, then hold out 20% of the rows for testing.
IDataView shuffledData = mlContext.Data.ShuffleRows(data, seed: 0);
var split = mlContext.Data.TrainTestSplit(shuffledData, testFraction: 0.2);

IDataView trainingData = split.TrainSet;
IDataView testingData = split.TestSet;


#### Convert data to collection (examine data)

In [6]:
// Materialize the training rows once. CreateEnumerable is lazy, so without ToList() every
// projection and every chart built from `features` would open a fresh cursor over the data
// view. A single in-memory list also guarantees that all derived columns see the same rows
// in the same order, which the scatter plots and the correlation matrix rely on.
//
// reuseRowObject must be false when the rows are stored: with true, the same ModelInput
// instance is handed back for every row, so the list would hold N references to one object.
var features = mlContext.Data
    .CreateEnumerable<ModelInput>(trainingData, reuseRowObject: false)
    .ToList();

// Preview the first ten training rows.
display(features.Take(10));

index,Temperature,Luminosity,Infrared,Distance,CreatedAt,Source
0,24.82,50.49,0.0,8.84,01/03/2020 18:22:56,FlashLight
1,23.96,4.1,0.0,154.09,06/03/2020 21:31:55,Day
2,32.5,72.27,0.0,66.87,05/03/2020 11:29:22,FlashLight
3,47.32,100.0,94.34,41.55,04/03/2020 9:28:24,Lighter
4,25.09,12.11,0.0,400.0,04/03/2020 10:26:18,Day
5,56.03,100.0,92.29,400.0,04/03/2020 8:27:51,Lighter
6,32.18,11.72,15.43,60.87,05/03/2020 12:30:21,Infrared
7,23.28,45.51,0.0,12.08,02/03/2020 10:23:51,FlashLight
8,27.34,70.31,0.0,194.26,03/03/2020 19:24:48,FlashLight
9,24.77,18.55,0.0,400.0,03/03/2020 18:25:00,Day


#### Categorical distribution

In [7]:
// Parses a CreatedAt value with an explicit format and the invariant culture, so the derived
// hour/day values do not depend on the regional settings of the machine running the notebook.
// NOTE(review): the sample rows ("01/03/2020 18:22:56", "04/03/2020 9:28:24") are read here as
// day/month/year with an unpadded hour — confirm against the component that writes
// sensors_data.csv and against how Models.cs derives the Hour and Day features.
DateTime ParseCreatedAt(string createdAt) =>
    DateTime.ParseExact(createdAt, "d/M/yyyy H:m:s", System.Globalization.CultureInfo.InvariantCulture);

// One projected sequence per column, used by the charts below.
var sources = features.Select(f => f.Source);
var temperatures = features.Select(f => f.Temperature);
var luminosities = features.Select(f => f.Luminosity);
var infrareds = features.Select(f => f.Infrared);
var distances = features.Select(f => f.Distance);

// Time-derived columns: hour of the day and day of the year of each reading.
var hours = features.Select(f => ParseCreatedAt(f.CreatedAt).Hour);
var days = features.Select(f => ParseCreatedAt(f.CreatedAt).DayOfYear);

In [8]:
// Histogram of the Source column of the training rows.
var layout = new Layout.Layout { title = "Categories distribution" };
var categoriesHistogram = Chart.Plot(new Graph.Histogram { x = sources });

categoriesHistogram.WithLayout(layout);
display(categoriesHistogram);

#### Numerical distribution

In [9]:
// Histogram of the Temperature column of the training rows.
var layout = new Layout.Layout { title = "Temperature histogram" };
var temperaturesHistogram = Chart.Plot(new Graph.Histogram { x = temperatures });

temperaturesHistogram.WithLayout(layout);
display(temperaturesHistogram);

In [10]:
// Histogram of the Luminosity column of the training rows.
var layout = new Layout.Layout { title = "Luminosity histogram" };
var luminositiesHistogram = Chart.Plot(new Graph.Histogram { x = luminosities });

luminositiesHistogram.WithLayout(layout);
display(luminositiesHistogram);

In [11]:
// Histogram of the Infrared column of the training rows.
var layout = new Layout.Layout { title = "Infrared histogram" };
var infraredsHistogram = Chart.Plot(new Graph.Histogram { x = infrareds });

infraredsHistogram.WithLayout(layout);
display(infraredsHistogram);

In [12]:
// Histogram of the Distance column of the training rows.
var layout = new Layout.Layout { title = "Distance histogram" };
var distancesHistogram = Chart.Plot(new Graph.Histogram { x = distances });

distancesHistogram.WithLayout(layout);
display(distancesHistogram);

In [13]:
// Histogram of the hour of day extracted from CreatedAt.
var layout = new Layout.Layout { title = "Hour histogram" };
var hoursHistogram = Chart.Plot(new Graph.Histogram { x = hours });

hoursHistogram.WithLayout(layout);
display(hoursHistogram);

In [14]:
// Histogram of the day of year extracted from CreatedAt.
var layout = new Layout.Layout { title = "Day histogram" };
var daysHistogram = Chart.Plot(new Graph.Histogram { x = days });

daysHistogram.WithLayout(layout);
display(daysHistogram);

#### All vs All 

In [15]:
// Overlays every pairwise combination of the four sensor readings as marker-only scatter traces.
// The chart is named for what it shows: reusing `daysHistogram` here would shadow the
// day-of-year histogram declared in the previous cell.
var allVsAllScatter = Chart.Plot(new[] {
    new Graph.Scatter { x = temperatures, y = luminosities, mode = "markers", name = "Temperature vs Luminosity" },
    new Graph.Scatter { x = temperatures, y = infrareds, mode = "markers", name = "Temperature vs Infrared" },
    new Graph.Scatter { x = temperatures, y = distances, mode = "markers", name = "Temperature vs Distance" },
    new Graph.Scatter { x = luminosities, y = infrareds, mode = "markers", name = "Luminosity vs Infrared" },
    new Graph.Scatter { x = luminosities, y = distances, mode = "markers", name = "Luminosity vs Distance" },
    new Graph.Scatter { x = infrareds, y = distances, mode = "markers", name = "Infrared vs Distance" },
});

var layout = new Layout.Layout()
{
    title = "All vs All"
};
allVsAllScatter.WithLayout(layout);
display(allVsAllScatter);

#### Box plot segmentation

In [16]:
// One box per feature, drawn from the raw (not yet normalized) training values.
var layout = new Layout.Layout { title = "Box plot segmentation" };

var segmentationDiagram = Chart.Plot(new[]
{
    new Graph.Box { y = temperatures, name = "Temperature" },
    new Graph.Box { y = luminosities, name = "Luminosity" },
    new Graph.Box { y = infrareds, name = "Infrared" },
    new Graph.Box { y = distances, name = "Distance" },
    new Graph.Box { y = hours, name = "Hour of Day" },
    new Graph.Box { y = days, name = "Day of Year" }
});
segmentationDiagram.WithLayout(layout);

display(segmentationDiagram);

#### Correlation matrix

In [17]:
#r "nuget:MathNet.Numerics"

#### Load helpers (see file Helpers.cs)

In [18]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Helpers.cs"

In [19]:
// Axis labels of the heatmap, listed in the same order as the series added to the matrix below.
string[] featureColumns = { "Temperature", "Luminosity", "Infrared", "Distance", "Hour", "Day" };

// One list of doubles per feature, handed to the correlation helper from Helpers.cs.
var featureMatrix = new List<List<double>>
{
    temperatures.Select(Convert.ToDouble).ToList(),
    luminosities.Select(Convert.ToDouble).ToList(),
    infrareds.Select(Convert.ToDouble).ToList(),
    distances.Select(Convert.ToDouble).ToList(),
    hours.Select(Convert.ToDouble).ToList(),
    days.Select(Convert.ToDouble).ToList()
};

// Color scale is pinned to [-1, 1]; the y axis uses the labels in reverse order.
var heatmap = new Graph.Heatmap
{
    x = featureColumns,
    y = featureColumns.Reverse(),
    z = Helpers.GetPearsonCorrelation(featureMatrix),
    zmin = -1,
    zmax = 1
};
var correlationMatrix = Chart.Plot(heatmap);

var layout = new Layout.Layout
{
    autosize = "true",
    margin = new Graph.Margin { l = 90 }, // widen the left margin to accommodate longer labels
    title = "Features Correlation Matrix"
};
correlationMatrix.WithLayout(layout);
display(correlationMatrix);

# 2. PREPROCESSING PIPELINE
> Map value (string) to key (number)   
> Custom mapping  
> Concatenate features  
> Normalize features  
> Drop unused columns 

#### Selected features for building the model 

In [20]:
// Names of the columns that the preprocessing pipeline concatenates into "Features".
string[] featureColumns = { "Temperature", "Luminosity", "Infrared", "Distance", "Hour", "Day" };

#### Build the preprocessing pipeline

In [21]:
// Preprocessing steps, applied in this order:
//   1. convert the "Label" column from value to key,
//   2. run the custom mapping declared in Models.cs (CustomInputRow -> CustomOutputRow),
//   3. concatenate the selected feature columns into a single "Features" vector,
//   4. min-max normalize that vector.
var preprocessingPipeline = mlContext.Transforms.Conversion
    .MapValueToKey(outputColumnName: "Label")
    .Append(mlContext.Transforms.CustomMapping<CustomInputRow, CustomOutputRow>(
        mapAction: CustomMappings.IncomeMapping,
        contractName: nameof(CustomMappings.IncomeMapping)))
    .Append(mlContext.Transforms.Concatenate(outputColumnName: "Features", inputColumnNames: featureColumns))
    .Append(mlContext.Transforms.NormalizeMinMax(outputColumnName: "Features"));

#### Box plot segmentation for normalized data

In [22]:
// Fit the preprocessing steps on the training rows and push the same rows through them.
var normalizedData = preprocessingPipeline.Fit(trainingData).Transform(trainingData);

// One float[] per row: the normalized "Features" vector.
var normalizedFeatures = normalizedData.GetColumn<float[]>(columnName: "Features").ToArray();

// Lazily projects a single slot of every feature vector.
IEnumerable<float> FeatureSlot(int slot) => normalizedFeatures.Select(row => row[slot]);

// Slot positions follow the order of featureColumns used by Concatenate.
var normalizedTemperatures = FeatureSlot(0);
var normalizedLuminosities = FeatureSlot(1);
var normalizedInfrareds = FeatureSlot(2);
var normalizedDistances = FeatureSlot(3);
var normalizedHours = FeatureSlot(4);
var normalizedDays = FeatureSlot(5);

In [23]:
// Same box plot as for the raw data, this time fed with the min-max normalized values.
var layout = new Layout.Layout { title = "Box plot segmentation" };

var segmentationNormalizedFeatures = Chart.Plot(new[]
{
    new Graph.Box { y = normalizedTemperatures, name = "Temperature" },
    new Graph.Box { y = normalizedLuminosities, name = "Luminosity" },
    new Graph.Box { y = normalizedInfrareds, name = "Infrared" },
    new Graph.Box { y = normalizedDistances, name = "Distance" },
    new Graph.Box { y = normalizedHours, name = "Hour of Day" },
    new Graph.Box { y = normalizedDays, name = "Day of Year" }
});
segmentationNormalizedFeatures.WithLayout(layout);

display(segmentationNormalizedFeatures);

#### Build the training pipeline

In [24]:
// Preprocessing followed by the multiclass SDCA (non-calibrated) trainer,
// reading the key-typed "Label" column and the normalized "Features" vector.
var trainingPipeline = preprocessingPipeline.Append(
    mlContext.MulticlassClassification.Trainers.SdcaNonCalibrated(
        labelColumnName: "Label",
        featureColumnName: "Features"));

#### Build the postprocessing pipeline

In [25]:
// Training pipeline plus a final step that maps the predicted key back to its original value.
var postprocessingPipeline = trainingPipeline.Append(
    mlContext.Transforms.Conversion.MapKeyToValue(outputColumnName: "PredictedLabel"));

# 3. VALIDATE DATA

In [26]:
#r "nuget:Microsoft.Data.Analysis"
using Microsoft.AspNetCore.Html;
using Microsoft.Data.Analysis;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using System.Collections.Generic;
using static Microsoft.ML.TrainCatalogBase;

#### Loads Confusion Matrix Formatter (see Formatters.cs)

In [27]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Formatters.cs"
// Class names handed to the formatter from Formatters.cs.
string[] categories = { "FlashLight", "Infrared", "Day", "Lighter" };
Formatters.Load(categories);

In [32]:
// 5-fold cross-validation of the complete pipeline, run on the training split only.
var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(
    trainingData,
    postprocessingPipeline,
    numberOfFolds: 5,
    labelColumnName: "Label");
display(crossValidationResults.ToList())

CROSS-VALIDATION: multi-class classification,Average,Standard deviation,Confidence interval (95%)
MacroAccuracy,0.952,0.02,0.02
MicroAccuracy,0.949,0.019,0.018
LogLoss,17.039,9.949,9.75
LogLossReduction,-11.946,7.458,7.309


#### Permutation Feature Importance (PFI)

#### Model weights and biases

In [33]:
// Class names passed to the helper that extracts the model parameters.
string[] categories = { "FlashLight", "Infrared", "Day", "Lighter" };

// Fit the pipeline that ends with the trainer (no key-to-value step).
var modelForContributions = trainingPipeline.Fit(trainingData);

// Helper from Helpers.cs; presumably Item1 holds the per-class weights and Item2 the
// per-class biases, as the rendered tables suggest — verify against Helpers.cs.
var parameters = Helpers.GetModelParameters(modelForContributions, categories);
display(parameters.Item1);
display(parameters.Item2);

key,Unnamed: 1
FlashLight,"[ -2.0155115, 3.0598867, -14.910331, -3.1781929, 0.21523052, -2.1322143 ]"
Infrared,"[ -7.99422, -11.5135, -15.14083, -1.429365, -1.376484, -2.1999965 ]"
Day,"[ 8.283228, 8.215717, 15.1907215, -1.644344, -1.5923918, -1.9654037 ]"
Lighter,"[ -2.4089074, -4.931906, 13.934696, -1.4483268, -5.540555, -0.9592686 ]"


key,value
FlashLight,-0.6820978
Infrared,5.051454
Day,-12.5575285
Lighter,1.4842105


#### Permutation Feature Importance (PFI)

In [34]:
// Run the fitted pipeline over the training rows and take its last transformer (the trained
// predictor); both are inputs to permutation feature importance.
var transformedData = modelForContributions.Transform(trainingData);
var linearPredictor = modelForContributions.LastTransformer;

// Three permutations per feature.
var pfi = mlContext.MulticlassClassification.PermutationFeatureImportance(
    linearPredictor,
    transformedData,
    permutationCount: 3);

// Mean macro-accuracy values ordered by ascending magnitude. Only the mean survives the
// final Select; the feature position captured in the first projection is dropped.
var sortedMetrics = pfi
    .Select((featureMetrics, slot) => new { slot, featureMetrics.MacroAccuracy })
    .OrderBy(entry => Math.Abs(entry.MacroAccuracy.Mean))
    .Select(entry => entry.MacroAccuracy.Mean);

In [35]:
// Pair every PFI result with its feature name BEFORE sorting, so each bar keeps the label of
// the feature it belongs to. Plotting the sorted values against featureColumns.Reverse()
// would label the bars by position only, regardless of which feature produced each value.
// Entry i of `pfi` is taken to describe slot i of the "Features" vector, i.e. featureColumns[i],
// because Concatenate builds that vector in featureColumns order.
var rankedFeatures = pfi
    .Select((featureMetrics, slot) => new { Name = featureColumns[slot], featureMetrics.MacroAccuracy.Mean })
    .OrderBy(feature => Math.Abs(feature.Mean)) // ascending magnitude
    .ToList();

// Horizontal bars: mean change in macro-accuracy on x, feature name on y.
var pfiDiagram = Chart.Plot(new Graph.Bar
    {
        x = rankedFeatures.Select(feature => feature.Mean),
        y = rankedFeatures.Select(feature => feature.Name),
        orientation = "h"
    });
var layout = new Layout.Layout()
{
    title = "Permuation Feature Importance (PFI)"
};
pfiDiagram.WithLayout(layout);
display(pfiDiagram);

# 4. TRAIN THE MODEL

In [36]:
// Fit the complete pipeline (preprocessing, trainer, key-to-value mapping) on the training split.
var model = postprocessingPipeline.Fit(input: trainingData);

# 5. EVALUATE THE MODEL

In [37]:
// Score the held-out test split with the trained model and compute the multiclass metrics.
var predictions = model.Transform(testingData);
var metrics = mlContext.MulticlassClassification.Evaluate(
    predictions,
    labelColumnName: "Label",
    scoreColumnName: "Score",
    predictedLabelColumnName: "PredictedLabel");

In [38]:
// Render the evaluation metrics computed on the test split.
display(metrics)

EVALUATION: multi-class classification,Class,Value,Note
MacroAccuracy,,0.978,"the closer to 1, the better"
MicroAccuracy,,0.982,"the closer to 1, the better"
LogLoss,,24.03,"the closer to 0, the better"
LogLoss per Class,,,
LogLoss per Class,FlashLight,29.942,"the closer to 0, the better"
LogLoss per Class,Infrared,30.395,"the closer to 0, the better"
LogLoss per Class,Day,2.382,"the closer to 0, the better"
LogLoss per Class,Lighter,20.821,"the closer to 0, the better"


In [39]:
// Render the confusion matrix of the test-split evaluation.
display(metrics.ConfusionMatrix);

0,1,2,3,4,5,6
Confusion Matrix,Confusion Matrix,Predicted,Predicted,Predicted,Predicted,
Confusion Matrix,Confusion Matrix,FlashLight,Infrared,Day,Lighter,Recall
Truth,FlashLight,66,1,0,0,0.9851
Truth,Infrared,0,49,0,0,1
Truth,Day,1,0,28,0,0.9655
Truth,Lighter,1,0,0,24,0.96
Precision,Precision,0.9706,0.98,1,1,total = 170


#### Save the trained model

In [40]:
// Persist the trained model together with the schema of the data it was trained on.
mlContext.Model.Save(
    model,
    inputSchema: trainingData.Schema,
    filePath: "model.zip");