# PREPARE DATA
## (build data preparation pipeline and training pipeline)

#### Install packages / import namespaces

In [None]:
#r "nuget:Microsoft.ML,1.5.0"
using XPlot.Plotly;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

# 1. LOAD DATA

#### Load data models from Models.csx file 

In [None]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Models.csx"

#### Initialize the ML context (we need it for building the data and training pipelines)  

In [None]:
// Single ML.NET context used by every cell below, created with a fixed seed of 0.
var mlContext = new MLContext(seed: 0);

#### Load data from csv file into a dataview

In [None]:
// Relative path of the CSV file that holds the input rows.
const string DATASET_PATH = "./sensors_data.csv";

// Read the comma-separated file (its first line is a header) into a data view
// whose columns are described by the ModelInput class.
IDataView data = mlContext.Data.LoadFromTextFile<ModelInput>(
    DATASET_PATH,
    separatorChar: ',',
    hasHeader: true);

#### Shuffle the data and split it into _trainingData_ and _testingData_ (test fraction of 0.2)

In [None]:
// Shuffle the rows with a fixed seed, then hold out 20% of them for testing.
var shuffledData = mlContext.Data.ShuffleRows(input: data, seed: 0);
var split = mlContext.Data.TrainTestSplit(data: shuffledData, testFraction: 0.2);

// The two halves of the split are used separately by the cells below.
var trainingData = split.TrainSet;
var testingData = split.TestSet;


#### Convert data to collection (examine data)

In [None]:
// Materialize the training rows once so the exploration cells below can scan them
// repeatedly without re-running the data view for every projection and chart.
// reuseRowObject must be false when the rows are kept: with true, ML.NET returns the
// same ModelInput instance for every row, so a stored collection would only contain
// repeated references to one object holding the last row's values.
var features = mlContext.Data
    .CreateEnumerable<ModelInput>(trainingData, reuseRowObject: false)
    .ToList();

// Preview the first ten rows.
display(features.Take(10));

#### Categorical distribution

In [None]:
// Project each column of interest out of the training rows and materialize it once.
// The results are reused by several charts below; leaving them as lazy LINQ queries
// would re-enumerate the rows (and re-parse every timestamp) each time one is plotted.
var sources = features.Select(f => f.Source).ToList();
var temperatures = features.Select(f => f.Temperature).ToList();
var luminosities = features.Select(f => f.Luminosity).ToList();
var infrareds = features.Select(f => f.Infrared).ToList();
var distances = features.Select(f => f.Distance).ToList();

// Parse CreatedAt a single time per row; hour-of-day and day-of-year both derive from it.
// NOTE(review): DateTime.Parse uses the current culture. If CreatedAt is written in a fixed,
// machine-readable format, consider passing CultureInfo.InvariantCulture — confirm against the CSV.
var timestamps = features.Select(f => DateTime.Parse(f.CreatedAt)).ToList();
var hours = timestamps.Select(t => t.Hour).ToList();
var days = timestamps.Select(t => t.DayOfYear).ToList();

In [None]:
// Histogram of the Source value of every training row.
var layout = new Layout.Layout { title = "Categories distribution" };

var categoriesHistogram = Chart.Plot(new Graph.Histogram { x = sources });
categoriesHistogram.WithLayout(layout);

display(categoriesHistogram);

In [None]:
// Same histogram as above, but over the Source values of the testing split.
var testingFeatures = mlContext.Data.CreateEnumerable<ModelInput>(testingData, true);
var testingSources = testingFeatures.Select(row => row.Source);

var layout = new Layout.Layout { title = "Categories distribution (testing data)" };

var categoriesHistogram = Chart.Plot(new Graph.Histogram { x = testingSources });
categoriesHistogram.WithLayout(layout);

display(categoriesHistogram);

#### Numerical distribution - temperature

In [None]:
// Histogram of the Temperature values of the training rows.
var layout = new Layout.Layout { title = "Temperature histogram" };

var temperaturesHistogram = Chart.Plot(new Graph.Histogram { x = temperatures });
temperaturesHistogram.WithLayout(layout);

display(temperaturesHistogram);

In [None]:
// Histogram of the Luminosity values of the training rows.
var layout = new Layout.Layout { title = "Luminosity histogram" };

var luminositiesHistogram = Chart.Plot(new Graph.Histogram { x = luminosities });
luminositiesHistogram.WithLayout(layout);

display(luminositiesHistogram);

In [None]:
// Histogram of the Infrared values of the training rows.
var layout = new Layout.Layout { title = "Infrared histogram" };

var infraredsHistogram = Chart.Plot(new Graph.Histogram { x = infrareds });
infraredsHistogram.WithLayout(layout);

display(infraredsHistogram);

In [None]:
// Histogram of the Distance values of the training rows.
var layout = new Layout.Layout { title = "Distance histogram" };

var distancesHistogram = Chart.Plot(new Graph.Histogram { x = distances });
distancesHistogram.WithLayout(layout);

display(distancesHistogram);

In [None]:
// Histogram of the hour-of-day component parsed from CreatedAt.
var layout = new Layout.Layout { title = "Hour histogram" };

var hoursHistogram = Chart.Plot(new Graph.Histogram { x = hours });
hoursHistogram.WithLayout(layout);

display(hoursHistogram);

In [None]:
// Histogram of the day-of-year component parsed from CreatedAt.
var layout = new Layout.Layout { title = "Day histogram" };

var daysHistogram = Chart.Plot(new Graph.Histogram { x = days });
daysHistogram.WithLayout(layout);

display(daysHistogram);

#### Box plot segmentation

In [None]:
// One box trace per numeric feature, all drawn in a single chart so their raw
// value ranges can be compared side by side.
var layout = new Layout.Layout { title = "Box plot segmentation" };

var featureBoxes = new[]
{
    new Graph.Box { name = "Temperature", y = temperatures },
    new Graph.Box { name = "Luminosity", y = luminosities },
    new Graph.Box { name = "Infrared", y = infrareds },
    new Graph.Box { name = "Distance", y = distances },
    new Graph.Box { name = "Hour of Day", y = hours },
    new Graph.Box { name = "Day of Year", y = days }
};

var segmentationDiagram = Chart.Plot(featureBoxes);
segmentationDiagram.WithLayout(layout);

display(segmentationDiagram);

#### Correlation matrix

In [None]:
#r "nuget:MathNet.Numerics"

#### Load helpers (see file Helpers.csx)

In [None]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Helpers.csx"

In [None]:
// Axis labels of the heat map, one per numeric feature.
var featureColumns = new[] { "Temperature", "Luminosity", "Infrared", "Distance", "Hour", "Day" };

// One list of doubles per feature, added in the same order as featureColumns.
var featureMatrix = new List<List<double>>
{
    temperatures.Select(Convert.ToDouble).ToList(),
    luminosities.Select(Convert.ToDouble).ToList(),
    infrareds.Select(Convert.ToDouble).ToList(),
    distances.Select(Convert.ToDouble).ToList(),
    hours.Select(Convert.ToDouble).ToList(),
    days.Select(Convert.ToDouble).ToList()
};

var layout = new Layout.Layout
{
    title = "Features Correlation Matrix",
    autosize = "true",
    margin = new Graph.Margin { l = 90 } // widen the left margin to accommodate longer labels
};

// Heat map of the values returned by Helpers.GetPearsonCorrelation, with the colour
// scale fixed to the range [-1, 1].
var correlationHeatmap = new Graph.Heatmap
{
    x = featureColumns,
    y = featureColumns.Reverse(),
    z = Helpers.GetPearsonCorrelation(featureMatrix),
    zmin = -1,
    zmax = 1
};

var correlationMatrix = Chart.Plot(correlationHeatmap);
correlationMatrix.WithLayout(layout);
display(correlationMatrix);

# 2. PREPROCESSING PIPELINE
> Map value (string) to key (number)   
> Custom mapping  
> Concatenate features  
> Normalize features  
> Drop unused columns 

#### Selected features for building the model 

In [None]:
// Names of the input columns that make up the model's feature vector, in order.
var featureColumns = new string[]
{
    "Temperature", "Luminosity", "Infrared", "Distance", "Hour", "Day"
};

#### Build the preprocessing pipeline

In [None]:
// Step 1: turn the "Label" values into keys.
var labelToKey = mlContext.Transforms.Conversion.MapValueToKey("Label");

// Step 2: row-level custom mapping defined in CustomMappings.
// NOTE(review): presumably this derives the "Hour" and "Day" columns used below — confirm in the mapping's source.
var customMapping = mlContext.Transforms.CustomMapping<CustomInputRow, CustomOutputRow>(
    CustomMappings.IncomeMapping,
    nameof(CustomMappings.IncomeMapping));

// Step 3: gather the selected columns into a single "Features" vector.
var concatenateFeatures = mlContext.Transforms.Concatenate("Features", featureColumns);

// Step 4: min-max normalize the "Features" vector.
var normalizeFeatures = mlContext.Transforms.NormalizeMinMax("Features");

var preprocessingPipeline = labelToKey
    .Append(customMapping)
    .Append(concatenateFeatures)
    .Append(normalizeFeatures);

#### Box plot segmentation for normalized data

In [None]:
// Fit the preprocessing steps on the training split and apply them to that same split.
var fittedPreprocessing = preprocessingPipeline.Fit(trainingData);
var normalizedData = fittedPreprocessing.Transform(trainingData);

// Each row's "Features" value is a float vector; pull all of them into memory.
var normalizedFeatures = normalizedData.GetColumn<float[]>("Features").ToArray();

// Split the vector back into per-feature sequences by slot index.
var normalizedTemperatures = normalizedFeatures.Select(row => row[0]);
var normalizedLuminosities = normalizedFeatures.Select(row => row[1]);
var normalizedInfrareds = normalizedFeatures.Select(row => row[2]);
var normalizedDistances = normalizedFeatures.Select(row => row[3]);
var normalizedHours = normalizedFeatures.Select(row => row[4]);
var normalizedDays = normalizedFeatures.Select(row => row[5]);

In [None]:
// Box plot of every feature after normalization, one trace per feature.
var layout = new Layout.Layout { title = "Box plot segmentation" };

var normalizedBoxes = new[]
{
    new Graph.Box { name = "Temperature", y = normalizedTemperatures },
    new Graph.Box { name = "Luminosity", y = normalizedLuminosities },
    new Graph.Box { name = "Infrared", y = normalizedInfrareds },
    new Graph.Box { name = "Distance", y = normalizedDistances },
    new Graph.Box { name = "Hour of Day", y = normalizedHours },
    new Graph.Box { name = "Day of Year", y = normalizedDays }
};

var segmentationNormalizedFeatures = Chart.Plot(normalizedBoxes);
segmentationNormalizedFeatures.WithLayout(layout);

display(segmentationNormalizedFeatures);

#### Build the training pipeline

In [None]:
// Non-calibrated SDCA multiclass trainer reading the "Label" and "Features" columns.
var sdcaTrainer = mlContext.MulticlassClassification.Trainers.SdcaNonCalibrated(
    labelColumnName: "Label",
    featureColumnName: "Features");

// Training pipeline = preprocessing steps followed by the trainer.
var trainingPipeline = preprocessingPipeline.Append(sdcaTrainer);

#### Build the post-processing pipeline

In [None]:
// Map the predicted key in "PredictedLabel" back to its original value.
var predictedKeyToValue = mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel");

// Full pipeline = training pipeline followed by the key-to-value mapping.
var postprocessingPipeline = trainingPipeline.Append(predictedKeyToValue);

# 3. VALIDATE MODEL

In [None]:
#r "nuget:Microsoft.Data.Analysis"
using Microsoft.AspNetCore.Html;
using Microsoft.Data.Analysis;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using System.Collections.Generic;
using static Microsoft.ML.TrainCatalogBase;

#### Load the confusion matrix formatter (see Formatters.csx)

In [None]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Formatters.csx"
// Category names handed to the formatter loaded from Formatters.csx.
var categories = new[] { "FlashLight", "Infrared", "Day", "Lighter" };
Formatters.Load(categories);

In [None]:
// 5-fold cross-validation of the complete pipeline, run on the training split only.
var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(
    data: trainingData,
    estimator: postprocessingPipeline,
    numberOfFolds: 5,
    labelColumnName: "Label");
display(crossValidationResults.ToList())

#### Permutation Feature Importance (PFI)

#### Model weights and biases

In [None]:
// Category names passed to the helper that extracts the model parameters.
var categories = new string[] { "FlashLight", "Infrared", "Day", "Lighter" };

// Fit on the TRAINING split. The original fitted on testingData, which meant the displayed
// weights came from a model trained on the held-out rows, and any later evaluation of this
// model against testingData (such as permutation feature importance) was measured on the very
// data the model had been trained on.
var modelForContributions = trainingPipeline.Fit(trainingData);

// Helpers.GetModelParameters returns a pair; both items are displayed.
// NOTE(review): presumably Item1/Item2 are the per-class weights and biases — confirm in Helpers.csx.
var parameters = Helpers.GetModelParameters(modelForContributions, categories);
display(parameters.Item1);
display(parameters.Item2);

#### Permutation Feature Importance (PFI)

In [None]:
// Run the fitted chain over the testing split; PFI works on this transformed data.
var transformedData = modelForContributions.Transform(testingData); // never do the PFI on training data!

// The last transformer of the chain is the predictor handed to PFI.
var linearPredictor = modelForContributions.LastTransformer;

var pfi = mlContext.MulticlassClassification.PermutationFeatureImportance(
    linearPredictor,
    transformedData,
    permutationCount: 3);

// Mean macro-accuracy value of each PFI entry, ordered by increasing magnitude.
var sortedMetrics = pfi
    .Select(stats => stats.MacroAccuracy.Mean)
    .OrderBy(mean => Math.Abs(mean));

In [None]:
// Pair every PFI entry with the name of its feature BEFORE sorting, so each bar keeps
// its own label. The original plotted the magnitude-sorted values against
// featureColumns.Reverse(), which attached labels to bars in an unrelated order.
// PFI yields one entry per slot of the "Features" vector, and that vector was
// concatenated in featureColumns order, so entry i belongs to featureColumns[i].
var rankedFeatures = pfi
    .Select((stats, index) => new { Feature = featureColumns[index], Mean = stats.MacroAccuracy.Mean })
    .OrderBy(entry => Math.Abs(entry.Mean))
    .ToList();

// Horizontal bars, smallest magnitude first.
var pfiDiagram = Chart.Plot(new Graph.Bar
    {
        x = rankedFeatures.Select(entry => entry.Mean).ToList(),
        y = rankedFeatures.Select(entry => entry.Feature).ToList(),
        orientation = "h"
    });
var layout = new Layout.Layout()
{
    title = "Permutation Feature Importance (PFI)" // spelling of the displayed title corrected
};
pfiDiagram.WithLayout(layout);
display(pfiDiagram);

# 4. TRAIN THE MODEL

In [None]:
// Fit the complete pipeline (preprocessing, trainer, key-to-value mapping) on the training split.
var model = postprocessingPipeline
    .Fit(trainingData);

# 5. EVALUATE THE MODEL

In [None]:
// Apply the trained model to the testing split.
var predictions = model.Transform(testingData);

// Compute the multiclass metrics from the label, score and predicted-label columns.
var metrics = mlContext.MulticlassClassification.Evaluate(
    predictions,
    labelColumnName: "Label",
    scoreColumnName: "Score",
    predictedLabelColumnName: "PredictedLabel");

In [None]:
// Show the evaluation metrics computed for the predictions on the testing split.
display(metrics)

In [None]:
// Show the confusion matrix that is part of the evaluation metrics.
display(metrics.ConfusionMatrix);

#### Save the trained model

In [None]:
// Write the trained model, together with the schema of its input data, to model.zip.
mlContext.Model.Save(
    model,
    inputSchema: trainingData.Schema,
    filePath: "model.zip");