# ANALYZE DATA
### (numerical and categorical distribution, box plot segmentation and correlation matrix)

In [1]:
#r "nuget:Microsoft.ML,1.5.1"
using XPlot.Plotly;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

Installed package Microsoft.ML version 1.5.1

#### Load data models from Models.csx file 

In [2]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Models.csx"

#### Initialize the ML context (we need it for building the data and training pipelines)  

In [3]:
// Shared ML.NET entry point used by every later cell.
// A fixed seed is supplied so that randomized steps are repeatable between runs.
var mlContext = new MLContext(seed: 1);

#### Load data from csv file into a dataview

In [4]:
// Location of the sensor readings, relative to the notebook's working directory.
const string DATASET_PATH = "./sensors_data.csv";

// Read the comma-separated file (first line is a header) into a data view whose
// columns are bound according to the ModelInput class.
var data = mlContext.Data.LoadFromTextFile<ModelInput>(DATASET_PATH, separatorChar: ',', hasHeader: true);

#### Shuffle the data and split it into _trainingData_ and _testingData_ (test fraction of 0.2)

In [5]:
// Randomize the row order first, then hold out 20% of the rows for testing.
IDataView shuffledData = mlContext.Data.ShuffleRows(data, seed: 0);
DataOperationsCatalog.TrainTestData split = mlContext.Data.TrainTestSplit(shuffledData, testFraction: 0.2);

// Expose both partitions under the names used by the rest of the notebook.
IDataView trainingData = split.TrainSet;
IDataView testingData = split.TestSet;

#### Extract features

In [6]:
// Project each column of the training set into its own in-memory list.
//
// Every projection is materialized with ToList() so the data view is read once per
// column here, instead of being re-read (and CreatedAt re-parsed) each time a chart
// or the correlation matrix consumes one of these sequences.
//
// reuseRowObject is true, so the ModelInput instances handed out by the enumerable
// must not be cached themselves; only the values extracted from them are stored.
var features = mlContext.Data.CreateEnumerable<ModelInput>(trainingData, reuseRowObject: true);
var sources = features.Select(f => f.Source).ToList();
var temperatures = features.Select(f => f.Temperature).ToList();
var luminosities = features.Select(f => f.Luminosity).ToList();
var infrareds = features.Select(f => f.Infrared).ToList();
var distances = features.Select(f => f.Distance).ToList();

// Parse each timestamp a single time and derive both calendar features from it.
// NOTE(review): DateTime.Parse uses the current culture, as the original did — confirm
// the CSV timestamp format is culture-independent before running on another machine.
var createdAtTimes = features.Select(f => DateTime.Parse(f.CreatedAt)).ToList();
var hours = createdAtTimes.Select(t => t.Hour).ToList();
var days = createdAtTimes.Select(t => t.DayOfYear).ToList();

# 1. Numerical Distribution

In [8]:
// Title shown above the combined histogram chart.
var layout = new Layout.Layout { title = "Numerical distribution" };

// One histogram trace per numerical feature, all drawn on the same chart.
var histogramsDiagram = Chart.Plot(new Graph.Histogram[]
{
    new Graph.Histogram { name = "Temperature", x = temperatures },
    new Graph.Histogram { name = "Luminosity", x = luminosities },
    new Graph.Histogram { name = "Infrared", x = infrareds },
    new Graph.Histogram { name = "Distance", x = distances },
    new Graph.Histogram { name = "Hour", x = hours },
    new Graph.Histogram { name = "Day", x = days }
});
histogramsDiagram.WithLayout(layout);

// Render the chart as the cell output.
display(histogramsDiagram);

# 2. Categorical Distribution

In [9]:
// Title for the class-count chart of the training set.
var layout = new Layout.Layout { title = "Categorical distribution" };

// Histogram over the Source values, i.e. how many training rows each category has.
var categoriesHistogram = Chart.Plot(new Graph.Histogram { x = sources });
categoriesHistogram.WithLayout(layout);

// Render the chart as the cell output.
display(categoriesHistogram);

In [10]:
// Same category count as the previous cell, but computed over the held-out test set
// so the two distributions can be compared.
var testingFeatures = mlContext.Data.CreateEnumerable<ModelInput>(testingData, reuseRowObject: true);
var testingSources = testingFeatures.Select(row => row.Source);

// Title for the class-count chart of the testing set.
var layout = new Layout.Layout { title = "Categorical distribution (for testing data)" };

var categoriesHistogram = Chart.Plot(new Graph.Histogram { x = testingSources });
categoriesHistogram.WithLayout(layout);

// Render the chart as the cell output.
display(categoriesHistogram);

# 3. Box Plot Segmentation

In [11]:
// Title shown above the combined box-plot chart.
var layout = new Layout.Layout { title = "Box plot segmentation" };

// One box trace per numerical feature, drawn side by side on a shared axis.
var segmentationDiagram = Chart.Plot(new Graph.Box[]
{
    new Graph.Box { name = "Temperature", y = temperatures },
    new Graph.Box { name = "Luminosity", y = luminosities },
    new Graph.Box { name = "Infrared", y = infrareds },
    new Graph.Box { name = "Distance", y = distances },
    new Graph.Box { name = "Hour of Day", y = hours },
    new Graph.Box { name = "Day of Year", y = days }
});
segmentationDiagram.WithLayout(layout);

// Render the chart as the cell output.
display(segmentationDiagram);

# 4. Correlation Matrix

In [12]:
#r "nuget:MathNet.Numerics"

Installed package MathNet.Numerics version 4.12.0

In [13]:
#load "C:\Users\dcost\source\repos\SmartFireAlarm\SmartFireAlarm\Jupyter\Helpers.csx"





In [14]:
// Axis labels, listed in the same order as the rows of featureMatrix below.
var featureColumns = new[] { "Temperature", "Luminosity", "Infrared", "Distance", "Hour", "Day" };

// One row of doubles per feature; every feature is converted to double so that all
// rows share a single numeric type.
var featureMatrix = new List<List<double>>
{
    temperatures.Select(Convert.ToDouble).ToList(),
    luminosities.Select(Convert.ToDouble).ToList(),
    infrareds.Select(Convert.ToDouble).ToList(),
    distances.Select(Convert.ToDouble).ToList(),
    hours.Select(Convert.ToDouble).ToList(),
    days.Select(Convert.ToDouble).ToList()
};

var layout = new Layout.Layout
{
    autosize = "true",
    margin = new Graph.Margin { l = 90 }, // wider left margin so the longer labels fit
    title = "Features Correlation Matrix"
};

// Heatmap of the values returned by Helpers.GetPearsonCorrelation (defined in
// Helpers.csx, not shown here). The y labels are the x labels in reverse order, and
// the color range is bounded by zmin/zmax at -1 and 1.
var correlationMatrix = Chart.Plot(
    new Graph.Heatmap
    {
        x = featureColumns,
        y = featureColumns.Reverse(),
        z = Helpers.GetPearsonCorrelation(featureMatrix),
        zmin = -1,
        zmax = 1
    }
);
correlationMatrix.WithLayout(layout);

// Render the chart as the cell output.
display(correlationMatrix);