# Regression model for Taxi fares using ML.NET

Regression is a type of supervised machine learning task. 
A regression ML model predicts continuous value outputs (such as numbers). 
For instance, predicting the fare of a Taxi trip or predicting the price of a car is a regression problem.

# Install the NuGet packages you use in the notebook

In [40]:
#r "nuget:Microsoft.ML,1.4.0"
#r "nuget:Microsoft.Data.Analysis,0.1.0"
#r "nuget:XPlot.Plotly,2.0.0"
#r "nuget:Microsoft.ML.Mkl.Components,1.4.0"
using XPlot.Plotly;
using Microsoft.Data.Analysis;
using Microsoft.ML;
using Microsoft.ML.Data;

# Declare data-classes for input data and predictions

In [4]:
display(h4("Declaring data-classes to use across the code in this notebook."));

/// <summary>
/// One row of the taxi-fare CSV files. Every field is bound to a column
/// position in the file through its <c>[LoadColumn]</c> index.
/// </summary>
public class TaxiTrip
{
    /// <summary>Taxi vendor code, read as text (sample values: "CMT", "VTS").</summary>
    [LoadColumn(0)] public string VendorId;

    /// <summary>Rate code, read as text even though the sample values look numeric.</summary>
    [LoadColumn(1)] public string RateCode;

    /// <summary>Number of passengers on the trip.</summary>
    [LoadColumn(2)] public float PassengerCount;

    /// <summary>Trip duration. NOTE(review): the unit is not stated here; sample values such as 1271 suggest seconds - confirm.</summary>
    [LoadColumn(3)] public float TripTime;

    /// <summary>Trip distance. NOTE(review): the unit is not stated here - confirm against the dataset description.</summary>
    [LoadColumn(4)] public float TripDistance;

    /// <summary>Payment type code, read as text (sample values: "CRD", "CSH").</summary>
    [LoadColumn(5)] public string PaymentType;

    /// <summary>Fare charged for the trip; this is the value the regression model learns to predict.</summary>
    [LoadColumn(6)] public float FareAmount;
}

/// <summary>
/// Shape of a single prediction: only the model's "Score" output column is mapped.
/// </summary>
public class TaxiTripFarePrediction
{
    /// <summary>Predicted fare, bound to the "Score" column by name.</summary>
    [ColumnName("Score")] public float Score;
}

In [26]:
using Microsoft.AspNetCore.Html;
using System.Collections;
// Registers an HTML formatter for DataDebuggerPreview so that a previewed data view
// is rendered in the notebook as a table: one header cell per visible column, an
// extra leading "index" column, and at most the first 100 rows.
Formatter<DataDebuggerPreview>.Register((preview, output) =>
{
    const int maxRows = 100;

    // Walk the columns once: remember the position of every non-hidden column
    // and emit its header cell in the same pass.
    var headerCells = new List<IHtmlContent>();
    headerCells.Add((IHtmlContent)th(i("index")));
    var visiblePositions = new List<int>();
    for (var position = 0; position < preview.ColumnView.Length; position++)
    {
        var column = preview.ColumnView[position].Column;
        if (column.IsHidden)
        {
            continue;
        }

        visiblePositions.Add(position);
        headerCells.Add((IHtmlContent)th(column.Name));
    }

    var rowCount = Math.Min(maxRows, preview.RowView.Length);
    var tableRows = new List<List<IHtmlContent>>();
    for (var rowNumber = 0; rowNumber < rowCount; rowNumber++)
    {
        var rowValues = preview.RowView[rowNumber].Values;
        var rowCells = new List<IHtmlContent>();
        rowCells.Add((IHtmlContent)td(rowNumber));

        foreach (var position in visiblePositions)
        {
            object cellValue = rowValues[position].Value;
            var cellType = cellValue.GetType();
            var isVector = cellType.IsGenericType
                && cellType.GetGenericTypeDefinition() == typeof(VBuffer<>);

            if (!isVector)
            {
                rowCells.Add((IHtmlContent)td(cellValue));
                continue;
            }

            // VBuffer<T> is generic over an element type that is unknown at compile time,
            // so DenseValues() is called via reflection and the elements are joined
            // into one comma-separated string.
            var denseValues = (IEnumerable)cellType.GetMethod("DenseValues").Invoke(cellValue, null);
            var joined = string.Join(",", denseValues.Cast<object>().Select(item => Convert.ToString(item)));
            rowCells.Add((IHtmlContent)td(joined));
        }

        tableRows.Add(rowCells);
    }

    var htmlTable = table(
        thead(headerCells),
        tbody(tableRows.Select(cells => tr(cells))));

    output.Write(htmlTable);
}, "text/html");

# Load datasets into IDataView and display the schema 

In [6]:
display(h1("Code for loading the data into IDataViews: training dataset and test dataset"));

// One MLContext is created here and reused by the later cells.
// The fixed seed is presumably there to make runs repeatable - confirm if that matters.
MLContext mlContext = new MLContext(seed: 0);

// Relative paths: the CSV files are expected in the notebook's working directory.
string TrainDataPath = "./taxi-fare-train.csv";
string TestDataPath = "./taxi-fare-test.csv";

// Both files are comma-separated and start with a header row.
// Columns are bound to TaxiTrip fields through their [LoadColumn] indices.
IDataView trainDataView = mlContext.Data.LoadFromTextFile<TaxiTrip>(TrainDataPath, hasHeader: true, separatorChar: ',');
IDataView testDataView = mlContext.Data.LoadFromTextFile<TaxiTrip>(TestDataPath, hasHeader: true, separatorChar: ',');

// Show the column names and types of the loaded training data.
display(h4("Schema of training DataView:"));
display(trainDataView.Schema);


index,Name,Index,IsHidden,Type,Annotations
0,VendorId,0,False,{ Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory<System.Char> },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
1,RateCode,1,False,{ Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory<System.Char> },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
2,PassengerCount,2,False,{ Microsoft.ML.Data.NumberDataViewType: RawType: System.Single },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
3,TripTime,3,False,{ Microsoft.ML.Data.NumberDataViewType: RawType: System.Single },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
4,TripDistance,4,False,{ Microsoft.ML.Data.NumberDataViewType: RawType: System.Single },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
5,PaymentType,5,False,{ Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory<System.Char> },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
6,FareAmount,6,False,{ Microsoft.ML.Data.NumberDataViewType: RawType: System.Single },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }


## Show a few rows of loaded data 

In [8]:
// Caption followed by the first 10 training rows materialized as TaxiTrip objects.
// reuseRowObject: false asks for a distinct object per row, which is required here
// because the rows are collected into a list.
display("DataView: Showing 10 rows with the columns");
display(
    mlContext.Data
        .CreateEnumerable<TaxiTrip>(trainDataView, reuseRowObject: false)
        .Take(10)
        .ToList());

DataView: Showing 10 rows with the columns

index,VendorId,RateCode,PassengerCount,TripTime,TripDistance,PaymentType,FareAmount
0,CMT,1,1,1271,3.8,CRD,17.5
1,CMT,1,1,474,1.5,CRD,8.0
2,CMT,1,1,637,1.4,CRD,8.5
3,CMT,1,1,181,0.6,CSH,4.5
4,CMT,1,1,661,1.1,CRD,8.5
5,CMT,1,1,935,9.6,CSH,27.5
6,CMT,1,1,869,2.3,CRD,11.5
7,CMT,1,1,454,1.4,CRD,7.5
8,CMT,1,1,366,1.5,CSH,7.5
9,CMT,1,1,252,0.6,CSH,5.0


# Data transformations pipeline for ML.NET model

In [27]:
display(h1("Apply Data Transformations pipeline"));

// Feature-engineering pipeline:
//  1. One-hot encode the three text columns into new "*Encoded" columns.
//  2. Normalize the three numeric columns; only an output name is given, so each
//     column is replaced by its normalized values under the same name.
//  3. Concatenate the encoded and normalized columns into the single "Features" vector.
// FareAmount is left out of "Features"; the training cell uses it as the label.
var dataProcessPipeline = mlContext.Transforms.Categorical.OneHotEncoding(outputColumnName: "VendorIdEncoded", inputColumnName: nameof(TaxiTrip.VendorId))
                  .Append(mlContext.Transforms.Categorical.OneHotEncoding(outputColumnName: "RateCodeEncoded", inputColumnName: nameof(TaxiTrip.RateCode)))
                  .Append(mlContext.Transforms.Categorical.OneHotEncoding(outputColumnName: "PaymentTypeEncoded",inputColumnName: nameof(TaxiTrip.PaymentType)))
                  .Append(mlContext.Transforms.NormalizeMeanVariance(outputColumnName: nameof(TaxiTrip.PassengerCount)))
                  .Append(mlContext.Transforms.NormalizeMeanVariance(outputColumnName: nameof(TaxiTrip.TripTime)))
                  .Append(mlContext.Transforms.NormalizeMeanVariance(outputColumnName: nameof(TaxiTrip.TripDistance)))
                  .Append(mlContext.Transforms.Concatenate("Features", "VendorIdEncoded", "RateCodeEncoded", "PaymentTypeEncoded", 
                                                           nameof(TaxiTrip.PassengerCount), nameof(TaxiTrip.TripTime), nameof(TaxiTrip.TripDistance)));

display(h3("Show transformed data..."));

// Fit the transforms on the training data purely to inspect their output.
// The last line is a bare expression (no semicolon) so the notebook renders the preview.
var transformation = dataProcessPipeline.Fit(trainDataView).Transform(trainDataView);
transformation.Preview()


index,VendorId,RateCode,PassengerCount,TripTime,TripDistance,PaymentType,FareAmount,VendorIdEncoded,RateCodeEncoded,PaymentTypeEncoded,Features
0,CMT,1,0.42661828,1.514717,0.87563246,CRD,17.5,10,100000,10000,"1,0,1,0,0,0,0,0,1,0,0,0,0,0.42661828,1.514717,0.87563246"
1,CMT,1,0.42661828,0.5648905,0.3456444,CRD,8.0,10,100000,10000,"1,0,1,0,0,0,0,0,1,0,0,0,0,0.42661828,0.5648905,0.3456444"
2,CMT,1,0.42661828,0.7591461,0.32260144,CRD,8.5,10,100000,10000,"1,0,1,0,0,0,0,0,1,0,0,0,0,0.42661828,0.7591461,0.32260144"
3,CMT,1,0.42661828,0.21570714,0.13825777,CSH,4.5,10,100000,1000,"1,0,1,0,0,0,0,0,0,1,0,0,0,0.42661828,0.21570714,0.13825777"
4,CMT,1,0.42661828,0.78774816,0.25347257,CRD,8.5,10,100000,10000,"1,0,1,0,0,0,0,0,1,0,0,0,0,0.42661828,0.78774816,0.25347257"
5,CMT,1,0.42661828,1.1142882,2.2121243,CSH,27.5,10,100000,1000,"1,0,1,0,0,0,0,0,0,1,0,0,0,0.42661828,1.1142882,2.2121243"
6,CMT,1,0.42661828,1.0356326,0.52998805,CRD,11.5,10,100000,10000,"1,0,1,0,0,0,0,0,1,0,0,0,0,0.42661828,1.0356326,0.52998805"
7,CMT,1,0.42661828,0.5410555,0.32260144,CRD,7.5,10,100000,10000,"1,0,1,0,0,0,0,0,1,0,0,0,0,0.42661828,0.5410555,0.32260144"
8,CMT,1,0.42661828,0.43618128,0.3456444,CSH,7.5,10,100000,1000,"1,0,1,0,0,0,0,0,0,1,0,0,0,0.42661828,0.43618128,0.3456444"
9,CMT,1,0.42661828,0.30032155,0.13825777,CSH,5.0,10,100000,1000,"1,0,1,0,0,0,0,0,0,1,0,0,0,0.42661828,0.30032155,0.13825777"


# Append the trainer/algorithm to pipeline and train the model

In [41]:
%%time
display(h1("Build Training Pipeline and Train the model"));
display(h4("Creating the Training Pipeline with trainer/algorithm"));

// STEP 3: Set the training algorithm - Selected Trainer (OLS, ordinary least squares regression).
// The label is the "FareAmount" column; the input is the "Features" vector built by dataProcessPipeline.
var trainer = mlContext.Regression.Trainers.Ols(labelColumnName: "FareAmount", featureColumnName: "Features");
var trainingPipeline = dataProcessPipeline.Append(trainer);

// STEP 4: Train the model fitting to the DataSet
// Fit is given the raw training data view; the data transformations are part of
// trainingPipeline, so they are fitted here together with the trainer.
display("=============== Training the model ===============");
var trainedModel = trainingPipeline.Fit(trainDataView);



Wall time: 836.6623000000001ms

## Make predictions in bulk from the TestDataset to be used for the metrics

In [42]:
// Make predictions to plot against actual values
display(h3("===== Making predictions in bulk for the whole Test Dataset ====="));
// Make predictions in bulk (Transformed IDataView will have the predictions plus the actual/true values)
IDataView predictionsDataView = trainedModel.Transform(testDataView);
// Bare expression (no semicolon) so the notebook renders a preview of the scored rows.
predictionsDataView.Preview()

index,VendorId,RateCode,PassengerCount,TripTime,TripDistance,PaymentType,FareAmount,VendorIdEncoded,RateCodeEncoded,PaymentTypeEncoded,Features,Score
0,VTS,1,0.42661828,1.3585974,0.864111,CRD,15.5,1,100000,10000,"0,1,1,0,0,0,0,0,1,0,0,0,0,0.42661828,1.3585974,0.864111",15.772648
1,VTS,1,0.42661828,0.57204103,0.6267685,CRD,10.0,1,100000,10000,"0,1,1,0,0,0,0,0,1,0,0,0,0,0.42661828,0.57204103,0.6267685",10.052195
2,VTS,1,0.42661828,2.0021436,1.797351,CSH,26.5,1,100000,1000,"0,1,1,0,0,0,0,0,0,1,0,0,0,0.42661828,2.0021436,1.797351",25.492914
3,VTS,1,0.42661828,0.7150513,1.0899321,CSH,14.5,1,100000,1000,"0,1,1,0,0,0,0,0,0,1,0,0,0,0.42661828,0.7150513,1.0899321",13.878612
4,VTS,1,0.42661828,0.7150513,0.50233656,CRD,9.5,1,100000,10000,"0,1,1,0,0,0,0,0,1,0,0,0,0,0.42661828,0.7150513,0.50233656",9.908529
5,VTS,1,0.42661828,1.5016077,2.3803377,CSH,29.5,1,100000,1000,"0,1,1,0,0,0,0,0,0,1,0,0,0,0.42661828,1.5016077,2.3803377",27.037437
6,VTS,1,0.42661828,0.7150513,0.4631635,CSH,9.0,1,100000,1000,"0,1,1,0,0,0,0,0,0,1,0,0,0,0.42661828,0.7150513,0.4631635",9.451395
7,VTS,1,0.42661828,0.57204103,0.3456444,CRD,7.5,1,100000,10000,"0,1,1,0,0,0,0,0,1,0,0,0,0,0.42661828,0.57204103,0.3456444",8.066458
8,VTS,1,0.42661828,0.7865564,0.5737697,CSH,10.5,1,100000,1000,"0,1,1,0,0,0,0,0,0,1,0,0,0,0.42661828,0.7865564,0.5737697",10.600302
9,VTS,1,0.42661828,0.42903078,0.26038545,CRD,6.0,1,100000,10000,"0,1,1,0,0,0,0,0,1,0,0,0,0,0.42661828,0.42903078,0.26038545",6.728958


## Display the metrics (Model quality evaluation)

In [31]:
display(h3("===== Evaluating Model's accuracy with Test dataset ====="));

// Compare the predicted "Score" column against the true "FareAmount" column of the
// scored test data and display the resulting regression metrics.
var metrics = mlContext.Regression.Evaluate(predictionsDataView, labelColumnName: "FareAmount", scoreColumnName: "Score");
display(metrics);

MeanAbsoluteError,MeanSquaredError,RootMeanSquaredError,LossFunction,RSquared
0.7499811852455139,35.36469950920185,5.946822639796974,35.36470081938095,0.698016117105618


## Bar chart showing 'Actual fares vs. Predicted fares Comparison' 

In [44]:
// Builds a grouped bar chart comparing actual and predicted fares for the first
// rows of the scored test data.

// Maximum number of rows to use for the bar chart
int totalNumberForBarChart = 20;

// Read at most totalNumberForBarChart true fares and predicted fares.
float[] actualFares = predictionsDataView.GetColumn<float>("FareAmount").Take(totalNumberForBarChart).ToArray();
float[] predictionFares = predictionsDataView.GetColumn<float>("Score").Take(totalNumberForBarChart).ToArray();

// X positions follow the number of rows actually read, so x and y keep the same
// length even when the test set holds fewer than totalNumberForBarChart rows.
int[] elements = Enumerable.Range(0, actualFares.Length).ToArray();

// Define group for Actual values
var ActualValuesGroupBarGraph = new Graph.Bar()
{
    x = elements,
    y = actualFares,
    name = "Actual"
};

// Define group for Prediction values
var PredictionValuesGroupBarGraph = new Graph.Bar()
{
    x = elements,
    y = predictionFares,
    name = "Predicted"
};

var chart = Chart.Plot(new[] {ActualValuesGroupBarGraph, PredictionValuesGroupBarGraph});

// "group" places the two series side by side for each case.
var layout = new Layout.Layout(){barmode = "group", title="Actual fares vs. Predicted fares Comparison"};
chart.WithLayout(layout);
chart.WithXTitle("Cases");
chart.WithYTitle("Fare");
chart.WithLegend(true);
chart.Width = 700;
chart.Height = 400;

display(chart);

# Save the ML model as a file

In [35]:
display(h1("Saving the ML.NET Model as a file..."));

// Relative path: the .zip file is written relative to the current working directory.
string modelFilePath = "./MLRegressionModel.zip";

// Save the trained pipeline together with the schema of the training data view.
mlContext.Model.Save(trainedModel, trainDataView.Schema, modelFilePath);

display(h3($"The model was saved to: {modelFilePath}"));
