# Regression with Taxi Dataset

This notebook demonstrates how to:

1. Define the model input and output schema
1. Load in data from a text file to an IDataView
1. Set up the training pipeline with data transforms
1. Choose an algorithm and append it to the pipeline
1. Train the model
1. Evaluate the model
1. Consume the model

## Install the NuGet packages needed to train an ML.NET model and plot the results

In [1]:
// using nightly-build
#i "nuget:https://pkgs.dev.azure.com/dnceng/public/_packaging/MachineLearning/nuget/v3/index.json"
#r "nuget: Plotly.NET.Interactive, 3.0.2"
#r "nuget: Plotly.NET.CSharp, 0.0.1"
#r "nuget: Microsoft.ML.AutoML, 0.20.0-preview.22356.1"
#r "nuget: Microsoft.Data.Analysis, 0.20.0-preview.22356.1"

Loading extensions from `Plotly.NET.Interactive.dll`

Loading extensions from `Microsoft.ML.AutoML.Interactive.dll`

Loading extensions from `Microsoft.Data.Analysis.Interactive.dll`

In [1]:

// Import common usings.
using static Microsoft.DotNet.Interactive.Formatting.PocketViewTags;
using static Microsoft.ML.Transforms.OneHotEncodingEstimator;
using Microsoft.Data.Analysis;
using System;
using System.IO;
using Microsoft.ML;
using Microsoft.ML.AutoML;
using Microsoft.ML.Data;

#### Download or Locate Data
The following code looks for the data file in a few known local locations and, if it cannot be found there, downloads it from the known GitHub location.

In [1]:
using System;
using System.IO;
using System.Net;

// Returns a local path to the requested data file.
// Search order: <current directory>/data/<fileName> (repo checkout), then
// <current directory>/<fileName> (previously downloaded). If neither exists, the file is
// downloaded from the csharp-notebooks GitHub repository into the current directory.
string EnsureDataSetDownloaded(string fileName)
{
	var workingDirectory = Directory.GetCurrentDirectory();

	// Location when the repo has been checked out.
	var checkedOutPath = Path.Combine(workingDirectory, "data", fileName);
	// Location when the file has already been downloaded (also the download target).
	var downloadedPath = Path.Combine(workingDirectory, fileName);

	foreach (var candidate in new[] { checkedOutPath, downloadedPath })
	{
		if (File.Exists(candidate))
		{
			Console.WriteLine($"{fileName} found here: {candidate}");
			return candidate;
		}
	}

	// Not available locally: fetch it next to the notebook.
	using (var client = new WebClient())
	{
		client.DownloadFile($"https://raw.githubusercontent.com/dotnet/csharp-notebooks/main/machine-learning/data/{fileName}", downloadedPath);
	}
	Console.WriteLine($"Downloaded {fileName}  to : {downloadedPath}");

	return downloadedPath;
}

In [1]:
// Locate (or download) taxi-fare.csv and load it into a DataFrame.
// df, mlContext, trainTestSplit, validateTestSplit and result are reused by later cells.
var trainDataPath = EnsureDataSetDownloaded("taxi-fare.csv");
var df = DataFrame.LoadCsv(trainDataPath);
var mlContext = new MLContext();

// Build the pipeline: an AutoML featurizer over df that excludes the "fare_amount" column,
// with an AutoML regression trainer appended that uses "fare_amount" as its label.
var pipeline = mlContext.Auto().Featurizer(df, excludeColumns: new[]{"fare_amount"})
                 .Append(mlContext.Auto().Regression(labelColumnName: "fare_amount"));

// Split the data twice: df is split with a fraction of 0.2, and the resulting TestSet is
// split again with 0.5. trainTestSplit.TrainSet is used for training and
// validateTestSplit.TrainSet for validation (see SetDataset below);
// validateTestSplit.TestSet is not used here and is consumed by the evaluation cell.
var trainTestSplit = mlContext.Data.TrainTestSplit(df, 0.2);
var validateTestSplit = mlContext.Data.TrainTestSplit(trainTestSplit.TestSet, 0.5);
// Monitor object that is attached to the experiment and displayed in the notebook.
var monitor = new NotebookMonitor();

 var experiment = mlContext.Auto().CreateExperiment()
                    .SetPipeline(pipeline)
                    .SetTrainingTimeInSeconds(50)
                    .SetDataset(trainTestSplit.TrainSet, validateTestSplit.TrainSet)
                    .SetEvaluateMetric(RegressionMetric.RSquared, "fare_amount", "Score")
					.SetMonitor(monitor);

					// Display the monitor in the cell output and hand the display handle back to it.
monitor.SetUpdate(monitor.Display());

// Run the AutoML experiment; result.Model is what the later cells use.
var result = await experiment.RunAsync();

: (3,10): error CS0103: The name 'DataFrame' does not exist in the current context
(4,21): error CS0246: The type or namespace name 'MLContext' could not be found (are you missing a using directive or an assembly reference?)
(13,19): error CS0246: The type or namespace name 'NotebookMonitor' could not be found (are you missing a using directive or an assembly reference?)
(19,40): error CS0103: The name 'RegressionMetric' does not exist in the current context

## Consume the model

In [1]:
// Build a one-row DataFrame with the same feature columns the model was trained on
// (every taxi-fare column except the "fare_amount" label).
var data = new DataFrame(
	new StringDataFrameColumn("vendor_id"),
	new PrimitiveDataFrameColumn<float>("rate_code"),
	new PrimitiveDataFrameColumn<float>("passenger_count"),
	new PrimitiveDataFrameColumn<float>("trip_time_in_secs"),
	new PrimitiveDataFrameColumn<float>("trip_distance"),
	new StringDataFrameColumn("payment_type"));

// Append the sample row; the trailing `true` is passed as the second argument of Append
// (the in-place flag in Microsoft.Data.Analysis), so `data` itself receives the row.
data.Append(new List<KeyValuePair<string, object>>()
{
	new KeyValuePair<string, object>("vendor_id", @"CMT"),
	new KeyValuePair<string, object>("rate_code", 1F),
	new KeyValuePair<string, object>("passenger_count", 1F),
	new KeyValuePair<string, object>("trip_time_in_secs", 474F),
	new KeyValuePair<string, object>("trip_distance", 1.5F),
	// Use a real payment_type category ("CRD" = credit card in the taxi-fare data).
	// Passing the column name itself as the value would give the model a category it
	// never saw during training.
	new KeyValuePair<string, object>("payment_type", @"CRD")
}, true);

// Best model found by the AutoML experiment.
var model = result.Model;

// Use the model to transform the sample data.
var output = model.Transform(data);

// Get the predicted score for the sample row; the bare expression displays it in the notebook.
var predictedScore = output.GetColumn<float>("Score");
predictedScore
			
			

index,value
0,8.47422


## Evaluate model

In [1]:
// Score the held-out split (validateTestSplit.TestSet, which was not passed to the
// experiment's SetDataset call) with the trained model and compute regression metrics
// against the "fare_amount" label. The trailing bare expression displays the metrics.
var model = result.Model;
var eval= model.Transform(validateTestSplit.TestSet);
var metric=mlContext.Regression.Evaluate(eval,"fare_amount");
metric	

MeanAbsoluteError,MeanSquaredError,RootMeanSquaredError,LossFunction,RSquared
0.9971379424072448,7.954096814346052,2.8203008375607825,7.954096719333472,0.913846968172443




The code below demonstrates several ways to explain your model, including how to get and display:

1. A Histogram of the distribution of number of instances
1. A Scatter Plot
1. Compare actual values to predicted values in a scatter plot
1. The importance of different features

In [1]:
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.IO;
using System.Linq;
using Plotly.NET.CSharp;
using Plotly.NET;

## Compare Distribution of Number of Instances

In [1]:
// Extract some data into arrays for plotting.

int numberOfRows = 1000;

// Take the first numberOfRows values of the "fare_amount" column from the loaded dataset (df).
float[] fare_amount = df.GetColumn<float>("fare_amount").Take(numberOfRows).ToArray();

// Histogram of the sampled fare_amount values: X axis is the fare, Y axis the number of instances.
// The chart expression is left without a semicolon so the notebook renders it as the cell output.

Plotly.NET.Chart2D.Chart.Histogram<float,float, string>(fare_amount)
.WithXAxisStyle<float, float, string>(TitleText: "fare_amount")
.WithYAxisStyle<float, float, string>(TitleText: "Number of Instances")

## Compare actual values to predicted values in a scatter plot

In [1]:
// Number of rows to display in charts.
int numberOfRows = 1000;

// Use the model to make batch predictions on the loaded dataset (df).
var testResults = model.Transform(df);

// Materialize both columns once with ToArray(). GetColumn(...).Take(...) is a deferred
// sequence, and each column is consumed twice below (scatter trace and Max()), so leaving
// it deferred would re-read the scored data for every consumer.

// Actual values from the dataset.
var trueValues = testResults.GetColumn<float>("fare_amount").Take(numberOfRows).ToArray();

// Predicted values from the model output.
var predictedValues = testResults.GetColumn<float>("Score").Take(numberOfRows).ToArray();

// One marker per row: actual value on X, predicted value on Y.
var scatter = Plotly.NET.CSharp.Chart.Scatter<float, float, string>(x: trueValues, y: predictedValues, mode: Plotly.NET.StyleParam.Mode.Markers)
.WithTraceInfo("True Values, Predicted Values", ShowLegend: true);

// Diagonal from the origin to the largest observed value: points on this line are
// rows where the prediction equals the actual value.
var maximumValue = Math.Max(trueValues.Max(), predictedValues.Max());
var perfectLine = Plotly.NET.CSharp.Chart.Scatter<float, float, string>(x: new[] {0, maximumValue}, y: new[] {0, maximumValue}, mode: Plotly.NET.StyleParam.Mode.Lines)
.WithTraceInfo("Perfect Line", ShowLegend: true);

// Overlay both traces; the bare expression renders the combined chart as the cell output.
Plotly.NET.CSharp.Chart.Combine(new []{scatter, perfectLine})
.WithXAxisStyle<double, double, string>(TitleText: "X", ShowGrid: false)

## Calculate and graph the Permutation Feature Importance (PFI)

In [1]:
// Calculate Permutation Feature Importance (PFI) for the trained model.
// The dataset (df) is first run through the model, and the transformed rows are handed
// to the regression catalog together with the model and the "fare_amount" label.
var preprocessedTrainData = model.Transform(df);

ImmutableDictionary<string, RegressionMetricsStatistics> permutationFeatureImportance =
    mlContext.Regression.PermutationFeatureImportance(
        model,
        preprocessedTrainData,
        labelColumnName: "fare_amount",
        useFeatureWeightFilter: false,
        numberOfExamplesToUse: null,
        permutationCount: 1);

// Keep only the R-squared statistics per feature, ordered by descending absolute mean.
var featureImportanceMetrics = permutationFeatureImportance
    .Select(entry => new { entry.Key, entry.Value.RSquared })
    .OrderByDescending(ranked => Math.Abs(ranked.RSquared.Mean));

// Parallel lists (same order): feature name and its absolute R-squared mean.
// Both are reused by the charting cells that follow.
var featureNames = featureImportanceMetrics
    .Select(ranked => ranked.Key)
    .ToList();
var featurePFI = featureImportanceMetrics
    .Select(ranked => Math.Abs(ranked.RSquared.Mean))
    .ToList();

// Tabular view of the ranking; the bare expression displays it as the cell output.
var featureImportance = new DataFrame(
    new StringDataFrameColumn("Feature", featureNames.ToArray()),
    new DoubleDataFrameColumn("R-Squared Impact", featurePFI.ToArray()));

featureImportance

index,Feature,R-Squared Impact
0,trip_distance,0.6755704315755394
1,rate_code,0.4330715253461801
2,trip_time_in_secs,0.3044464463070636
3,vendor_id.Bit0,0.0
4,payment_type.Bit3,0.0
5,vendor_id.Bit1,0.0
6,payment_type.Bit1,0.0
7,payment_type.Bit0,0.0
8,vendor_id.Bit2,0.0
9,passenger_count,0.0


In [1]:
// Bar chart of the PFI ranking (featurePFI values keyed by featureNames).
// Chart.Bar draws horizontal bars: the values run along the X axis and the keys (feature
// names) along the Y axis, so the contribution title belongs on X and "Feature" on Y.
Plotly.NET.CSharp.Chart.Bar<double, string, string>(values: featurePFI, Keys: featureNames)
.WithTraceInfo(Name: "Hello from C#", ShowLegend: false)
.WithXAxisStyle<double, double, string>(TitleText: "Contribution (delta R-Squared)")
.WithYAxisStyle<double, double, string>(TitleText: "Feature")

In [1]:
// Plot the top-ranked feature against fare_amount.
// featureNames is ordered by descending absolute R-squared mean in the PFI cell, so
// First() is the highest-ranked feature. numberOfRows comes from an earlier cell.
var topFeatureName = featureNames.First();
float[] fare_amount = df.GetColumn<float>("fare_amount").Take(numberOfRows).ToArray();
// NOTE(review): assumes the top feature name is a float column of df; a name that is not
// a df column (e.g. an encoded slot such as "vendor_id.Bit0") would presumably fail here — confirm.
float[] topFeature = df.GetColumn<float>(topFeatureName).Take(numberOfRows).ToArray();

// One point per row: feature value on X, fare_amount on Y. The bare expression renders the chart.
Plotly.NET.CSharp.Chart.Point<float, float, string>(topFeature, fare_amount, "Hello")
.WithTraceInfo(Name: "Hello from C#", ShowLegend: true)
.WithXAxisStyle<float, float, string>(TitleText: topFeatureName, ShowGrid: false)
.WithYAxisStyle<float, float, string>(TitleText: "fare_amount", ShowGrid: false)