[View this notebook on GitHub](https://github.com/dotnet/interactive/tree/master/samples/notebooks/fsharp/Samples)

# Machine Learning over House Prices with ML.NET

### Reference the packages

In [None]:
#i "nuget:https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet5/nuget/v3/index.json" 
#i "nuget:https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-tools/nuget/v3/index.json" 

#r "nuget:Microsoft.ML,1.4.0"
#r "nuget:Microsoft.ML.AutoML,0.16.0"
#r "nuget:Microsoft.Data.Analysis,0.2.0"
#r "nuget: XPlot.Plotly.Interactive, 3.0.4"
    
open Microsoft.Data.Analysis
open XPlot.Plotly

### Adding better default formatting for data frames

Register a formatter for data frames and data frame rows.


In [None]:
/// Registers "text/html" formatters so that data frames and data frame rows
/// are written as HTML tables.
// NOTE(review): the module name looks like a typo for "DataFrameFormatter";
// it is kept unchanged so any existing references keep working.
module DateFrameFormatter = 
    
    // Locally open the F# HTML DSL.
    open Html

    /// Maximum number of data rows rendered for a data frame.
    let maxRows = 20

    // Formatter for whole data frames: a header row with the column names,
    // followed by at most `maxRows` data rows prefixed with their index.
    Formatter.Register<DataFrame>((fun (context: FormatContext) (df: DataFrame) (writer: TextWriter) ->

        // Don't generate nested tables
        if context.ContentThreshold < 1.0 then false else

        // Ask other formatters to reduce information generation
        context.ReduceContent(0.2) |> ignore

        // Number of rows actually rendered. The loop below is inclusive on both
        // ends, so it must stop at `rowCount - 1`; stopping at `maxRows` would
        // emit one row too many. An empty frame gives `0 .. -1`, i.e. no rows.
        let rowCount = min maxRows (int df.Rows.Count)

        table [] [
          thead [] [
            th [] [ str "Index" ]
            for c in df.Columns do
              th [] [ str c.Name]
          ]
          tbody [] [
            for i in 0 .. rowCount - 1 do
              tr [] [
                td [] [ embed context i ]
                for o in df.Rows.[int64 i] do
                  td [] [ embed context o ]
              ]
          ]
        ]
        |> writer.Write

        // Signal that this formatter handled the value.
        true
    ), mimeType = "text/html")
    
    // Formatter for a single data frame row: a one-row table with one cell per value.
    Formatter.Register<DataFrameRow>((fun (context: FormatContext) (row: DataFrameRow) (writer: TextWriter) ->

        // Don't generate nested tables
        if context.ContentThreshold < 1.0 then false else

        // Ask other formatters to reduce information generation
        context.ReduceContent(0.2) |> ignore

        table [] [
          tbody [] [
            tr [] [
              for o in row do
                td [] [ embed context o ] 
            ]
          ]
        ]
        |> writer.Write

        // Signal that this formatter handled the value.
        true
    ), mimeType = "text/html")
    

### Download the data

In [None]:
open System.Net.Http

// Local file the housing data set is cached in.
let housingPath = "housing.csv"

// Download the CSV only when no local copy exists yet.
if not (File.Exists housingPath) then
    // `use` disposes the client (and its underlying handler) once the download is done.
    use client = new HttpClient()
    // Blocking on .Result keeps the cell synchronous.
    let contents =
        client.GetStringAsync("https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv").Result
    // Write to the same path that was checked above, so the two cannot drift apart.
    File.WriteAllText(housingPath, contents)

### Load the data into a data frame

In [None]:
// Parse the CSV file at `housingPath` into a DataFrame.
let housingData = DataFrame.LoadCsv(housingPath)
// Trailing expression: the value of the cell (presumably rendered as the cell output).
housingData

In [None]:
// Show the frame's description (presumably per-column summary information — confirm
// against the Microsoft.Data.Analysis documentation).
housingData.Description()

### Display the data

In [None]:
// Histogram of the "median_house_value" column, requesting 20 bins on the x axis.
let graph =
    Histogram(
        x = housingData.["median_house_value"],
        nbinsx = 20)

// Plot the trace; being the last expression, the chart is the value of the cell.
Chart.Plot(graph)

In [None]:
// Scatter plot of latitude against longitude, with every point coloured by the
// "median_house_value" column using the "Jet" colour scale.
let graph =
    let valueMarker =
        Graph.Marker(
            color = housingData.["median_house_value"],
            colorscale = "Jet")
    Graph.Scattergl(
        x = housingData.["longitude"],
        y = housingData.["latitude"],
        mode = "markers",
        marker = valueMarker)

// Render as a 600 x 600 chart.
let plot = Chart.Plot(graph)
plot.Width <- 600
plot.Height <- 600
display(plot)

### Prepare the training and validation sets

In [None]:
module Array = 
    /// Returns a new array holding the elements of `arr` in random order.
    /// The input array is copied first and is never modified.
    let shuffle (arr: 'T[]) =
        let rng = Random()
        let shuffled = Array.copy arr
        let count = shuffled.Length
        let mutable pos = 0
        // Fisher-Yates: for each position, swap in a random element taken from
        // the not-yet-fixed tail (which includes the position itself).
        while pos < count do
            let pick = pos + rng.Next(count - pos)
            let held = shuffled.[pos]
            shuffled.[pos] <- shuffled.[pick]
            shuffled.[pick] <- held
            pos <- pos + 1
        shuffled

// All row indices of the data frame, in random order.
let randomIndices = [| 0 .. int housingData.Rows.Count - 1 |] |> Array.shuffle

// Hold out 10% of the rows (rounded down) for testing.
let testSize = int (float (housingData.Rows.Count) * 0.1)
// The first `testSize` shuffled indices form the test set; the remainder is used for training.
let trainRows = randomIndices.[testSize..]
let testRows = randomIndices.[..testSize - 1]

// Select the corresponding rows from the data frame by index.
let housing_train = housingData.[trainRows]
let housing_test = housingData.[testRows]

// Show the size of each split.
display(housing_train.Rows.Count)
display(housing_test.Rows.Count)

### Create the regression model and train it

In [None]:
#!time

open Microsoft.ML
open Microsoft.ML.Data
open Microsoft.ML.AutoML

let mlContext = MLContext()

// AutoML regression experiment, capped at 15 seconds of experiment time.
let experiment = mlContext.Auto().CreateRegressionExperiment(maxExperimentTimeInSeconds = 15u)
// Run the experiment on the training rows, with "median_house_value" as the label column.
let result = experiment.Execute(housing_train, labelColumnName = "median_house_value")

### Display the training results

In [None]:
// One scatter trace per trainer: run time on the x axis and mean absolute
// validation error on the y axis. Runs without validation metrics are left out.
let scatters =
    let scoredRuns =
        result.RunDetails
        |> Seq.filter (fun run -> not (isNull run.ValidationMetrics))
    seq {
        for (trainer, runs) in scoredRuns |> Seq.groupBy (fun run -> run.TrainerName) do
            yield
                Graph.Scattergl(
                    name = trainer,
                    x = (runs |> Seq.map (fun run -> run.RuntimeInSeconds)),
                    y = (runs |> Seq.map (fun run -> run.ValidationMetrics.MeanAbsoluteError)),
                    mode = "markers",
                    marker = Graph.Marker(size = 12))
    }

let chart = Chart.Plot(scatters)
chart.WithXTitle("Training Time")
chart.WithYTitle("Error")
display(chart)

// Report which trainer produced the best run.
Console.WriteLine("Best Trainer:{0}", result.BestRun.TrainerName)

### Validate and display the results 

In [None]:
// Apply the model of the best run to the held-out test rows.
let testResults = result.BestRun.Model.Transform(housing_test)

// Actual label values and the model's predictions (read from the "Score" column).
let trueValues = testResults.GetColumn<float32>("median_house_value")
let predictedValues = testResults.GetColumn<float32>("Score")

let predictedVsTrue =
    Graph.Scattergl(
        x = trueValues,
        y = predictedValues,
        mode = "markers")

// Largest value on either axis; used as the end point of the reference line.
let maximumValue = Math.Max(Seq.max trueValues, Seq.max predictedValues)

// Diagonal y = x: points on this line are perfect predictions.
let perfectLine =
    Graph.Scattergl(
        x = [| 0.0f; maximumValue |],
        y = [| 0.0f; maximumValue |],
        mode = "lines")

let chart = Chart.Plot([| predictedVsTrue; perfectLine |])
chart.WithXTitle("True Values")
chart.WithYTitle("Predicted Values")
chart.WithLegend(false)
// Property assignment needs `<-`; `=` would only compare the current value
// with 600 and discard the result, leaving the chart size unchanged.
chart.Width <- 600
chart.Height <- 600
display(chart)