In [1]:
#r "nuget:bl=true"
#r "nuget:RestoreSources=https://dotnet.myget.org/F/dotnet-corefxlab/api/v3/index.json"
#r "nuget:Microsoft.ML,version=1.4.0-preview"
#r "nuget:Microsoft.ML.AutoML"
#r "nuget:Microsoft.Data.DataFrame,version=0.1.1-e191008-1"
    
open Microsoft.Data
open XPlot.Plotly
open Microsoft.AspNetCore.Html
open System.IO

yielding source C:\Users\kevinr\AppData\Local\Temp\nuget\8956\Project.fsproj.fsx


In [2]:
/// Writes an HTML table preview of `df` to `writer`.
///
/// The table has a header row (an "index" label followed by one cell per
/// column) and at most 20 body rows, each starting with its row number.
let register (df:DataFrame) (writer:TextWriter) =
    // Upper bound on the number of rows rendered, clamped to the frame's size.
    let maxRows = 20
    let shownRows = Math.Min(maxRows, int(df.RowCount))

    // Header cells: the "index" label first, then every column.
    let headerCells = new ResizeArray<IHtmlContent> ()
    headerCells.Add(th.innerHTML(i.innerHTML("index")))
    headerCells.AddRange(df.Columns.Select(fun col -> (th.innerHTML(col) :> IHtmlContent)))

    // Body rows: the row number followed by each value of that row.
    let bodyRows = ResizeArray<ResizeArray<IHtmlContent>>()
    for rowIndex in 0 .. shownRows - 1 do
        let rowCells = ResizeArray<IHtmlContent>()
        rowCells.Add(td.innerHTML(rowIndex))
        for value in df.[int64(rowIndex)] do
            rowCells.Add(td.innerHTML(value))
        bodyRows.Add(rowCells)

    // Assemble <thead> and <tbody> and emit the finished table.
    let headSection = thead.innerHTML(headerCells)
    let bodySection = tbody.innerHTML(bodyRows.Select(fun cells -> tr.innerHTML(cells)))
    let t = table.innerHTML([| headSection; bodySection |])

    writer.Write(t)

// Use `register` as the "text/html" formatter for DataFrame values.
Formatter<DataFrame>.Register((fun frame output -> register frame output), mimeType = "text/html")

In [3]:
open System.Net.Http
// Local file the housing CSV is cached in.
let housingPath = "housing.csv"
// Download the dataset only when no cached copy exists yet.
if not(File.Exists(housingPath)) then
    // `use` disposes the client as soon as the download is done.
    use client = new HttpClient()
    let contents = client.GetStringAsync("https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv").Result
    // Write to the same path that was checked above, so the two cannot drift apart.
    File.WriteAllText(housingPath, contents)

In [4]:
// Load the CSV at `housingPath` into a DataFrame.
let housingData = DataFrame.ReadCsv(housingPath)
// Trailing expression: the notebook shows it as this cell's output.
housingData

index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280,565,259,3.8462,342200,NEAR BAY
5,-122.25,37.85,52,919,213,413,193,4.0368,269700,NEAR BAY
6,-122.25,37.84,52,2535,489,1094,514,3.6591,299200,NEAR BAY
7,-122.25,37.84,52,3104,687,1157,647,3.12,241400,NEAR BAY
8,-122.26,37.84,42,2555,665,1206,595,2.0804,226700,NEAR BAY
9,-122.25,37.84,52,3549,707,1551,714,3.6912,261100,NEAR BAY


In [5]:
// Summary of the frame; the output below lists Length, Max, Min and Mean per column.
housingData.Description()

index,Description,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,Length,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
1,Max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0
2,Min,-124.35,32.54,1.0,2.0,0.0,3.0,1.0,0.4999,14999.0
3,Mean,-119.569115,35.631866,28.639486,2635.7588,532.4762,1425.4779,499.53967,3.8706622,206854.97


In [6]:
// Histogram of the "median_house_value" column, using 20 bins on the x axis.
let graph =
    Histogram(
        x = housingData.["median_house_value"],
        nbinsx = 20)

// Trailing expression: the chart is this cell's output.
Chart.Plot(graph)

In [7]:
// Scatter of longitude against latitude, one marker per row.
let graph =
    // Marker colour comes from "median_house_value", mapped through the "Jet" colour scale.
    let valueMarker =
        Graph.Marker(
            color = housingData.["median_house_value"],
            colorscale = "Jet")
    Graph.Scattergl(
        x = housingData.["longitude"],
        y = housingData.["latitude"],
        mode = "markers",
        marker = valueMarker)

// Render the trace as a 600 x 600 chart.
let plot = Chart.Plot(graph)
plot.Width <- 600
plot.Height <- 600
display(plot)

In [8]:
/// Shuffles `arr` in place and returns that same array instance.
///
/// Each position, from first to last, is swapped with a randomly chosen
/// position at or after it.
let Shuffle (arr:int[]) =
    let rng = Random()
    let count = arr.Length
    let mutable pos = 0
    while pos < count do
        // Pick a partner in [pos, count) and swap the two elements.
        let pick = pos + rng.Next(count - pos)
        let held = arr.[pick]
        arr.[pick] <- arr.[pos]
        arr.[pos] <- held
        pos <- pos + 1
    arr

// Shuffled indices of every row. Enumerable.Range takes (start, count), so the
// count must be RowCount itself; RowCount - 1 would leave out the last row.
let randomIndices = Shuffle(Enumerable.Range(0, int housingData.RowCount).ToArray())

// Hold out 10% of the rows for testing.
let testSize = int (float (housingData.RowCount) * 0.1)

// The first `testSize` shuffled indices form the test set, the rest the training set.
// F# slices include both bounds, so `.[..testSize]` and `.[testSize..]` would share
// index `testSize`; Array.splitAt yields two disjoint parts.
let testRows, trainRows = Array.splitAt testSize randomIndices

let housing_train = housingData.[trainRows]
let housing_test = housingData.[testRows]

// Show the size of each partition.
display(housing_train.RowCount)
display(housing_test.RowCount)

In [9]:
%%time

open Microsoft.ML
open Microsoft.ML.Data
open Microsoft.ML.AutoML

// ML.NET context used to create the AutoML experiment below.
let mlContext = MLContext()

// Regression experiment limited to 15 seconds of experiment time.
let experiment = mlContext.Auto().CreateRegressionExperiment(maxExperimentTimeInSeconds = 15u)
// Run the experiment on the training rows with "median_house_value" as the label column.
let result = experiment.Execute(housing_train, labelColumnName = "median_house_value")

Wall time: 15346.6579ms

In [10]:
// Alias for the group sequence passed to the GroupBy result selector below.
type RunDetails = System.Collections.Generic.IEnumerable<RunDetail<RegressionMetrics>>

// One scatter trace per trainer name: runtime in seconds on x, mean absolute
// error on y. Runs whose ValidationMetrics is null are left out.
let scatters =
    result.RunDetails
        .Where(fun run -> not (isNull run.ValidationMetrics))
        .GroupBy(
            (fun run -> run.TrainerName),
            (fun (trainer:string) (runs:RunDetails) ->
                Graph.Scattergl(
                    name = trainer,
                    x = runs.Select(fun run -> run.RuntimeInSeconds),
                    y = runs.Select(fun run -> run.ValidationMetrics.MeanAbsoluteError),
                    mode = "markers",
                    marker = Graph.Marker(size = 12))))

// Plot all traces together with axis titles.
let chart = Chart.Plot(scatters)
chart.WithXTitle("Training Time")
chart.WithYTitle("Error")
display(chart)

// Report which trainer produced the best run.
Console.WriteLine("Best Trainer:{0}", result.BestRun.TrainerName)

Best Trainer:FastTreeTweedieRegression


In [11]:
// Apply the best run's model to the held-out rows.
let testResults = result.BestRun.Model.Transform(housing_test)

// Label column and the model's "Score" column, read from the transformed data.
let trueValues = testResults.GetColumn<float32>("median_house_value")
let predictedValues = testResults.GetColumn<float32>("Score")

// One marker per test row: true value on x, predicted value on y.
let predictedVsTrue =
    Graph.Scattergl(
        x = trueValues,
        y = predictedValues,
        mode = "markers")

// Largest value on either axis; used as the end point of the reference line.
let maximumValue = Math.Max(trueValues.Max(), predictedValues.Max())

// Diagonal from the origin: points on this line have predicted = true.
let perfectLine =
    Graph.Scattergl(
        x = [| 0.0f; maximumValue |],
        y = [| 0.0f; maximumValue |],
        mode = "lines")

let chart = Chart.Plot([| predictedVsTrue; perfectLine |])
chart.WithXTitle("True Values")
chart.WithYTitle("Predicted Values")
chart.WithLegend(false)
// `<-` assigns the property. `=` is an equality test in F#, so the previous
// `chart.Width = 600` / `chart.Height = 600` never set the size.
chart.Width <- 600
chart.Height <- 600
display(chart)