[this doc on github](https://github.com/dotnet/interactive/tree/master/samples/notebooks/fsharp/Samples)

This demonstrates the use of `Microsoft.Data.Analysis` data frames with F#. You can open this example online using [MyBinder](https://mybinder.org/v2/gh/dotnet/interactive/master?filepath=fsharp%2FSamples%2FDataFrame-Getting%20Started.ipynb).

### Referencing the package


First, get the package and open the namespaces:

In [1]:
#i "nuget:https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet5/nuget/v3/index.json" 
#i "nuget:https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-tools/nuget/v3/index.json" 

#r "nuget:Microsoft.Data.Analysis,0.2.0"
#r "nuget: XPlot.Plotly.Interactive, 3.0.4"

open Microsoft.Data.Analysis

Installed package Microsoft.Data.Analysis version 0.2.0

Installed package XPlot.Plotly.Interactive version 3.0.4

Configuring PowerShell Kernel for XPlot.Plotly integration.

Installed support for XPlot.Plotly.

### Creating a data frame

Create 3 columns to hold values of types `DateTime`, `int`, and `string`

In [2]:
// Three columns, one per element type, that are later combined into a data frame.
let dateTimes = PrimitiveDataFrameColumn<DateTime>("DateTimes") // Default length is 0.
let ints = PrimitiveDataFrameColumn<int>("Ints", 3L) // Makes a column of length 3, filled with nulls initially.
let strings = StringDataFrameColumn("Strings", 3L) // Also created with length 3.

Add some datetimes

In [3]:
// Append the three sample dates, in order, to the DateTimes column.
for text in [ "2019/01/01"; "2019/01/01"; "2019/01/02" ] do
    dateTimes.Append(DateTime.Parse(text))

Create a `DataFrame` with 3 columns

In [4]:
// The list is annotated as `DataFrameColumn list` so that the three differently-typed
// columns are all treated as their common base type when passed to the constructor.
let df = DataFrame([dateTimes; ints; strings]: DataFrameColumn list)

### Adding better default formatting for data frames

Create a formatter for data frames and data frame rows.

In [5]:
/// Registers "text/html" formatters so that `DataFrame` and `DataFrameRow` values
/// are rendered as HTML tables in notebook output.
/// NOTE(review): the module name looks like a typo for `DataFrameFormatter`; it is kept
/// unchanged so that any existing references keep working.
module DateFrameFormatter = 
    
    // Locally open the F# HTML DSL.
    open Html

    /// Upper bound on the number of data rows rendered for a single data frame.
    let maxRows = 20

    // Data frame formatter: a header row ("Index" followed by each column name),
    // then one table row per data row, capped at maxRows.
    Formatter.Register<DataFrame>((fun (context: FormatContext) (df: DataFrame) (writer: TextWriter) ->

        // Don't generate nested tables
        if context.ContentThreshold < 1.0 then false else

        // Ask other formatters to reduce information generation
        context.ReduceContent(0.2) |> ignore

        // Number of rows actually rendered: never more than maxRows, and zero for an
        // empty frame (the range below is then empty).
        let rowCount = min maxRows (int df.Rows.Count)

        table [] [
          thead [] [
            th [] [ str "Index" ]
            for c in df.Columns do
              th [] [ str c.Name]
          ]
          tbody [] [
            for i in 0 .. rowCount - 1 do
              tr [] [
                // First cell is the row index, followed by one cell per value in the row.
                td [] [ embed context i ]
                for o in df.Rows.[int64 i] do
                  td [] [ embed context o ]
              ]
          ]
        ]
        |> writer.Write

        true
    ), mimeType = "text/html")
    
    // Row formatter: a single-row table with one cell per value in the row.
    Formatter.Register<DataFrameRow>((fun (context: FormatContext) (row: DataFrameRow) (writer: TextWriter) ->

        // Don't generate nested tables
        if context.ContentThreshold < 1.0 then false else

        // Ask other formatters to reduce information generation
        context.ReduceContent(0.2) |> ignore

        table [] [
          tbody [] [
            tr [] [
              for o in row do
                td [] [ embed context o ] 
            ]
          ]
        ]
        |> writer.Write

        true
    ), mimeType = "text/html")
    

Now view the data frame: 

In [6]:
df

Index,DateTimes,Ints,Strings
0,2019-01-01 00:00:00Z,<null>,<null>
1,2019-01-01 00:00:00Z,<null>,<null>
2,2019-01-02 00:00:00Z,<null>,<null>


### Modifying data frames

Change a value directly through df:

In [7]:
// Indexer arguments are (row index, column index): row 0, column 1 is the "Ints" column.
df.[0L, 1] <- 10
df

Index,DateTimes,Ints,Strings
0,2019-01-01 00:00:00Z,10,<null>
1,2019-01-01 00:00:00Z,<null>,<null>
2,2019-01-02 00:00:00Z,<null>,<null>


We can also modify the values in the columns through indexers defined in `PrimitiveDataFrameColumn` and `StringDataFrameColumn`.

In [8]:
// Write through the column objects themselves; `df` was built from these same columns,
// so the new values show up when the frame is displayed.
// The int is wrapped in Nullable before being assigned into the `ints` column.
ints.[1L] <- Nullable 100
strings.[1L] <- "Foo!"
df

Index,DateTimes,Ints,Strings
0,2019-01-01 00:00:00Z,10,<null>
1,2019-01-01 00:00:00Z,100,Foo!
2,2019-01-02 00:00:00Z,<null>,<null>


Check the data type

In [9]:
df.Info()

Index,Info,DateTimes,Ints,Strings
0,DataType,System.DateTime,System.Int32,System.String
1,Length (excluding null values),3,2,3


The `DataFrame` and the base `DataFrameColumn` class that all columns derive from expose a number of useful APIs: binary operations, computations, joins, merges, handling missing values and more.

In [10]:
// Add 5 to every value of the "Ints" column in place. The call also returns a column;
// that result is not needed here, so it is discarded explicitly rather than implicitly.
df.["Ints"].Add(5, inPlace=true) |> ignore
df

Index,DateTimes,Ints,Strings
0,2019-01-01 00:00:00Z,15,<null>
1,2019-01-01 00:00:00Z,105,Foo!
2,2019-01-02 00:00:00Z,<null>,<null>


In [11]:
df.["Ints"] <- (ints / 5) * 100
df

Index,DateTimes,Ints,Strings
0,2019-01-01 00:00:00Z,300,<null>
1,2019-01-01 00:00:00Z,2100,Foo!
2,2019-01-02 00:00:00Z,<null>,<null>


Let's `null` it up!

In [12]:
// Replace the remaining nulls in place: -1 for "Ints" and "Bar" for "Strings".
// Each call also returns a column; it is discarded explicitly because the in-place
// update is all that is wanted here.
df.["Ints"].FillNulls(-1, inPlace=true) |> ignore
df.["Strings"].FillNulls("Bar", inPlace=true) |> ignore
df

Index,DateTimes,Ints,Strings
0,2019-01-01 00:00:00Z,300,Bar
1,2019-01-01 00:00:00Z,2100,Foo!
2,2019-01-02 00:00:00Z,-1,Bar


`DataFrame` exposes a `Columns` property that we can enumerate to access our columns. Rows are available through the `Rows` property; here is how to access the first row:

In [13]:
let row0 = df.Rows.[0L]
row0

0,1,2
2019-01-01 00:00:00Z,300,Bar


In [14]:
row0

0,1,2
2019-01-01 00:00:00Z,300,Bar


### Filtering and sorting data frames

Let's take a look at `Filter`, `Sort`, and `GroupBy`.

In [15]:
// Sort our dataframe using the Ints column
df.Sort("Ints", ascending=true)

Index,DateTimes,Ints,Strings
0,2019-01-02 00:00:00Z,-1,Bar
1,2019-01-01 00:00:00Z,300,Bar
2,2019-01-01 00:00:00Z,2100,Foo!


In [16]:
// GroupBy
let grouped = df.GroupBy("DateTimes")
// Count of values in each group
grouped.Count()

Index,DateTimes,Ints,Strings
0,2019-01-01 00:00:00Z,2,2
1,2019-01-02 00:00:00Z,1,1


In [17]:
// Sum of the "Ints" column within each DateTimes group
let intGroupSum = grouped.Sum("Ints")
intGroupSum

Index,DateTimes,Ints
0,2019-01-01 00:00:00Z,2400
1,2019-01-02 00:00:00Z,-1


### Charting columns from data frames

In [18]:
open XPlot.Plotly
open System.Linq

In [19]:
#r "nuget:MathNet.Numerics"

Installed package MathNet.Numerics version 4.15.0

In [20]:
open MathNet.Numerics.Distributions

In [21]:
// Parameters of the normal distribution to sample from.
let mean = 0.0
let stdDev = 0.1

// Distribution object built from the two parameters above.
let normalDist = Normal(mean, stdDev)

In [23]:
// Take the first 1000 samples from the distribution and store them in a data frame column.
let doubles =
    let samples = normalDist.Samples() |> Seq.take 1000
    PrimitiveDataFrameColumn<double>("Normal Distribution", samples)

// Plot the sampled values as a histogram with 30 bins.
display(Chart.Plot(Graph.Histogram(x = doubles, nbinsx = 30)))




