Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Suggestion to distinct rows by specified columns #558

Open
ingted opened this issue Mar 31, 2023 · 2 comments
Open

Suggestion to distinct rows by specified columns #558

ingted opened this issue Mar 31, 2023 · 2 comments

Comments

@ingted
Copy link

ingted commented Mar 31, 2023

https://stackoverflow.com/questions/70985428/deedle-distinct-by-column/75897557#75897557

In an RDBMS we often write "select distinct col1, col2, col3 from xxx", but Deedle does not seem to offer an easy-to-use API for the same operation.

How about this?

#r "nuget: Deedle, 3.0.0"
#load "Deedle.fsx"
open Deedle
/// Returns the rows of `frame` that are distinct with respect to the columns
/// listed in `keys` (the equivalent of SQL `select distinct col1, col2, ...`),
/// keeping the first row of every group of duplicates.
///
/// - `frame`: the frame to de-duplicate.
/// - `keys`: column keys whose combined values decide whether two rows are equal.
///   Values are read with `TryGet`, so a missing value takes part in the
///   comparison as a missing `OptionalValue`.
/// - `distColId`: key of a temporary helper column that is joined onto the frame
///   and removed again before returning. NOTE(review): it should not clash with
///   an existing column key of `frame` - presumably the join fails otherwise; confirm.
let inline distinctFrame (frame: Frame<'R, 'C>) (keys: 'C seq) (distColId:'C) =

    // `keys` is needed once per row; materialise it so a lazy sequence is only walked once.
    let keyCols = Seq.toArray keys

    // Pair every row key with the array of values found in the key columns.
    let keyedRows =
        frame
        |> Frame.mapRows (fun (rowKey:'R) row -> rowKey, keyCols |> Array.map row.TryGet)

    // Row key of the first row for every distinct combination of key values.
    // Materialised because the same keys are used as both index and values of
    // the helper series below; a lazy sequence would be evaluated twice.
    let firstRowKeys =
        keyedRows.Values
        |> Seq.distinctBy snd
        |> Seq.map fst
        |> Seq.toArray

    // Single-column helper frame indexed by the surviving row keys.
    let marker = Frame([distColId], [Series(firstRowKeys, firstRowKeys)])

    // The inner join keeps only rows present in the helper frame; afterwards the
    // helper column is dropped again.
    let joined = frame.Join(marker, kind=JoinKind.Inner)
    joined.Columns[joined.ColumnKeys |> Seq.filter (fun c -> c <> distColId)]



open System.IO


// Sample input: ';'-separated CSV with a header line; the row "b;2" occurs twice.
let data = "A;B\na;1\nb;2\nb;2\nc;3"

// The CSV reader is fed from a stream, so wrap the UTF-8 bytes in a MemoryStream.
let bytes = System.Text.Encoding.UTF8.GetBytes(data)
let stream = new MemoryStream(bytes)

let df = Frame.ReadCsv(stream = stream, separators = ";", hasHeaders = true)

// Keep one row per distinct (A, B) pair; the last argument names the temporary helper column.
distinctFrame df [ "A"; "B" ] "____distinctIdx____"
@ingted
Copy link
Author

ingted commented Mar 31, 2023

The result:

val data: string = "A;B
a;1
b;2
b;2
c;3"
val bytes: byte array =
  [|65uy; 59uy; 66uy; 10uy; 97uy; 59uy; 49uy; 10uy; 98uy; 59uy; 50uy; 10uy;
    98uy; 59uy; 50uy; 10uy; 99uy; 59uy; 51uy|]
val stream: IO.MemoryStream
val df: Frame<int,string> =
  
     A B 
0 -> a 1 
1 -> b 2 
2 -> b 2 
3 -> c 3 

4 rows x 2 columns
0 missing values
val it: Frame<int,string> =
  
     A B 
0 -> a 1 
1 -> b 2 
3 -> c 3 

3 rows x 2 columns
0 missing values

@ingted
Copy link
Author

ingted commented Dec 27, 2023

/// Extra helpers for Deedle frames: SQL-like "distinct" and "order by" operations.
/// All functions take the frame as the last argument so they can be used with `|>`.
module Frame = 
    /// Returns the rows of `frame` that are distinct with respect to the columns
    /// listed in `keys`, keeping the first row of every group of duplicates.
    ///
    /// - `keys`: column keys whose combined values decide whether two rows are equal.
    ///   Values are read with `TryGet`, so a missing value takes part in the
    ///   comparison as a missing `OptionalValue`.
    /// - `distColId`: key of a temporary helper column that is joined onto the frame
    ///   and removed again before returning. NOTE(review): it should not clash with
    ///   an existing column key of `frame` - presumably the join fails otherwise; confirm.
    /// - `frame`: the frame to de-duplicate.
    let inline distinctFrame (keys: 'C seq) (distColId:'C) (frame: Frame<'R, 'C>) =

        // `keys` is needed once per row; materialise it so a lazy sequence is only walked once.
        let keyCols = Seq.toArray keys

        // Pair every row key with the array of values found in the key columns.
        let keyedRows =
            frame
            |> Frame.mapRows (fun (rowKey:'R) row -> rowKey, keyCols |> Array.map row.TryGet)

        // Row key of the first row for every distinct combination of key values.
        // Materialised because the same keys are used as both index and values of
        // the helper series below; a lazy sequence would be evaluated twice.
        let firstRowKeys =
            keyedRows.Values
            |> Seq.distinctBy snd
            |> Seq.map fst
            |> Seq.toArray

        // Single-column helper frame indexed by the surviving row keys.
        let marker = Frame([distColId], [Series(firstRowKeys, firstRowKeys)])

        // The inner join keeps only rows present in the helper frame; afterwards
        // the helper column is dropped again.
        let joined = frame.Join(marker, kind=JoinKind.Inner)
        joined.Columns.[joined.ColumnKeys |> Seq.filter (fun c -> c <> distColId)]

    /// Sorts the rows of `frame` by an integer key computed from each row.
    ///
    /// - `mappingOrderKeys`: computes the sort key of a row.
    /// - `distColId`: key of the column that receives the computed sort key.
    ///   Unlike `distinctFrame`, this column is NOT removed: the returned frame
    ///   contains it in addition to the original columns.
    /// - `frame`: the frame to sort.
    let inline orderBy (mappingOrderKeys: ObjectSeries<'C> -> int) (distColId:'C) (frame: Frame<'R, 'C>) =
        // One sort key per row, indexed by the original row keys.
        let sortKeys =
            frame
            |> Frame.mapRows (fun (_:'R) row -> mappingOrderKeys row)

        // Attach the sort keys as a column and sort the rows by it.
        frame.Join(Frame([distColId], [sortKeys]), kind=JoinKind.Inner)
        |> Frame.sortRows distColId


    /// Sorts the rows of `frame` by the combined values of the columns in `keys`.
    ///
    /// Every row is given the rank of its key-value array among the sorted distinct
    /// key-value arrays (0-based, equal keys share a rank); the rank is stored in
    /// column `distColId` and the rows are sorted by that column.
    ///
    /// - `keys`: column keys to order by. Each value is read with `row.[k]` and cast
    ///   to `System.IComparable`, so a value that does not implement it raises
    ///   `InvalidCastException`. NOTE(review): a missing value presumably makes
    ///   `row.[k]` throw as well - confirm against Deedle.
    /// - `distColId`: key of the column that receives the rank. It is NOT removed
    ///   from the returned frame.
    /// - `frame`: the frame to sort.
    let inline orderByCols (keys: 'C seq) (distColId:'C) (frame: Frame<'R, 'C>) =
        // `keys` is needed once per row; materialise it so a lazy sequence is only walked once.
        let keyCols = Seq.toArray keys

        // Per row: the values of the key columns as comparable objects.
        // `IComparable` is fully qualified so the module does not depend on `open System`.
        let colValues = 
            frame
            |> Frame.mapRowValues (fun row ->
                keyCols |> Array.map (fun k -> row.[k] :?> System.IComparable)
            )

        // Rank of every distinct key-value array in ascending order.
        let orderKeys =
            colValues.Values
            |> Seq.distinct
            |> Seq.sort
            |> Seq.mapi (fun i v -> v, i)
            |> Map
           
        // Replace each row's key-value array by its rank.
        let ordered =
            colValues
            |> Series.map (fun _ v -> orderKeys.[v])

        let appendCol =
            Frame.ofColumns [distColId, ordered]

        // Attach the rank column and sort the rows by it.
        frame.Join(appendCol, kind=JoinKind.Inner)
        |> Frame.sortRows distColId

orderBy & orderByCols functionality

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant