# Quickstart

In [3]:
import pandas as pd

from parquetranger import TableRepo


In [4]:
# Sample frame used throughout the walkthrough: six records labelled a1..a6.
# Column C2 (values a / b / c) is the one later passed as `group_cols`.
df = pd.DataFrame(
    data=dict(
        A=[1, 2, 3, 4, 5, 6],
        B=["x", "y", "z", "x1", "x2", "x3"],
        C=[1, 2, 1, 1, 1, 2],
        C2=["a", "a", "b", "a", "c", "c"],
    ),
    index=["a1", "a2", "a3", "a4", "a5", "a6"],
)

In [5]:
# Show the sample frame before anything is written to the repo.
df

Unnamed: 0,A,B,C,C2
a1,1,x,1,a
a2,2,y,2,a
a3,3,z,1,b
a4,4,x1,1,a
a5,5,x2,1,c
a6,6,x3,2,c


In [7]:
# Open a table repository at "some_tmp_path", using column C2 as the grouping column.
trepo = TableRepo("some_tmp_path", group_cols="C2")  # this creates the directory

In [8]:
# Add the rows of df to the repository.
trepo.extend(df)

In [9]:
# Read the whole repository back as a single frame. In the output shown here the
# rows come back arranged by their C2 group (a, b, c), not in insertion order.
trepo.get_full_df()

Unnamed: 0,A,B,C,C2
a1,1,x,1,a
a2,2,y,2,a
a4,4,x1,1,a
a3,3,z,1,b
a5,5,x2,1,c
a6,6,x3,2,c


In [10]:
# Replacement records: labels a1 and a4 already occur in df, a7 is new.
# Note that a4 carries C2 == "b" here, while in df it has C2 == "a".
df2 = pd.DataFrame(
    data=dict(
        A=[21, 22, 23],
        B=["X", "Y", "Z"],
        C=[10, 20, 1],
        C2=["a", "b", "a"],
    ),
    index=["a1", "a4", "a7"],
)

In [11]:
# Overwrite stored records whose index label matches one in df2. In the result
# below a1 and a4 take df2's values (a4 thereby ends up with C2 == "b") and the
# previously unseen label a7 is added.
trepo.replace_records(df2)  # replaces based on index

In [12]:
# a2, a3, a5 and a6 still hold df's values; a1, a4 and a7 hold df2's values.
trepo.get_full_df()

Unnamed: 0,A,B,C,C2
a2,2,y,2,a
a1,21,X,10,a
a7,23,Z,1,a
a3,3,z,1,b
a4,22,Y,20,b
a5,5,x2,1,c
a6,6,x3,2,c


In [13]:
# Replace whole groups: each C2 group that occurs in df2 is overwritten with
# df2's records for that group.
trepo.replace_groups(df2)

In [15]:
# The groups C2 == "a" and C2 == "b" were replaced as a whole by the records
# present in df2; group "c" does not occur in df2 and is unchanged in the output.
trepo.get_full_df()

Unnamed: 0,A,B,C,C2
a1,21,X,10,a
a7,23,Z,1,a
a4,22,Y,20,b
a5,5,x2,1,c
a6,6,x3,2,c


In [16]:
# Erase the entire content of the repo and store df2 instead;
# all traces of df are lost.
trepo.replace_all(df2)

In [17]:
# Only the three records of df2 remain.
trepo.get_full_df()

Unnamed: 0,A,B,C,C2
a1,21,X,10,a
a7,23,Z,1,a
a4,22,Y,20,b


In [19]:
# Replace records by index label again, but with by_groups=True the labels are
# only matched within each C2 group. df has a4 in group "a" while the repo holds
# a4 in group "b", so both rows are kept and the label a4 appears twice.
trepo.replace_records(df, by_groups=True)

In [20]:
# a4 now occurs once in group "a" (values from df) and once in group "b"
# (values from df2); a7 from df2 is still present in group "a".
trepo.get_full_df()

Unnamed: 0,A,B,C,C2
a7,23,Z,1,a
a1,1,x,1,a
a2,2,y,2,a
a4,4,x1,1,a
a4,22,Y,20,b
a3,3,z,1,b
a5,5,x2,1,c
a6,6,x3,2,c


In [21]:
# Clean up at the end of the walkthrough.
trepo.purge()  # deletes everything