## Create dataframes for join

There are some notes regarding the design of the R script at https://github.com/h2oai/db-benchmark/issues/106

Originally these are the args you can pass to the R script to create the data:

```R
N=as.numeric(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L])
```

- `K` - does not have any effect anywhere in the script.
- `nas` - number of NaNs. We will ignore the case with NaNs for now.
- `sort` - we will work with unsorted data.

For us, N is the only relevant number. 

In [1]:
import numpy as np
import pandas as pd

In [2]:
def split_xlr(n):
    """Draw a shuffled pool of unique integer keys and cut it into three parts.

    The pool holds ``int(n * 1.1)`` distinct keys taken from
    ``1 .. int(n * 1.1)`` in random order.  It is cut at positions
    ``int(n * 0.9)`` and ``n``.

    Parameters
    ----------
    n : int or float
        Nominal number of keys per table; truncated with ``int``.

    Returns
    -------
    dict
        ``"x"``: keys common to both sides (first ``int(n * 0.9)`` entries),
        ``"l"``: keys reserved for the left side (up to position ``n``),
        ``"r"``: keys reserved for the right side (the remainder).
    """
    n = int(n)
    pool_size = int(n * 1.1)
    common_end = int(n * 0.9)

    # Sampling the whole range without replacement yields a permutation.
    shuffled = np.random.choice(
        np.arange(1, pool_size + 1), size=pool_size, replace=False
    )

    common = shuffled[:common_end]
    left_only = shuffled[common_end:n]
    right_only = shuffled[n:pool_size]
    return {"x": common, "l": left_only, "r": right_only}

In [3]:
def sample_all(x, n_rows):
    """Return ``n_rows`` shuffled values in which every element of ``x`` occurs.

    ``x`` is kept in full and padded with values resampled from ``x`` (with
    replacement) until ``n_rows`` entries exist; the result is then shuffled.

    Parameters
    ----------
    x : array-like
        Values that must all appear in the output.
    n_rows : int or float
        Desired output length; truncated with ``int``.  Must be at least
        ``len(x)``, otherwise an ``AssertionError`` is raised.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of length ``n_rows``.
    """
    n_rows = int(n_rows)
    n_values = len(x)
    assert n_values <= n_rows

    # Top up with resampled values so the total reaches n_rows.
    n_extra = max(n_rows - n_values, 0)
    padding = np.random.choice(x, size=n_extra, replace=True)
    padded = np.append(x, padding)

    # Drawing every element without replacement is a random shuffle.
    return np.random.choice(padded, size=len(padded), replace=False)

In [4]:
def add_str_cols(col):
    """Turn numeric ids into string labels of the form ``"id<number>"``.

    Each value is formatted with zero decimal places and prefixed with
    ``"id"``.

    Parameters
    ----------
    col : iterable of numbers
        Ids to convert.

    Returns
    -------
    list of str
        One label per input value, in the same order.
    """

    def as_label(value):
        return f"id{value:.0f}"

    return [as_label(value) for value in col]

In [5]:
def pretty_num(num):
    """Format ``num`` as a compact one-digit scientific label such as ``1E7``.

    The number is rendered with the ``.0E`` format (one significant digit)
    and, for non-negative exponents, the ``+`` sign and leading zeros of the
    exponent are removed: ``1e7 -> "1E7"``, ``10 -> "1E1"``,
    ``1e10 -> "1E10"``.

    Negative exponents (``"1E-03"``) and non-finite values (``"INF"``,
    ``"NAN"``) are returned exactly as produced by the format spec.

    Parameters
    ----------
    num : int or float
        Number to format.

    Returns
    -------
    str
        The compact label, used in the generated file names.
    """
    text = f"{num:.0E}"
    mantissa, marker, exponent = text.partition("E+")
    if not marker:
        # No positive exponent present: keep the formatted text unchanged.
        return text
    # Strip leading zeros for every exponent width, not only "+0X", so that
    # 1e10 becomes "1E10" rather than "1E+10".  Keep a lone "0" for "+00".
    return f"{mantissa}E{exponent.lstrip('0') or '0'}"

In [6]:
def generate_keys(N):
    """Generate the three key pools used to build the join tables.

    ``split_xlr`` is called once per scale, in this order: ``N / 1e6``,
    ``N / 1e3`` and ``N``.

    Parameters
    ----------
    N : int or float
        Number of rows of the largest table.

    Returns
    -------
    tuple
        ``(key1, key2, key3)``, each the dict returned by ``split_xlr``
        for the corresponding scale.
    """
    scales = (N / 1e6, N / 1e3, N)
    return tuple(split_xlr(scale) for scale in scales)

In [8]:
def create_lhs(N, key1, key2, key3, dir):
    """Build the left-hand-side join table and write it as a parquet file.

    The table has ``int(N)`` rows and the columns ``id1``-``id3`` (integer
    keys sampled from the ``"x"`` and ``"l"`` parts of ``key1``-``key3``),
    ``id4``-``id6`` (the same keys as ``"id<number>"`` strings) and ``v1``
    (uniform random values in ``[0, 100)`` rounded to 6 decimals).

    Parameters
    ----------
    N : int or float
        Number of rows; truncated with ``int``.
    key1, key2, key3 : dict
        Key pools as returned by ``split_xlr``.
    dir : str
        Directory the file ``join-lhs-N_<N>.parquet`` is written into.
    """
    N = int(N)

    # Sample id1, id2, id3 in this order, then v1, so the random stream is
    # consumed in a fixed sequence.
    int_ids = [
        sample_all(np.append(keys['x'], keys['l']), N)
        for keys in (key1, key2, key3)
    ]
    str_ids = [add_str_cols(ids) for ids in int_ids]
    values = np.around(np.random.uniform(0, 100, size=N), decimals=6)

    columns = {
        f"id{position}": data
        for position, data in enumerate(int_ids + str_ids, start=1)
    }
    columns["v1"] = values
    frame = pd.DataFrame(columns)

    out_path = f"{dir}/join-lhs-N_{pretty_num(N)}.parquet"
    frame.to_parquet(out_path, index=False)

In [9]:
def create_rhs_small(N, key1, dir):
    """Build the small right-hand-side join table and write it as parquet.

    The table has ``n = int(N / 1e6)`` rows and the columns ``id1`` (integer
    keys sampled from the ``"x"`` and ``"r"`` parts of ``key1``), ``id4``
    (the same keys as ``"id<number>"`` strings) and ``v2`` (uniform random
    values in ``[0, 100)`` rounded to 6 decimals).

    Parameters
    ----------
    N : int or float
        Number of rows of the left-hand-side table.
    key1 : dict
        Key pool as returned by ``split_xlr``.
    dir : str
        Directory the file ``join-rhs-small-N_<N>-n_<n>.parquet`` is
        written into.
    """
    n = int(N / 1e6)

    int_id = sample_all(np.append(key1['x'], key1['r']), n)

    # Dict entries are evaluated top to bottom, so v2 is drawn after the ids.
    frame = pd.DataFrame(
        {
            "id1": int_id,
            "id4": add_str_cols(int_id),
            "v2": np.around(np.random.uniform(0, 100, size=n), decimals=6),
        }
    )

    out_path = f"{dir}/join-rhs-small-N_{pretty_num(N)}-n_{pretty_num(n)}.parquet"
    frame.to_parquet(out_path, index=False)

In [10]:
def create_rhs_medium(N, key1, key2, dir):
    """Build the medium right-hand-side join table and write it as parquet.

    The table has ``n = int(N / 1e3)`` rows and the columns ``id1``, ``id2``
    (integer keys sampled from the ``"x"`` and ``"r"`` parts of ``key1`` and
    ``key2``), ``id4``, ``id5`` (the same keys as ``"id<number>"`` strings)
    and ``v2`` (uniform random values in ``[0, 100)`` rounded to 6 decimals).

    Parameters
    ----------
    N : int or float
        Number of rows of the left-hand-side table.
    key1, key2 : dict
        Key pools as returned by ``split_xlr``.
    dir : str
        Directory the file ``join-rhs-medium-N_<N>-n_<n>.parquet`` is
        written into.
    """
    n = int(N / 1e3)

    # Sample id1 before id2, then v2, to keep the random draws in a fixed
    # sequence.
    first_ids, second_ids = (
        sample_all(np.append(keys['x'], keys['r']), n)
        for keys in (key1, key2)
    )
    values = np.around(np.random.uniform(0, 100, size=n), decimals=6)

    frame = pd.DataFrame(
        {
            "id1": first_ids,
            "id2": second_ids,
            "id4": add_str_cols(first_ids),
            "id5": add_str_cols(second_ids),
            "v2": values,
        }
    )

    out_path = f"{dir}/join-rhs-medium-N_{pretty_num(N)}-n_{pretty_num(n)}.parquet"
    frame.to_parquet(out_path, index=False)


In [16]:
def create_rhs_big(N, key1, key2, key3, dir):
    """Build the big right-hand-side join table and write it as parquet.

    The table has ``n = int(N / 1e0)`` rows and the columns ``id1``-``id3``
    (integer keys sampled from the ``"x"`` and ``"r"`` parts of
    ``key1``-``key3``), ``id4``-``id6`` (the same keys as ``"id<number>"``
    strings) and ``v2`` (uniform random values in ``[0, 100)`` rounded to
    6 decimals).

    Parameters
    ----------
    N : int or float
        Number of rows of the left-hand-side table.
    key1, key2, key3 : dict
        Key pools as returned by ``split_xlr``.
    dir : str
        Directory the file ``join-rhs-big-N_<N>-n_<n>.parquet`` is written
        into.
    """
    n = int(N / 1e0)

    # Sample id1, id2, id3 in this order, then v2, so the random stream is
    # consumed in a fixed sequence.
    int_ids = [
        sample_all(np.append(keys['x'], keys['r']), n)
        for keys in (key1, key2, key3)
    ]
    str_ids = [add_str_cols(ids) for ids in int_ids]
    values = np.around(np.random.uniform(0, 100, size=n), decimals=6)

    columns = {
        f"id{position}": data
        for position, data in enumerate(int_ids + str_ids, start=1)
    }
    columns["v2"] = values
    frame = pd.DataFrame(columns)

    out_path = f"{dir}/join-rhs-big-N_{pretty_num(N)}-n_{pretty_num(n)}.parquet"
    frame.to_parquet(out_path, index=False)

In [12]:
# Directory the generated parquet files are written into (passed to the
# create_* functions below). NOTE: this name shadows the builtin `dir`.
dir = "../test_join_data"

In [13]:
# Number of rows of the LHS table; the RHS tables are sized N/1e6, N/1e3 and N.
N = 1e7

In [14]:
# Draw the key pools once so the LHS and all RHS tables share the same keys.
key1, key2, key3 = generate_keys(N)

In [15]:
# Write the left-hand-side table (N rows).
create_lhs(N, key1, key2, key3, dir)

In [15]:
# Write the small right-hand-side table (N/1e6 rows).
create_rhs_small(N, key1, dir)

In [17]:
# Write the medium right-hand-side table (N/1e3 rows).
create_rhs_medium(N, key1, key2, dir)

In [17]:
# Write the big right-hand-side table (N rows).
create_rhs_big(N, key1, key2, key3, dir)

In [21]:
#test = pd.read_parquet("../test_join_data/join-rhs-big-N_1E7-n_1E7.parquet")

In [25]:
#test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 7 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id1     int64  
 1   id2     int64  
 2   id3     int64  
 3   id4     object 
 4   id5     object 
 5   id6     object 
 6   v2      float64
dtypes: float64(1), int64(3), object(3)
memory usage: 534.1+ MB
