## Enumerate `load_data_lookup` for EFS-ComStock

In [None]:
import os
import logging
import math
import pandas as pd

from dsgrid.utils.spark import init_spark
from dsgrid.utils.spark_partition import SparkPartition
from enumerate_load_table_lookup import EnumerateTable

In [None]:
# Root logger, set to INFO so INFO-and-above messages from any module pass.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Make sure the root logger has a handler. The previous
# `logging.debug("DSG")` call could never print (DEBUG < INFO); its only
# effect was the implicit basicConfig() that module-level logging helpers
# perform when the root logger has no handlers. Do that explicitly instead.
# basicConfig() is a no-op if a handler is already installed.
logging.basicConfig()

### Initialize

In [None]:
# Initialize Spark through the dsgrid helper. "dsgrid-load" is the name
# handed to the helper (presumably the Spark application name -- confirm
# against dsgrid.utils.spark.init_spark).
spark = init_spark("dsgrid-load")  # <--- same as a pyspark instance in shell

In [None]:
# Instantiate the helper objects used throughout this notebook:
#   partition   -> partition-size reports and the file-count estimate
#   enumeration -> relocating, enumerating, checking and saving the lookup
partition = SparkPartition()
enumeration = EnumerateTable()

# Location of the load_data_lookup parquet to enumerate.
# NOTE: machine-specific absolute path -- edit before running elsewhere.
lookup_file = (
    "/Users/lliu2/Documents/dsGrid/dsgrid_v2.0.0/commercial/load_data_lookup.parquet"  # <---
)

In [None]:
# Make a copy of the original data. The returned path is what gets read
# back in step 1, while `lookup_file` itself is the save target in step 5.
# NOTE(review): whether the helper copies or moves the original is not
# visible here -- confirm in EnumerateTable.relocate_original_file.
relocated_file = enumeration.relocate_original_file(lookup_file)

In [None]:
# Check that load_data_lookup_orig has been created: list the contents of
# the directory that holds `lookup_file` (the listing is the cell output).
os.listdir(os.path.dirname(lookup_file))

### 1. Load data as a Spark df

In [None]:
# Load the data from the relocated file into a Spark DataFrame
df_lookup = spark.read.parquet(relocated_file)

# Print the first rows as a quick visual check
df_lookup.show()

### 2. Get keys to enumerate on

In [None]:
# Every column of the lookup table except the excluded ones is used as an
# enumeration key (original column order is kept).
keys_to_exclude = ["scale_factor", "id"]
keys = [column for column in df_lookup.columns if column not in keys_to_exclude]
print(keys)

### 3. Expand `load_data_lookup` to all combinations of keys, keeping new combinations null

In [None]:
# Expand the lookup over the selected keys. Per the step heading, the result
# should hold all key combinations, with newly created combinations left null.
df_lookup_full = enumeration.enumerate_lookup_by_keys(df_lookup, keys)

# Preview the enumerated table
df_lookup_full.show()

### 4. Data Check
#### 4.1. Mapping Report

In [None]:
# Report on the enumeration, given the enumerated table and the original lookup.
enumeration.enumeration_report(df_lookup_full, df_lookup)

#### 4.2. Assertion checks

In [None]:
# Checks expected of the enumerated table:
# 1) the set of (data) ids is the same before and after enumeration
# 2) the row count of df_lookup_full is the product of the length of each
#    key (presumably the number of distinct values per key -- confirm)
enumeration.assertion_checks(df_lookup_full, df_lookup, keys)

### 5. Save

#### 5.1. check partitioning choices and *optimal* # of sharded files

In [None]:
# Print the column names and types of the enumerated table
df_lookup_full.printSchema()

In [None]:
# Number of partitions currently backing the enumerated DataFrame
df_lookup_full.rdd.getNumPartitions()

In [None]:
# For every column, ask the partition helper for a report on partitioning the
# table by that column, then place the per-column reports side by side
# (axis=1) in a single pandas DataFrame for comparison.
df_cols = df_lookup_full.columns

# Built in one expression so `partition_stats` is only ever a DataFrame
# (it used to be a list first and was then rebound to the concatenation).
partition_stats = pd.concat(
    [
        pd.DataFrame(partition.file_size_if_partition_by(df_lookup_full, column))
        for column in df_cols
    ],
    axis=1,
)

partition_stats

In [None]:
# *Optimal* number of files, as computed by the partition helper for the
# enumerated table; this value is passed to the save call in step 5.
n_files = partition.get_optimal_number_of_files(df_lookup_full)
n_files

### Note:
- `write.partitionBy('col1','col2',...)`: export partitions by creating hierarchical subfolders (e.g., col1=0/col2=0/col3=.../part-0)
- `write.option("maxRecordsPerFile", n).partitionBy(col)`: caps the number of records written to each output file at n (a partition holding more than n records is split across several files)
- `coalesce(n).write`: combines the data into n partitions without a full shuffle; it cannot increase the partition count beyond the DataFrame's current number of partitions (Spark's default number of shuffle partitions is 200)
- `repartition(n).write`: try to evenly distribute, if n > # of unique rows, some partitions will be empty
- `repartition(col).write`: create partitions by unique col field, 1 empty/very small partition will be created in addition to # of unique col records
- `repartition(n, col).write`: # files exported = min(n, # of unique fields for col)
- `repartition(n).write.partitionBy(col)`: create subfolder by unique col fields, each subfolder contains n partitions
- `write.partitionBy(col1).bucketBy(n_buckets, col2)`: splits each partition into smaller pieces called buckets; col2 cannot be the same as col1. Bucketing helps reduce shuffles/exchanges when tables are joined. Number of files exported = n_unique_fields_in_col1 x n_buckets x n_repartitions (if applicable)

File name format: part-[partition#]-[bucket#]...snappy.parquet


### Example:
```python
df_lookup_full.repartition(3).write \
    .partitionBy("sector") \
    .bucketBy(2, "subsector") \
    .mode("overwrite") \
    .option("path", lookup_file) \
    .saveAsTable("load_data_lookup", format='parquet')
```
    
Outputs:

```
load_data_lookup.parquet
├── _SUCCESS
├── sector=com
│   ├── part-00000-4943b363-fbac-4665-8c76-d771c3f6cbbb_00000.c000.snappy.parquet
│   ├── part-00000-4943b363-fbac-4665-8c76-d771c3f6cbbb_00001.c000.snappy.parquet
│   ├── part-00001-4943b363-fbac-4665-8c76-d771c3f6cbbb_00000.c000.snappy.parquet
│   ├── part-00001-4943b363-fbac-4665-8c76-d771c3f6cbbb_00001.c000.snappy.parquet
│   ├── part-00002-4943b363-fbac-4665-8c76-d771c3f6cbbb_00000.c000.snappy.parquet
│   └── part-00002-4943b363-fbac-4665-8c76-d771c3f6cbbb_00001.c000.snappy.parquet
└── sector=res
    ├── part-00000-4943b363-fbac-4665-8c76-d771c3f6cbbb_00000.c000.snappy.parquet
    ├── part-00000-4943b363-fbac-4665-8c76-d771c3f6cbbb_00001.c000.snappy.parquet
    ├── part-00001-4943b363-fbac-4665-8c76-d771c3f6cbbb_00000.c000.snappy.parquet
    ├── part-00001-4943b363-fbac-4665-8c76-d771c3f6cbbb_00001.c000.snappy.parquet
    ├── part-00002-4943b363-fbac-4665-8c76-d771c3f6cbbb_00000.c000.snappy.parquet
    └── part-00002-4943b363-fbac-4665-8c76-d771c3f6cbbb_00001.c000.snappy.parquet

2 directories (controlled by `partitionBy`), 13 files
```

In [None]:
# Save the enumerated table to `lookup_file`, passing `n_files` as the file
# count and no repartition column (repartition_by=None).
# NOTE(review): presumably this writes n_files parquet part files -- confirm
# in EnumerateTable.save_file.
enumeration.save_file(df_lookup_full, lookup_file, n_files, repartition_by=None)