In [None]:
import os
from logging import getLogger
from pathlib import Path
from time import sleep

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from ase.io import iread
from colabfit.tools.vast.configuration import AtomicConfiguration
from colabfit.tools.vast.database import DataManager
from colabfit.tools.vast.property import PropertyInfo, PropertyMap
from dotenv import load_dotenv
from huggingface_hub import HfApi, delete_repo

# Load variables from a .env file into the process environment; HF_TOKEN is
# read from the environment further down when the Hugging Face client is built.
load_dotenv()

# Module-level logger named after the current module.
logger = getLogger(__name__)

In [None]:
from colabfit.tools.vast.schema import config_md_schema
from colabfit.tools.vast.utils import spark_schema_to_arrow_schema

# Convert the colabfit Spark schema to a pyarrow schema; it is passed as
# `schema=` when the train/val pyarrow tables are built below.
arrow_schema = spark_schema_to_arrow_schema(config_md_schema)

In [None]:
# Prefix used to build the per-configuration "_name" entries.
DATASET_NAME = "carbon-enantiomorphs"


def reader(xyz):
    """Lazily convert an extended-XYZ file into AtomicConfiguration objects.

    Each frame read from ``xyz`` gets ``atoms.info["_name"]`` set to
    ``"<DATASET_NAME>_index_<i>"``, where ``i`` is the zero-based position of
    the frame in the file, and is then wrapped with
    ``AtomicConfiguration.from_ase``.

    Args:
        xyz: Path of the extended-XYZ file to read.

    Yields:
        One AtomicConfiguration per frame, in file order.
    """
    # NOTE: attaching an "original_id" per frame from a separate id file is
    # currently disabled; only the generated "_name" is set.
    frames = iread(xyz, format="extxyz")
    index = 0
    for atoms in frames:
        atoms.info["_name"] = f"{DATASET_NAME}_index_{index}"
        yield AtomicConfiguration.from_ase(atoms)
        index += 1

### Train

In [None]:
# Build one row dict per training configuration, storing the structure hash
# as a string.
gen_train = reader("data/train.xyz")

cos_train = []
for config in gen_train:
    row = config.row_dict
    row["structure_hash"] = str(row["structure_hash"])
    cos_train.append(row)

# Show the first row as a quick sanity check.
cos_train[0]

In [None]:
# Columns kept in the published train parquet file, in output order.
train_columns = [
    "id",
    "hash",
    "last_modified",
    "chemical_formula_hill",
    "chemical_formula_reduced",
    "chemical_formula_anonymous",
    "elements",
    "elements_ratios",
    "atomic_numbers",
    "nsites",
    "nelements",
    "nperiodic_dimensions",
    "cell",
    "dimension_types",
    "pbc",
    "names",
    "labels",
    "positions",
]

# Type the rows with the configuration schema, then keep only the columns above.
co_table = pa.Table.from_pylist(cos_train, schema=arrow_schema)
co_table = co_table.select(train_columns)

# Writing fails if the target directory is missing, so create it first; this
# keeps the cell runnable on a fresh checkout (Restart & Run All).
Path("parquets").mkdir(parents=True, exist_ok=True)
pq.write_table(
    co_table, "parquets/train.parquet", compression="ZSTD", compression_level=9
)

### Val

In [None]:
# Build one row dict per validation configuration, storing the structure hash
# as a string.
gen_val = reader("data/val.xyz")

cos_val = []
for config in gen_val:
    row = config.row_dict
    row["structure_hash"] = str(row["structure_hash"])
    cos_val.append(row)

# Show the first row as a quick sanity check.
cos_val[0]

In [None]:
# Columns kept in the published val parquet file, in output order.
val_columns = [
    "id",
    "hash",
    "last_modified",
    "chemical_formula_hill",
    "chemical_formula_reduced",
    "chemical_formula_anonymous",
    "elements",
    "elements_ratios",
    "atomic_numbers",
    "nsites",
    "nelements",
    "nperiodic_dimensions",
    "cell",
    "dimension_types",
    "pbc",
    "names",
    "labels",
    "positions",
]

# Type the rows with the configuration schema, then keep only the columns above.
co_table = pa.Table.from_pylist(cos_val, schema=arrow_schema)
co_table = co_table.select(val_columns)

# Writing fails if the target directory is missing, so create it first; this
# keeps the cell runnable on a fresh checkout (Restart & Run All).
Path("parquets").mkdir(parents=True, exist_ok=True)
pq.write_table(
    co_table, "parquets/val.parquet", compression="ZSTD", compression_level=9
)

#### Upload parquet files and README to Hugging Face

In [None]:
# Hugging Face access token taken from the environment (None when unset).
token = os.environ.get("HF_TOKEN")

# Hub client used by all the repo and upload calls below.
api = HfApi(token=token)

In [None]:
# Create the dataset repo on the Hub. exist_ok=True keeps this cell
# re-runnable: without it the call raises once the repo already exists.
api.create_repo(
    repo_id="colabfit/carbon-enantiomorphs",
    repo_type="dataset",
    token=token,
    exist_ok=True,
)

In [None]:
# Disabled maintenance step: when uncommented, this removes "test.parquet"
# from the colabfit/carbon-enantiomorphs dataset repo.
# api.delete_file(
#     repo_id="colabfit/carbon-enantiomorphs",
#     repo_type="dataset",
#     token=token,
#     path_in_repo="test.parquet",
# )

In [None]:
# Push everything in the local "parquets" directory to the dataset repo.
api.upload_folder(
    repo_id="colabfit/carbon-enantiomorphs",
    repo_type="dataset",
    folder_path="parquets",
    token=token,
)

In [None]:
# Publish the local README.md as the README.md of the dataset repo.
api.upload_file(
    repo_id="colabfit/carbon-enantiomorphs",
    repo_type="dataset",
    path_or_fileobj="README.md",
    path_in_repo="README.md",
    token=token,
)

In [None]:
# DESTRUCTIVE: permanently deletes the colabfit/carbon_chiral dataset repo
# (presumably an earlier name for this dataset; confirm before running).
# missing_ok=True makes the cell a no-op instead of an error when the repo has
# already been deleted, so the notebook can be re-run.
api.delete_repo(
    repo_id="colabfit/carbon_chiral",
    repo_type="dataset",
    token=token,
    missing_ok=True,
)

### Get croissant files

In [None]:
import json

import requests

# Only send an Authorization header when a token is available; otherwise the
# request would carry the literal string "Bearer None".
headers = {"Authorization": f"Bearer {token}"} if token else {}


def query(api_url=None):
    """Fetch a JSON document from the Hugging Face API.

    Args:
        api_url: URL to request. When omitted, the module-level ``API_URL``
            is used, which keeps the original zero-argument call working.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error payload is never mistaken for real metadata.
        requests.Timeout: If the server does not answer within 60 seconds.
    """
    if api_url is None:
        api_url = API_URL
    response = requests.get(api_url, headers=headers, timeout=60)
    response.raise_for_status()
    return response.json()


# Download the croissant metadata of each listed dataset and save it next to
# the notebook as "<dataset>.json".
for url in [
    "carbon-enantiomorphs",
]:
    API_URL = f"https://huggingface.co/api/datasets/colabfit/{url}/croissant"
    data = query(API_URL)
    with open(f"{url}.json", "w") as f:
        json.dump(data, f)