In [None]:
import logging
import json
import gc
import pandas as pd
import requests as reqs
from impc_study.methods.utils import query

logging.basicConfig(level=logging.INFO)

# Import the path of each sub-package.
from impc_study.materials import MATERIALS_PATH
from impc_study.methods import METHODS_PATH
from impc_study.results import RESULTS_PATH

In [None]:
# Ask the Solr Schema API for every field defined on the "experiment" core.
res = reqs.get(
    "http://localhost:8983/solr/experiment/schema/fields",
    timeout=30,  # without a timeout, requests waits forever if Solr does not answer
)

# Fail here on an HTTP error instead of on a confusing KeyError/JSON error below.
res.raise_for_status()

# Collect the field names, skipping "metadata" and "_version_".
fields = [
    schema_field["name"]
    for schema_field in res.json()["fields"]
    if schema_field["name"] not in ("metadata", "_version_")
]

fields

In [None]:
# Match-all request with "limit": 0; only the numFound total is read from the
# response below.
count_request = {"query": "*:*", "limit": 0}

result = query(core="experiment", query=count_request)

# Total number of documents matched by the query.
num_found = result["response"]["numFound"]

num_found

In [None]:
def query_field(field: str, max_unique_values: int = 100) -> "pd.DataFrame | None":
    """Summarise the distinct values of one field of the ``experiment`` core.

    Up to two requests are made through ``query``: first a ``unique(field)``
    facet to count the distinct values, then -- only when that count is small
    enough -- a ``terms`` facet listing every value with its document count.

    Args:
        field: Name of the field to summarise.
        max_unique_values: Exclusive upper bound on the number of distinct
            values for which the individual values are listed. Defaults to
            100, the threshold that used to be hard-coded.

    Returns:
        * ``0 < unique count < max_unique_values``: a frame with one row per
          distinct value and the columns ``variable``, ``value``,
          ``value_count`` and ``unique_value_count``.
        * ``unique count >= max_unique_values``: a single-row frame with the
          columns ``variable`` and ``unique_value_count`` only.
        * ``unique count <= 0``: ``None``.
    """
    logging.info("field: %s", field)

    # Step 1: count the distinct values of the field.
    result = query(
        core="experiment",
        query={
            "query": "*:*",
            "limit": 0,
            "facet": {
                "unique_value_count": f"unique({field})",
            },
        },
    )
    unique_value_count = result["facets"]["unique_value_count"]

    logging.info("unique_value_count: %s", unique_value_count)

    # Nothing to report for a field without any values.
    if unique_value_count <= 0:
        return None

    # Too many distinct values to enumerate: report the count only.
    if unique_value_count >= max_unique_values:
        return pd.DataFrame(
            [{"variable": field, "unique_value_count": unique_value_count}]
        )

    # Step 2: list every distinct value together with its document count.
    result = query(
        core="experiment",
        query={
            "query": "*:*",
            "limit": 0,
            "facet": {
                "categories": {
                    "type": "terms",
                    "field": field,
                    "limit": -1,
                },
            },
        },
    )
    buckets = result["facets"]["categories"]["buckets"]

    # Naming the columns up front keeps "val"/"count" present even when the
    # bucket list is empty, so the column selection below cannot raise KeyError.
    df = pd.DataFrame(buckets, columns=["val", "count"])
    df = df.rename(columns={"val": "value", "count": "value_count"})
    df["variable"] = field
    df["unique_value_count"] = unique_value_count

    return df[["variable", "value", "value_count", "unique_value_count"]]


# Per-field summary frames; fields for which query_field returns None are skipped.
dfs = []

for field in fields:

    # Force a garbage-collection pass before each query.
    gc.collect()

    # A separate name keeps `df` free for the concatenated table built later.
    field_df = query_field(field)

    if field_df is not None:

        dfs.append(field_df)

In [None]:
# Stack the per-field summaries into one table. ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating every per-field frame's own
# 0-based labels, which would leave duplicate index values.
df = pd.concat(dfs, ignore_index=True)

df