# Make a data ledger from code and data folders 

Here, we will build data descriptions (data url, name, description, etc.) by analyzing code and local data. 

In [1]:
from dataclasses import dataclass, field
from typing import Dict, Any, Tuple, List, Optional, Callable, Mapping, Optional
import os
import json
from functools import partial, cached_property
from collections import ChainMap

import dol
from lkj import clog
from tabled import pandas_json_dumps


@dataclass
class DataInfoDacc:
    """
    Data-access object that gathers information about project data folders
    and the code files associated with them.

    The keys of ``data_folders`` are used as "project group" names and the
    values as the root folder of each group. The first path component found
    under a root is used as a project name. Per-project information is computed
    by ``gather_info_for_name`` and persisted as JSON via ``project_info_store``.
    """

    # project group name -> root folder path of that group
    data_folders: Dict[str, str]
    # project (data folder) name -> key of its code file in ``code_store``
    code_and_data_mapping: Dict[str, str]
    # code file key -> code contents (only ``.get`` is used on it)
    code_store: Dict[str, str]
    # predicate selecting which top-level folder names count as projects
    data_folder_name_filt: Callable[[str], bool] = (
        lambda x: not x.endswith(' alias') and not x.endswith('.parquet')
    )
    # regex patterns (applied with ``re.match``) of relative paths to leave out
    exclude_data_path_patterns: Tuple[str, ...] = ()
    # forwarded as ``max_levels`` to ``dol.Files``
    max_data_path_levels: Optional[int] = None
    # full file paths for which no table info should be computed
    problematic_files: Tuple[str, ...] = ()
    # folder where the per-project JSON files are written
    output_json_dir: str = "data_folders_info"
    # when True, progress messages are emitted through ``clog``
    verbose: bool = True

    @cached_property
    def flat_data_folder_reader(self):
        """A ``dol.FlatReader`` over one ``dol.Files`` store per data folder."""
        return dol.FlatReader(
            {k: dol.Files(path) for k, path in self.data_folders.items()}
        )

    @cached_property
    def project_folders(self):
        """
        Mapping from project name to the project group it was found in.

        Keys of the flat reader are indexed as ``(group, relative_path)``; the
        project name is the first ``/``-separated component of the relative path.
        If the same project name appears in several groups, only one is kept.
        """
        pairs = {(x[0], x[1].split('/')[0]) for x in self.flat_data_folder_reader}
        return {project_name: project_group for project_group, project_name in pairs}

    @cached_property
    def data_folder_names(self) -> List[str]:
        """Sorted project names that pass ``data_folder_name_filt``."""
        _data_folder_names = {x[1].split('/')[0] for x in self.flat_data_folder_reader}
        return sorted(filter(self.data_folder_name_filt, _data_folder_names))

    def sizes_of_files_paths(self, filepaths: List[str]) -> Dict[str, int]:
        """
        Return a dictionary mapping each file path to its size (in bytes),
        sorted by size.
        """
        sizes = {fp: os.path.getsize(fp) for fp in filepaths}
        # Sort the dictionary by file size
        return dict(sorted(sizes.items(), key=lambda x: x[1]))

    def _data_path_filter(self) -> Callable[[str], bool]:
        """
        Build the predicate that decides which relative data paths are kept.

        A path is kept when none of ``exclude_data_path_patterns`` matches it
        (``re.match`` semantics, i.e. anchored at the start of the path).
        With no patterns, every path is kept.
        """
        import re

        patterns = [p for p in self.exclude_data_path_patterns if p]
        if not patterns:
            # An empty alternation would compile to the empty regex, which
            # matches every string and would therefore exclude every file.
            return lambda path: True
        # Wrap each pattern in a non-capturing group so that joining them
        # with '|' cannot change the meaning of an individual pattern.
        exclude_pattern = re.compile('|'.join(f'(?:{p})' for p in patterns))
        return lambda path: exclude_pattern.match(path) is None

    def code_and_data_files(self, name: str) -> Dict[str, Any]:
        """
        Given a project name, determine the corresponding project group and data folder.
        Collect the list of file paths (and filenames) within that project folder.
        If there is an associated code file, include its name and contents.

        Returns a dict with keys ``project_group``, ``data_filenames`` and
        ``data_filepaths`` (path -> size in bytes, sorted by size), plus
        ``code_file`` and ``code_contents`` when the project has a mapped code file.

        Raises ``KeyError`` if ``name`` is not a known project.
        """
        # Determine the project group from the mapping
        project_group = self.project_folders[name]
        project_folderpath = os.path.join(self.data_folders[project_group], name)

        # Read the (relative) file paths in the project folder using dol.Files,
        # leaving out those matching an exclusion pattern
        relative_paths = list(
            dol.filt_iter(
                dol.Files(project_folderpath, max_levels=self.max_data_path_levels),
                filt=self._data_path_filter(),
            ),
        )

        # Construct full file paths and get their sizes
        data_filepaths_with_sizes = self.sizes_of_files_paths(
            [os.path.join(project_folderpath, fn) for fn in relative_paths]
        )
        # NOTE: only the basename is kept, so files with the same name in
        # different subfolders are indistinguishable in this list.
        data_filenames = [
            os.path.basename(fp) for fp in data_filepaths_with_sizes.keys()
        ]

        result: Dict[str, Any] = {
            "project_group": project_group,
            "data_filenames": data_filenames,
            "data_filepaths": data_filepaths_with_sizes,
        }

        # If there is an associated code file, include its details
        code_file = self.code_and_data_mapping.get(name)
        if code_file:
            result.update(
                {
                    "code_file": code_file,
                    "code_contents": self.code_store.get(code_file),
                }
            )
        return result

    def table_info(self, filepath: str) -> Optional[Dict[str, Any]]:
        """
        Attempts to load the table at a given filepath using tabled.get_table.
        Returns a dictionary with the table shape and first row (if possible),
        or None if the file cannot be processed.
        """
        # Deliberately best-effort: any failure (unsupported format, empty
        # table, unreadable file, ...) results in None rather than an error.
        try:
            import tabled

            df = tabled.get_table(filepath)
            # Convert first row to dict if possible
            first_row = df.iloc[0]
            if hasattr(first_row, "to_dict"):
                first_row = first_row.to_dict()
            return {
                "shape": df.shape,
                "first_row": first_row,
            }
        except Exception:
            return None

    def gather_info_for_name(self, name: str) -> Dict[str, Any]:
        """
        For a given project name, gather detailed information including:
         - Data file paths and sizes
         - Code file and its content (if available)
         - The number of data files
         - Table information for each data file (unless it is in the problematic list)

        Uses the clog utility for logging based on the verbose flag.
        """
        _clog: Callable[..., None] = clog(self.verbose)
        _clog(f"Getting info for {name}")

        info = self.code_and_data_files(name)
        info['num_of_data_files'] = len(info['data_filepaths'])

        def tables_info_generator():
            for filepath in info['data_filepaths']:
                if filepath in self.problematic_files:
                    _clog("     ---> Skipping", filepath)
                    continue
                _clog("   Getting table info for", filepath)
                # Keyed by basename: a later file with the same basename
                # replaces the entry of an earlier one.
                yield os.path.basename(filepath), self.table_info(filepath)

        info['tables_info'] = dict(tables_info_generator())
        return info

    @cached_property
    def project_info_store(self) -> "dol.Store":
        """
        Constructs and returns a project info store using dol.
        The store wraps a JSON key-value store and uses the gather_info_for_name method
        as a missing key callback to compute project information on demand.
        """
        # Wrap a TextFiles-based store for JSON files with proper encoding/decoding
        JsonFiles = dol.wrap_kvs(
            dol.TextFiles,
            value_encoder=partial(pandas_json_dumps, indent=2),
            value_decoder=json.loads,
            key_codec=dol.KeyCodecs.suffixed(suffix='.json'),
        )

        # The listed keys are the project names, not the files already on disk
        json_store = dol.mk_dirs_if_missing(
            dol.cached_keys(
                JsonFiles(self.output_json_dir),
                keys_cache=tuple(self.data_folder_names),
            )
        )

        # Missing-key callback: gather info for a project name, persist it,
        # and return it
        def gather_info_and_save_to_store(store, k):
            info = self.gather_info_for_name(k)
            store[k] = info
            return info

        return dol.add_missing_key_handling(
            json_store,
            missing_key_callback=gather_info_and_save_to_store,
        )


def code_store_from_code_folders(
    code_folders: Mapping, *, exclude_keys=(), extra_files: Mapping = ()
):
    """
    Make a single read-only view of the python files of several code folders.

    :param code_folders: Mapping whose values are folder paths; each folder is
        wrapped in a ``dol.TextFiles`` store filtered down to python files.
    :param exclude_keys: Keys (file names) to leave out.
    :param extra_files: Extra ``{key: contents}`` entries to add to the store.
    :return: A ``ChainMap`` of the per-folder stores followed by ``extra_files``.
        On a key clash, the earliest folder wins (``ChainMap`` lookup order).
    """

    def is_public_python_file(key: str) -> bool:
        # Require the '.py' extension (not just a name ending in "py"), skip
        # keys starting with an underscore, and honor the exclusion list.
        return (
            key.endswith('.py')
            and not key.startswith('_')
            and key not in exclude_keys
        )

    PyStore = dol.Pipe(
        dol.TextFiles,
        dol.filt_iter(filt=is_public_python_file),
    )

    code_store = ChainMap(*map(PyStore, code_folders.values()), dict(extra_files))
    return code_store

In [2]:

# Folders containing the data-prep code (key: package name, value: package path)
code_folders = {
    'imbed_data_prep': __import__('imbed_data_prep').__path__[0],
}
# Root folders of the data (key: project group, value: folder path)
data_folders = {
    'imbed_saves': "/Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves",
    'figiri': "/Users/thorwhalen/Dropbox/_odata/figiri/"
}


# data folder (project) name -> code file that prepares it
code_and_data_mapping = {
    'eurovis': 'eurovis.py',
    'github_repos': 'github_repos.py',
    'hcp': 'hcp.py',
    'lmsys-chat-1m': 'lmsys_ai_conversations.py',
    'prompt-injections': 'prompt_injections.py',
    'twitter_sentiment': 'twitter_sentiment.py',
    'wildchat': 'wildchat.py',
    'wordnet_words': 'wordnet_words.py',
}

from pathlib import Path

exclude_keys = ['arxiv.py', 'ultra_chat.py', 'embeddings_of_aggregations.py']
extra_files = {
    'xv.data_access': Path(__import__('xv').data_access.__file__).read_text(),
}
# Pass extra_files too: it was built above but previously never used
code_store = code_store_from_code_folders(
    code_folders, exclude_keys=exclude_keys, extra_files=extra_files
)

# Patterns (matched from the start of the relative path) of data files to ignore
exclude_data_path_patterns = (
    r".*tmp\/.*",
    r".*embeddings_chunks\/.*",
    r".*flat_en_embeddings\/.*",
)
# Files for which table info should not be computed
problematic_files = [
    "/Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/wildchat/embeddings.parquet",
]

list(code_store)

dacc = DataInfoDacc(
    data_folders, 
    code_and_data_mapping=code_and_data_mapping, 
    code_store=code_store, 
    exclude_data_path_patterns=exclude_data_path_patterns,
    problematic_files=problematic_files
)



In [3]:
list(dacc.project_info_store)

['eurovis',
 'github_repos',
 'harris_vs_trump',
 'hcp',
 'lmsys-chat-1m',
 'new_years_resolutions',
 'prompt-injections',
 'quotes',
 'spotify_playlists',
 'twitter_sentiment',
 'wildchat',
 'wordnet_words']

In [4]:
# This will go through all the keys, thereby computing all the project info
# and saving it to disk
for k, v in dacc.project_info_store.items():
    pass

Getting info for eurovis
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/eurovis/cluster_13_labels.parquet
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/eurovis/cluster_21_labels.parquet
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/eurovis/clusters_df.parquet
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/eurovis/planar_embeddings.parquet
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/eurovis/oa_embeddings_batch_keys.pkl
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/eurovis/raw_data.csv
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/eurovis/embeddable.parquet
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/eurovis/merged_artifacts.parquet
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/eurovis

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/planar_embeddings_first_forth.npy
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/pca500.pkl
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/kmeans_7_clusters_indices.pkl
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/kmeans_14_clusters_indices.pkl
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/kmeans_28_clusters_indices.pkl
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/dbscan_7_kmeans.pkl
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/planar_embeddings_pca500.npy
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/ncvis_planar_pca500_embeddings.npy
   Getting table info for /

  return reader(*args, **kwargs)


   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/flat_en.parquet
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/lmsys_with_planar_embeddings_pca500.parquet
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/flat_en_embeddings_pca100.npy
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/flat_en_conversation_grouped_embeddings.parquet
   Getting table info for /Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/lmsys-chat-1m/pca500_embeddings.npy
Getting info for new_years_resolutions
   Getting table info for /Users/thorwhalen/Dropbox/_odata/figiri/new_years_resolutions/new_years_resolutions_notebook.pdf
   Getting table info for /Users/thorwhalen/Dropbox/_odata/figiri/new_years_resolutions/with_hashtags/new_years_resolutions_prepped.parquet
   Getting table info for /Users/thorwhalen/Dropbox/_odata/figiri/ne

In [9]:
import json
import pathlib
import lkj 


# Materialize the whole project info store into a plain dict
t = dict(dacc.project_info_store)
# Shorten the contents before dumping (presumably long lists are cut down to
# max_list_size items -- see lkj.truncate_dict_values to confirm)
tt = lkj.truncate_dict_values(t, max_list_size=6)
# write_text returns the number of characters written, which is the value
# displayed below the cell
pathlib.Path('data_folders_info.json').write_text(json.dumps(tt, indent=2))

275301

In [None]:
import lkj

tt = lkj.truncate_dict_values(t, max_list_size=6)

len(json.dumps(t, indent=2)), len(json.dumps(tt, indent=2))


(859488, 275301)

In [None]:
# TODO: Move this to lkj or context making package

from typing import Callable

def truncate_big_dicts(d, max_length_of_dict=6, yield_condition: Callable = lambda x: True, *, middle_element={'...': '...'}):
    """
    Recursively traverse the dictionary and replace any "big" dictionaries with a truncated version.

    A dictionary is considered "big" if its length exceeds max_length_of_dict and yield_condition(dict) returns True.
    For such dictionaries, only the first (max_length_of_dict - 1) entries and the last entry are kept.
    A middle_element (a dictionary) is inserted between these entries to indicate omitted content.
    When max_length_of_dict is less than 1, no leading entries are kept (only the
    middle_element and the last entry).

    If yield_condition returns False for a given dictionary, its contents are recursed normally without truncation.
    Only dictionary values are recursed into; dictionaries nested inside lists
    (or other containers) are left as they are.

    Parameters:
      d (dict): The dictionary to process.
      max_length_of_dict (int): The maximum allowed number of entries in a dictionary before it is truncated.
      yield_condition (Callable): A function that takes a dictionary and returns True if it should be truncated,
                                  and False if it should be left as is. It receives the dictionary
                                  after its values have been processed.
      middle_element (dict): A dictionary to insert in the middle of the truncated dictionary, to signal omissions.
                             It is only read, never modified.

    Returns:
      A new dictionary with the same structure as d, with any "big" dictionaries truncated according to the rule.
      If d is not a dictionary, d itself is returned.
    """
    # If the input is not a dictionary, return it unchanged.
    if not isinstance(d, dict):
        return d

    # First, recursively process all values.
    processed = {
        key: truncate_big_dicts(
            value, max_length_of_dict, yield_condition, middle_element=middle_element
        )
        for key, value in d.items()
    }

    # Leave the dictionary alone unless it is "big" and meets the condition.
    if len(processed) <= max_length_of_dict or not yield_condition(processed):
        return processed

    # Convert items to a list to preserve order.
    items = list(processed.items())
    # Clamp at zero: a negative slice bound would count from the end and keep
    # almost everything instead of truncating.
    head_size = max(max_length_of_dict - 1, 0)
    # Retain the first head_size key-value pairs.
    truncated = dict(items[:head_size])
    # Insert the middle element to indicate omitted entries.
    truncated.update(middle_element)
    # Append the last key-value pair.
    last_key, last_value = items[-1]
    truncated[last_key] = last_value
    return truncated

def values_are_all_numerical(d):
    """
    Tell whether every value of the mapping ``d`` is an ``int`` or a ``float``.

    Booleans count as numerical (``bool`` is a subclass of ``int``), and an
    empty mapping yields True.
    """
    for value in d.values():
        if not isinstance(value, (int, float)):
            return False
    return True

# Only truncate the big dicts whose values are all numbers
ttt = truncate_big_dicts(tt, yield_condition=values_are_all_numerical)

# Written to the same 'data_folders_info.json' file as the earlier, larger
# dump, so this replaces it; the displayed value is the number of characters written
pathlib.Path('data_folders_info.json').write_text(json.dumps(ttt, indent=2))



84309

In [21]:
# go through a dict recursively and find and yield dicts that are more than 1000 items long
def find_big_dicts(d, max_length=1000, path=()):
    """
    Yield ``(path, dict)`` pairs for every dict, at any depth of ``d``, that has
    more than ``max_length`` items.

    ``path`` is the tuple of keys leading from the root to the yielded dict
    (``()`` for the root itself). Only dict values are descended into; a parent
    is yielded before its children, and siblings in their iteration order.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # that they are popped (and therefore visited) in their original order.
    pending = [(path, d)]
    while pending:
        current_path, node = pending.pop()
        if not isinstance(node, dict):
            continue
        if len(node) > max_length:
            yield current_path, node
        children = [(current_path + (key,), child) for key, child in node.items()]
        pending.extend(reversed(children))

k, w = next(find_big_dicts(ttt))
k

('new_years_resolutions',
 'tables_info',
 'openai_topics_embeddings.parquet',
 'first_row')

# Extracting base urls from code

Make a function that, given code (as a string), figures out where the raw (seed) data is downloaded from.

Yeah, a pretty hard NLP problem. But now we have LLMs, so...

## Making the extractor

In [14]:
import oa

refresh_json_schema = False


schema_description = """
    The output should include:
    * "url" field that points to the raw data source if there is only one source
    * "urls" (json object) field that is used when there are multiple sources for 
        various kinds of data is a json object where the keys are the names of the 
        data sources and the values are the urls
    * "url_type": (string) field that indicates the type of the url (e.g. "http", "file", "env_var", "config_key")
    * "name" (string) field that is a short name for the data set,
    * "description" (string) field that is a longer description of the data set
    * "parameters" (json object whose fields are names and values are descriptions) field
        that describe the different parameters that are used in the data preperation process 
    * "data_keys" (json object) fields which should indicate what kind of data artifacts 
        are created, or used in the data preperation process. This is often in the form of a 
        string that refers to a relative path, or other identifier.
    * "functions" (jso object) field containing python functions or methods that are 
        created in the data preperation process.
    """

# Either infer a fresh json schema from the verbal description above, or use
# the hard-coded (previously inferred) one.
if refresh_json_schema:
    print('Recomputing a json schema')
    from pprint import pprint

    json_schema = oa.tools.infer_schema_from_verbal_description(schema_description)

    # The inferred schema is not saved anywhere: it has to be pasted by hand
    # into the else branch below to be reused.
    print("--------------------")
    print(
        "Make sure to copy the following json schema into the code defining json_schema if you want to reuse it!!!"
    )
    print("--------------------")
    pprint(json_schema)


else:
    print('using cached json schema')
    # Hard-coded schema: one property per field listed in schema_description,
    # all of them required.
    json_schema = {
        'name': 'data_prep_code_analysis',
        'properties': {
            'name': {'type': 'string'},
            'url': {'type': 'string'},
            'url_type': {
                'enum': ['http', 'file', 'env_var', 'config_key'],
                'type': 'string',
            },
            'urls': {'additionalProperties': {'type': 'string'}, 'type': 'object'},
            'data_keys': {
                'additionalProperties': {'type': 'string'},
                'type': 'object',
            },
            'description': {'type': 'string'},
            'functions': {
                'additionalProperties': {'type': 'string'},
                'type': 'object',
            },
            'parameters': {
                'additionalProperties': {'type': 'string'},
                'type': 'object',
            },
        },
        'required': [
            'name',
            'url',
            'urls',
            'url_type',
            'description',
            'parameters',
            'data_keys',
            'functions',
        ],
        'type': 'object',
    }

analyze_data_prep_code = oa.prompt_json_function(
    f"""
    This python code should (but not necessarily) contain some data preperation code 
    that downloads some raw data and then processes it into a form that is ready for analysis.

    You should study the code and make a json object that describes the main "data feature" 
    that are therein: The data source, the data preparation artifacts, and the main 
    python functions, methods, or constants that the code creates.
    Note that sometimes the "url" or "urls" may not be actual http(s) urls, but could 
    also be filepaths, or the name(s) of environmental variables or configuration values
    (often in all caps) that are used to locate the url(s) of the data source(s).
    In this case, you should still use the "url" or "urls" fields, but the values should
    be strings that are not valid urls, but rather the name of the filepath, 
    the environmental variable or configuration key.

    Usually, if there is an http(s) url in the code, it is the main data source.

    Note though, that some inputs that are given may not be actual data prep code.

    In this case, you should just return an empty string for url and empty json objects 
    for urls, and in the description, you should indicate that this is not data prep code, 
    perhaps explaining why you think so. In this case start the description with 
    "NOT DATA PREP CODE", followed with your analysis/explication.

    {schema_description}

    {{python_code}}
    """,
    json_schema=json_schema,
)

using cached json schema


## Gathering the code files we'll use

In [15]:
def if_string_json_decode_it(x):
    """
    Return the JSON-decoded value of ``x`` if ``x`` is a string holding valid
    JSON; otherwise return ``x`` unchanged.
    """
    if not isinstance(x, str):
        return x

    import json

    try:
        return json.loads(x)
    except (ValueError, RecursionError):
        # Not decodable as JSON (json.JSONDecodeError is a ValueError) or
        # nested too deeply: keep the best-effort contract and return the
        # string as is. Unlike a bare ``except``, this lets KeyboardInterrupt
        # and SystemExit through.
        return x

def get_analysis_dict(code_key, py_file_contents):
    """Analyze the contents of one code file and return the analysis as a dict.

    The dict is whatever ``analyze_data_prep_code`` returns, with its 'result'
    entry (if any) JSON-decoded when it is a JSON string, and with a 'code'
    entry set to ``code_key``.

    Use this to add an entry to the code_folders_analysis list by doing
    code_folders_analysis.append(get_analysis_dict(code_key, py_file_contents))
    (append, not extend: extending a list with a dict would add its keys)
    or to replace an entry by doing
    code_folders_analysis[i] = get_analysis_dict(code_key, py_file_contents)
    """
    d = analyze_data_prep_code(py_file_contents)
    if 'result' in d:
        d['result'] = if_string_json_decode_it(d['result'])
    # Record which code file this analysis is about
    d['code'] = code_key
    return d


In [16]:
import dol 
from collections import ChainMap
from pathlib import Path

# Code files to leave out of the analysis
exclude = {'arxiv.py', 'ultra_chat.py', 'embeddings_of_aggregations.py'}
# Store of the python files of a folder: require the '.py' extension (not just
# a name ending in "py"), skip keys starting with '_' and the excluded files
PyStore = dol.Pipe(
    dol.TextFiles,
    dol.filt_iter(
        filt=lambda x: x.endswith('.py') and not x.startswith('_') and x not in exclude
    ),
)

s = PyStore(__import__('imbed_data_prep').__path__[0])
# Code living outside of the imbed_data_prep package
extra_files = {
    'xv.data_access': Path(__import__('xv').data_access.__file__).read_text(),
}

code_store = ChainMap(
    s,
    extra_files
)
list(code_store)

['xv.data_access',
 'prompt_injections.py',
 'wordnet_words.py',
 'hcp.py',
 'jersey_laws.py',
 'github_repos.py',
 'wildchat.py',
 'eurovis.py',
 'lmsys_ai_conversations.py',
 'twitter_sentiment.py']

In [17]:
refresh_code_folders_analysis = False
code_folders_analysis_save_path = 'code_folders_analysis.json'

In [18]:
import json 

# Either recompute the analysis of every code file (and save it), or load the
# analysis saved by a previous run.
if refresh_code_folders_analysis:

    def analyze_data_prep_code_of_folder(code_store, verbose=True):
        """Yield the analysis dict of each (key, contents) item of code_store."""
        for code_key, py_file_contents in code_store.items():
            if verbose:
                print(f"  Analyzing {code_key}")
            yield get_analysis_dict(code_key, py_file_contents)

    code_folders_analysis = list(analyze_data_prep_code_of_folder(code_store))

    # sometimes the AI didn't manage to catch the urls, so we can manually add them here

    backup_urls = {
        'hcp.py': {
            'url': 'HCP_PUBS_SRC_KEY',
            'url_type': 'env_var',
        },
        'eurovis.py': {
            'url': 'https://81593031860c2ee4ad53a08892f7e95d.r2.cloudflarestorage.com/cosmograph/projects/eurovis/raw_data.csv',
            'url_type': 'http',
            # 'filepath': '/Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/eurovis/raw_data.csv'
        },
        # 'embeddings_of_aggregations.py': {"NOT A DATA PREP MODULE"}
    }

    # Only fill in the backup for entries where neither 'url' nor 'urls' was found
    for d in code_folders_analysis:
        if not d.get('url') and not d.get('urls') and d['code'] in backup_urls:
            d.update(backup_urls[d['code']])

    # Context manager so the file is flushed and closed right away
    with open(code_folders_analysis_save_path, 'w') as f:
        json.dump(code_folders_analysis, f)

else:
    with open(code_folders_analysis_save_path) as f:
        code_folders_analysis = json.load(f)


print(f"{len(code_folders_analysis)} code folders analyzed")
[d['code'] for d in code_folders_analysis]


10 code folders analyzed


['xv.data_access',
 'prompt_injections.py',
 'wordnet_words.py',
 'hcp.py',
 'jersey_laws.py',
 'github_repos.py',
 'wildchat.py',
 'eurovis.py',
 'lmsys_ai_conversations.py',
 'twitter_sentiment.py']

In [19]:
len(code_folders_analysis)

10

In [20]:
code_folders_analysis_by_key = {d['code']: d for d in code_folders_analysis}
code_analysis_keys = list(code_folders_analysis_by_key)
code_analysis_keys

['xv.data_access',
 'prompt_injections.py',
 'wordnet_words.py',
 'hcp.py',
 'jersey_laws.py',
 'github_repos.py',
 'wildchat.py',
 'eurovis.py',
 'lmsys_ai_conversations.py',
 'twitter_sentiment.py']

# Extracting information from the folders

In [21]:
data_folders = {
    'imbed_saves': "/Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves",
    'figiri': "/Users/thorwhalen/Dropbox/_odata/figiri/"
}


## code_and_data_mapping

In [22]:
import dol 

# One dol.Files store per data folder, flattened; keys are indexed below as
# (group, relative_path)
t = dol.FlatReader(
    {group: dol.Files(root) for group, root in data_folders.items()}
)

# (project group, first component of the relative path) for every file
pairs = {(key[0], key[1].split('/')[0]) for key in t}
# Top-level names, without the aliases and the stray parquet files
data_folder_names = sorted(
    name
    for name in {key[1].split('/')[0] for key in t}
    if not name.endswith((' alias', '.parquet'))
)
# project name -> project group
project_folders = {name: group for group, name in pairs}
data_folder_names

['eurovis',
 'github_repos',
 'harris_vs_trump',
 'hcp',
 'lmsys-chat-1m',
 'new_years_resolutions',
 'prompt-injections',
 'quotes',
 'spotify_playlists',
 'twitter_sentiment',
 'wildchat',
 'wordnet_words']

In [23]:
refresh_code_and_data_matching = False

if refresh_code_and_data_matching:
    import oa

    f = oa.prompt_json_function(
        """
    Here's a list of data prep code files and a list of data folders.
                                
    Give me a json object that maps the data folders to the relevant code file
    that they use. Don't include those pairs that do not match.

    data_folders: {data_folders}    
    code_files: {code_files}
    """,
        # json_schema="the json schema could be just a json object with keys that are the "
        # "data folder names and values that are the code files",
    )

    code_and_data_mapping = f(data_folder_names, code_files=code_analysis_keys)['result']
else:
    code_and_data_mapping = {
        'eurovis': 'eurovis.py',
        'github_repos': 'github_repos.py',
        'hcp': 'hcp.py',
        'lmsys-chat-1m': 'lmsys_ai_conversations.py',
        'prompt-injections': 'prompt_injections.py',
        'twitter_sentiment': 'twitter_sentiment.py',
        'wildchat': 'wildchat.py',
        'wordnet_words': 'wordnet_words.py',
    }

code_and_data_mapping

{'eurovis': 'eurovis.py',
 'github_repos': 'github_repos.py',
 'hcp': 'hcp.py',
 'lmsys-chat-1m': 'lmsys_ai_conversations.py',
 'prompt-injections': 'prompt_injections.py',
 'twitter_sentiment': 'twitter_sentiment.py',
 'wildchat': 'wildchat.py',
 'wordnet_words': 'wordnet_words.py'}

In [24]:
_remaining_code_names = sorted(set(code_analysis_keys) - set(code_and_data_mapping.values()))
_remaining_data_folders = sorted(set(data_folder_names) - set(code_and_data_mapping.keys()))
_remaining_data_folders, _remaining_code_names

(['harris_vs_trump', 'new_years_resolutions', 'quotes', 'spotify_playlists'],
 ['jersey_laws.py', 'xv.data_access'])

In [25]:
# just to test
assert project_folders['wildchat'] == 'imbed_saves'

## gather information for each project

In [26]:
import dol 
import os 

def code_and_data_files(name):
    """
    Gather the data files (and code file, if any) of the project called ``name``.

    Returns a dict with keys 'project_group', 'data_filenames' and
    'data_filepaths' (path -> size, sorted by size), plus 'code_file' and
    'code_contents' when ``code_and_data_mapping`` has an entry for ``name``.
    Relies on the module-level ``project_folders``, ``data_folders``,
    ``code_and_data_mapping`` and ``code_store``.
    """
    group = project_folders[name]
    folder = os.path.join(data_folders[group], name)
    files = dol.filt_iter(
        dol.Files(folder, max_levels=0),
    )
    paths_with_sizes = sizes_of_files_paths(
        [os.path.join(folder, fn) for fn in files]
    )

    info = {
        'project_group': group,
        'data_filenames': [os.path.basename(fp) for fp in paths_with_sizes],
        'data_filepaths': paths_with_sizes,
    }

    code_file = code_and_data_mapping.get(name, None)
    if not code_file:
        return info

    # Look the contents up first: a missing code file raises KeyError
    code_contents = code_store[code_file]
    info['code_file'] = code_file
    info['code_contents'] = code_contents
    return info

def sizes_of_files_paths(filepaths):
    """
    Map each file path to its size in bytes, ordered from smallest to largest.

    Files of equal size keep the order in which they were given.
    """
    import os

    size_of = {}
    for path in filepaths:
        size_of[path] = os.path.getsize(path)
    # sorted() is stable, so ties keep their insertion order
    by_increasing_size = sorted(size_of, key=size_of.get)
    return {path: size_of[path] for path in by_increasing_size}

# test
t = code_and_data_files('hcp')
list(t)


['project_group',
 'data_filenames',
 'data_filepaths',
 'code_file',
 'code_contents']

In [27]:
def table_info(filepath):
    """
    Try to load the file at ``filepath`` as a table with ``tabled.get_table``.

    Returns a dict with the table's ``shape`` and its ``first_row``
    (``df.iloc[0]``), or None if the file could not be loaded as a table or
    has no first row.
    """
    import tabled

    try:
        df = tabled.get_table(filepath)
        return dict(
            shape=df.shape,
            first_row=df.iloc[0],
        )
    except Exception:
        # Best effort: many of the files are not tables at all. Catching
        # Exception (rather than using a bare except) lets KeyboardInterrupt
        # and SystemExit through.
        return None


problematic_files = tuple(
    [
        '/Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/wildchat/embeddings.parquet',
    ]
)


def gather_info_for_name(name, skip=problematic_files, verbose=True):
    """
    Gather everything known about the project called ``name``.

    On top of what ``code_and_data_files`` returns, the result has a
    'num_of_data_files' count and a 'tables_info' dict mapping the basename of
    each data file to its ``table_info`` (files listed in ``skip`` are left out).
    Progress is logged through ``clog`` when ``verbose`` is true.
    """
    log = clog(verbose)
    log(f"Getting info for {name}")

    info = code_and_data_files(name)
    info['num_of_data_files'] = len(info['data_filepaths'])

    tables = {}
    for filepath in info['data_filepaths']:
        if filepath in skip:
            log("     ---> Skipping", filepath)
        else:
            log("   Getting table info for", filepath)
            tables[os.path.basename(filepath)] = table_info(filepath)

    info['tables_info'] = tables
    return info


from tabled import pandas_json_dumps
import dol
from functools import partial
from lkj import clog


JsonFiles = dol.wrap_kvs(
    dol.TextFiles, 
    value_encoder=partial(pandas_json_dumps, indent=2), 
    value_decoder=json.loads,
    key_codec=dol.KeyCodecs.suffixed(suffix='.json')
)

# json_store = dol.mk_dirs_if_missing(JsonFiles('data_folders_info'))
project_info_store = dol.add_missing_key_handling(
    dol.mk_dirs_if_missing(JsonFiles('data_folders_info')),
    missing_key_callback=lambda store, k: gather_info_for_name(k),
)

    

  warn(


# Refactor to Dacc

In [41]:
# project_info_store is a cached_property of DataInfoDacc: access it, don't call it
s = dacc.project_info_store

In [42]:
list(s)

['spotify_playlists',
 'hcp',
 'prompt-injections',
 'twitter_sentiment',
 'new_years_resolutions',
 'quotes',
 'wordnet_words',
 'eurovis',
 'harris_vs_trump',
 'wildchat',
 'lmsys-chat-1m',
 'github_repos']

# Scrap stores

In [None]:
import dol
import tabled
import re


# Raw strings throughout: '\ ' is an invalid escape in a normal string literal.
# Extension patterns escape the dot and are anchored at the end, so that they
# only match paths that really end with that extension.
exclude_patterns = [
    r'.*flat_en_embeddings.*',  # folder containing chunks of data
    r'hcp/tmp/.*',  # folder containing chunks of data
    r'.*\.pdf$',  # pdf files
    r'.*\.png$',  # png files
    r'.*\.mov$',  # mov files
    r'.* aliased',  # aliased folders

]

# Store wrapper keeping only the keys that match none of the patterns
# (re.match: patterns are anchored at the start of the key)
exclude_filt = dol.filt_iter(
    filt=lambda x: not any(re.match(p, x) for p in exclude_patterns)
)

exclude_for_tables = [
    r'.*\.zip$'
]
exclude_for_tables_filt = dol.filt_iter(
    filt=lambda x: not any(re.match(p, x) for p in exclude_for_tables)
)


list(s)

['eurovis/embeddings_df.parquet',
 'eurovis/planar_embeddings.parquet',
 'eurovis/clusters_df.parquet',
 'eurovis/cluster_21_labels.parquet',
 'eurovis/merged_artifacts.parquet',
 'eurovis/embeddable.parquet',
 'eurovis/raw_data.csv',
 'eurovis/cluster_13_labels.parquet',
 'eurovis/oa_embeddings_batch_keys.pkl',
 'lmsys-chat-1m/flat_en_conversation_grouped_embeddings.parquet',
 'lmsys-chat-1m/planar_embeddings_for_a_forth_of_data.parquet',
 'lmsys-chat-1m/planar_embeddings_for_a_forth_of_data.tsv.zip',
 'lmsys-chat-1m/Icon\r',
 'lmsys-chat-1m/planar_embeddings_grouped.tsv',
 'lmsys-chat-1m/planar_embeddings_for_a_forth_of_data.tsv',
 'lmsys-chat-1m/pca500.pkl',
 'lmsys-chat-1m/planar_embeddings_grouped.pkl',
 'lmsys-chat-1m/pca500_embeddings.npy',
 'lmsys-chat-1m/kmeans_7_clusters_indices.pkl',
 'lmsys-chat-1m/planar_embeddings_pca500.npy',
 'lmsys-chat-1m/lmsys_with_planar_embeddings_pca500.parquet',
 'lmsys-chat-1m/planar_embeddings_of_grouped_conversations_with_metadata.tsv.zip',
 '

## Diagnosis and WIP

### Looking through the code_folders_analysis

Here, we're going to look through the `code_folders_analysis` to see if everything is fine.

(When it's not, we fix the problem and fold the fix into the logic above, so that a single run gets everything right.)

In [41]:
def get_entries_with_empty_url_or_urls(code_folders_analysis):
    """
    Return the indices of the analysis entries that have neither a 'url' nor
    any 'urls'.

    A missing key counts as empty (``.get`` is used), so entries lacking these
    fields are reported instead of raising ``KeyError``.
    """
    return [
        i
        for i, d in enumerate(code_folders_analysis)
        if not d.get('url') and not d.get('urls')
    ]

t = get_entries_with_empty_url_or_urls(code_folders_analysis)
print(t)
[code_folders_analysis[i]['code'] for i in t]

[]


[]

In [None]:
import imbed_data_prep.eurovis

known_urls = {
    'hcp.py': {
        'url': 'HCP_PUBS_SRC_KEY',
        'url_type': 'env_var',
    },
    'eurovis.py': {
        'url': 'https://81593031860c2ee4ad53a08892f7e95d.r2.cloudflarestorage.com/cosmograph/projects/eurovis/raw_data.csv',
        'url_type': 'http',
        'filepath': '/Users/thorwhalen/Dropbox/_odata/app_data/imbed/saves/eurovis/raw_data.csv'
    },
    # 'embeddings_of_aggregations.py': {"NOT A DATA PREP MODULE"}
}

In [19]:
i = 0

In [23]:
import json
from pprint import pprint

# Show entry number i, then move i to the next entry (wrapping around), so
# that re-running this cell walks through all the entries.
print(f"--- {i=} -----")
t = code_folders_analysis[i]
if isinstance(t, str):
    # Decode into t: assigning the result to `json` would rebind the module
    # name and leave t as an undecoded string.
    t = json.loads(t)
pprint(t)
i = (i + 1) % len(code_folders_analysis)

--- i=3 -----
{'code': 'hcp.py',
 'data_keys': {'aggregate_titles_embeddings_key': 'Filepath for aggregated '
                                                  'titles embeddings.',
               'citations_src_key': 'Filepath for citations data.',
               'embeddings_src_key': 'Filepath for embedding data.',
               'info_src_key': 'Filepath for additional publication info.',
               'src_key': 'Path to source key based on local config or '
                          'environment variable.'},
 'description': 'NOT DATA PREP CODE: This code primarily defines a class to '
                'handle various operations on HCP (Human Connectome Project) '
                'publications data and their citations. It retrieves and '
                'processes embeddings and citation data, but it does not '
                'include explicit downloading or acquiring of raw data from an '
                'external source. The only potential environment variable '
                