In [None]:
from lmf import db
from pathlib import Path

In [None]:
# Table handle obtained by indexing the imported `db` object with "ec_galrep";
# get_galrep() queries it through its .search() method.
# NOTE(review): presumably the LMFDB elliptic-curve Galois-representation
# table -- confirm against the lmf package documentation.
ec_galrep = db["ec_galrep"]

In [None]:
def get_galrep(label):
    """Return every ``ec_galrep`` search hit for *label*, as a list.

    The query matches on the ``lmfdb_label`` field. The result of
    ``ec_galrep.search`` is materialised into a list so callers can test it
    for emptiness and index into it.
    """
    query = {"lmfdb_label": label}
    hits = ec_galrep.search(query)
    return list(hits)

In [None]:
# Input directory: text exports retrieved from the LMFDB with the default
# column selection.
raw_dir = Path("lmfdb-data")

# Output directory: created up front (including missing parents) so that the
# per-file results can be written into it; an existing directory is fine.
out_dir = Path("adelic")
out_dir.mkdir(exist_ok=True, parents=True)

In [None]:
def _read_input_lines(path):
    """Read *path* as text and return ``(lines, encoding_used)``.

    UTF-8 is tried first. latin-1 is the fallback: it maps every possible
    byte to a character, so it cannot raise ``UnicodeDecodeError``. Any
    encoding listed after it (the original list also had cp1252 and
    iso-8859-1) would never be reached, so those entries were dropped.

    Returns ``(None, None)`` if no encoding in the list could decode the
    file, which can only happen if the latin-1 fallback is removed.
    """
    for encoding in ("utf-8", "latin-1"):
        try:
            with path.open("r", encoding=encoding) as f:
                return f.readlines(), encoding
        except UnicodeDecodeError:
            continue
    return None, None


def _parse_rows(lines):
    """Extract ``(label, coefficient, level)`` string triples from *lines*.

    Lines are tab-separated. Blank lines, lines starting with ``#`` and lines
    with fewer than 8 fields are ignored. The label is the first field, the
    level the seventh and the coefficient the last one; surrounding double
    quotes are removed from each.
    """
    rows = []
    for line in lines:
        # NOTE(review): strip() also removes leading/trailing tabs, so a row
        # whose first or last column is empty loses that column and
        # parts[-1] then refers to the previous one. Kept as in the original;
        # confirm the exports never have an empty final column.
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        parts = line.split("\t")
        if len(parts) < 8:  # Ensure we have enough parts
            continue
        rows.append((
            parts[0].strip('"'),
            parts[-1].strip('"'),
            parts[6].strip('"'),
        ))
    return rows


# For every regular file in raw_dir: parse its rows, look up each label with
# get_galrep(), and write "label;coefficient;adelic_gens;level" lines to a
# file of the same name under out_dir.
for fp in sorted(raw_dir.glob("*")):          # use "*.txt" if you only want txt
    if not fp.is_file():
        continue

    # Read and decode the input BEFORE touching the output file: opening the
    # output with "w" truncates it, so doing that first would wipe a previous
    # result (or leave an empty file behind) whenever the input is skipped or
    # fails to read.
    file_content, encoding_used = _read_input_lines(fp)
    if file_content is None:
        print(f"Could not decode {fp.name} with any common encoding, skipping...")
        continue

    print(f"Successfully read {fp.name} using {encoding_used} encoding")

    rows = _parse_rows(file_content)

    # write output for this input file, same filename under adelic/
    out_fp = out_dir / fp.name
    with out_fp.open("w", encoding="utf-8") as out:
        for label, coefficient, level in rows:
            recs = get_galrep(label)
            if not recs:
                # No record for this label: nothing to write.
                continue
            # Only the first matching record is used.
            adelic_gens = recs[0].get("adelic_gens")
            if adelic_gens is None:
                continue

            # Compact representation: str() of the value with all spaces removed.
            gens_str = str(adelic_gens).replace(" ", "")
            out.write(f"{label};{coefficient};{gens_str};{level}\n")
            # Flush after every record (as in the original) so partial
            # results are on disk if the run is interrupted.
            out.flush()