In [None]:
from google.colab import files
import zipfile
import os
import pandas as pd
import glob

def combine_csv_from_zip(
    zip_filename: str,
    extract_folder: str = 'unzipped_folder',
    output_filename: str = 'combined_embeddings.csv',
    download: bool = True,
) -> "pd.DataFrame":
    """
    Extract a zip archive of CSVs, stack them vertically, save the result and optionally download it.

    Intended for the per-protein embedding archives (SPO11, DMC1, MND1, MSH4, MSH5, REC8).
    The zip file must already exist on disk (e.g. uploaded via ``files.upload()``).

    Args:
        zip_filename (str): Path of the zip file (e.g., 'DMC1_CLS_Embeddings.zip').
        extract_folder (str): Folder to extract the zip into (default: 'unzipped_folder').
            Any existing folder with this name is deleted first.
        output_filename (str): Name of the combined CSV file to save (default: 'combined_embeddings.csv').
        download (bool): If True (default), trigger a Colab browser download of
            ``output_filename`` after saving. Pass False to only write the file.

    Returns:
        pandas.DataFrame: All CSV rows concatenated in file-name order with a fresh index.

    Raises:
        ValueError: If no ``*.csv`` file is found in the extracted folder.
    """
    import shutil

    # Start from an empty extraction folder so files left over from a previous
    # archive cannot leak into this run.
    if os.path.exists(extract_folder):
        shutil.rmtree(extract_folder)

    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

    # Candidate data folders directly under the extraction root. '__MACOSX' is
    # a metadata folder added by the macOS archiver and never holds real CSVs.
    # Sorted so the fallback choice below does not depend on listing order.
    extracted_subfolders = sorted(
        entry for entry in os.listdir(extract_folder)
        if entry != '__MACOSX' and os.path.isdir(os.path.join(extract_folder, entry))
    )

    if len(extracted_subfolders) == 1:
        subfolder_path = os.path.join(extract_folder, extracted_subfolders[0])
    elif not extracted_subfolders:
        # The CSVs were zipped without a wrapping folder: read them from the root.
        print(f"⚠️ Warning: No folder found in '{extract_folder}'; looking for CSVs at its top level.")
        subfolder_path = extract_folder
    else:
        print(f"⚠️ Warning: Multiple folders found in '{extract_folder}': {extracted_subfolders}")
        # Prefer the folder named after the archive; basename() so that a
        # zip_filename given with a directory part can still match.
        folder_guess = os.path.splitext(os.path.basename(zip_filename))[0]
        if folder_guess in extracted_subfolders:
            subfolder_path = os.path.join(extract_folder, folder_guess)
        else:
            subfolder_path = os.path.join(extract_folder, extracted_subfolders[0])

    print(f"Extracted folder: {subfolder_path}")

    # glob returns files in arbitrary order; sort for a reproducible row order.
    # escape() keeps characters such as '[' in the folder name literal.
    csv_files = sorted(glob.glob(os.path.join(glob.escape(subfolder_path), '*.csv')))
    print(f"Found {len(csv_files)} CSV files.")

    if not csv_files:
        # Fail with an actionable message instead of pandas' generic
        # "No objects to concatenate".
        raise ValueError(
            f"No CSV files found in '{subfolder_path}' (extracted from '{zip_filename}')."
        )

    combined_df = pd.concat(
        [pd.read_csv(csv_path) for csv_path in csv_files],
        axis=0,
        ignore_index=True,
    )

    print(f"✅ Combined shape: {combined_df.shape}")

    # Save, then hand the file to the browser only when requested.
    combined_df.to_csv(output_filename, index=False)
    if download:
        files.download(output_filename)

    return combined_df



In [None]:
ZIP_FILENAME = 'non-meiosis_cls_embedding.zip'

uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and select the zip archive.")

# When a file with the same name already exists, the upload is stored under a
# new name such as 'non-meiosis_cls_embedding (1).zip', leaving the older
# archive at ZIP_FILENAME. Write the bytes that were just uploaded to the
# expected path so the freshest archive is the one that gets combined.
with open(ZIP_FILENAME, 'wb') as zip_out:
    zip_out.write(next(iter(uploaded.values())))

df = combine_csv_from_zip(ZIP_FILENAME, output_filename='non-meiosis_cls_embedding.csv')

Saving non-meiosis_cls_embedding.zip to non-meiosis_cls_embedding (1).zip
Extracted folder: unzipped_folder/non-meiosis_cls_embedding
Found 54 CSV files.
✅ Combined shape: (2081, 641)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load the feature table and list the columns that contain missing values,
# most-affected first (an empty Series means nothing is missing).
df = pd.read_csv("non-meiosis_combined_aa_features.csv")
missing_counts = df.isna().sum()
print(
    missing_counts.loc[missing_counts.gt(0)]
    .sort_values(ascending=False)
)

Series([], dtype: int64)


In [None]:
# Trigger a browser download of the feature CSV via google.colab's `files` helper.
# NOTE(review): this file name ('..._amino_acid_features.csv') differs from the
# one read in the previous cell ('non-meiosis_combined_aa_features.csv') —
# confirm which file is the intended one.
files.download("non-meiosis_combined_amino_acid_features.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>