In [None]:
from google.colab import files
import zipfile
import os
import pandas as pd
import glob
import shutil

def combine_csv_from_zip(zip_filename,
                         extract_folder='unzipped_folder',
                         output_filename='combined_embeddings.csv',
                         taxa_map=None):
    """
    Extract a zip of CSVs, stack them vertically with a 'taxa' column, save and download the result.

    The zip must already exist at `zip_filename` (this function does not upload it).
    Any existing `extract_folder` is deleted before extraction.

    Args:
        zip_filename (str): Path of the zip file (e.g., 'DMC1_CLS_Embeddings.zip').
        extract_folder (str): Folder to extract the zip into (default: 'unzipped_folder').
        output_filename (str): Name of the combined CSV file to save and download (default: 'combined_embeddings.csv').
        taxa_map (dict): A mapping of filename substrings to taxa values. The first key
                         (in dict order) contained in a CSV's file name wins; files that
                         match no key are labelled 'unknown'.
                         Example: {'cleaned_DMC1_A_CLS_embeddings': 'arthropods', ...}

    Returns:
        pandas.DataFrame: All CSV rows concatenated (files taken in sorted name order),
        with the added 'taxa' column.

    Raises:
        ValueError: If no CSV file is found in the selected folder.
    """

    # Default taxa_map if not provided
    if taxa_map is None:
        taxa_map = {
            'cleaned_DMC1_A_CLS_embeddings': 'arthropods',
            'cleaned_DMC1_C_CLS_embeddings': 'chordates',
            'cleaned_DMC1_NOTCA_CLS_embeddings': 'other animals',
            'cleaned_DMC1_fungi_CLS_embeddings': 'fungi',
            'cleaned_DMC1_plant_CLS_embeddings': 'plants'
        }

    # Clean up previous extract folder if exists
    if os.path.exists(extract_folder):
        shutil.rmtree(extract_folder)

    # Unzip
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

    # Candidate top-level folders, sorted so the fallback choice is deterministic.
    # '__MACOSX' is skipped: archives made with the macOS Finder commonly carry it
    # and it holds metadata only, never the CSVs.
    extracted_subfolders = sorted(
        name for name in os.listdir(extract_folder)
        if name != '__MACOSX' and os.path.isdir(os.path.join(extract_folder, name))
    )

    if not extracted_subfolders:
        # Flat archive: the CSVs sit directly at the root of the zip.
        print(f"ℹ️ No sub-folder found in '{extract_folder}'; reading CSVs from its top level.")
        subfolder_path = extract_folder
    elif len(extracted_subfolders) == 1:
        subfolder_path = os.path.join(extract_folder, extracted_subfolders[0])
    else:
        print(f"⚠️ Warning: Multiple folders found in '{extract_folder}': {extracted_subfolders}")
        # Guess from the zip's base name; basename() keeps the guess usable when
        # zip_filename includes a directory component.
        folder_guess = os.path.splitext(os.path.basename(zip_filename))[0]
        if folder_guess in extracted_subfolders:
            subfolder_path = os.path.join(extract_folder, folder_guess)
        else:
            subfolder_path = os.path.join(extract_folder, extracted_subfolders[0])

    print(f"📂 Extracted folder: {subfolder_path}")

    # Find all CSVs (sorted: glob order is filesystem-dependent)
    csv_files = sorted(glob.glob(os.path.join(subfolder_path, '*.csv')))
    print(f"🔍 Found {len(csv_files)} CSV files.")

    if not csv_files:
        # pd.concat([]) would raise a ValueError as well, but without saying why.
        raise ValueError(
            f"No CSV files found in '{subfolder_path}' (extracted from '{zip_filename}')."
        )

    # Combine CSVs with taxa
    dfs = []
    for file in csv_files:
        df = pd.read_csv(file)

        # Infer taxa from filename: first taxa_map key contained in the name
        base_name = os.path.basename(file)
        matched_taxa = next(
            (value for key, value in taxa_map.items() if key in base_name),
            None,
        )

        if matched_taxa is None:
            print(f"⚠️ Warning: Could not match taxa for file '{base_name}'. Defaulting to 'unknown'.")
            matched_taxa = 'unknown'

        df['taxa'] = matched_taxa
        dfs.append(df)

    combined_df = pd.concat(dfs, axis=0, ignore_index=True)

    print(f"✅ Combined shape: {combined_df.shape}")
    print(combined_df['taxa'].value_counts())

    # Save and download (files is google.colab.files, imported at the top of the notebook)
    combined_df.to_csv(output_filename, index=False)
    files.download(output_filename)

    return combined_df


In [None]:
# uploaded = files.upload()  # upload SPO11_AA_Features.zip

In [None]:
"""
taxa_map = {
            'cleaned_SPO11_A_full_features': 'arthropods',
            'cleaned_SPO11_C_full_features': 'chordates',
            'cleaned_SPO11_NOTCA_full_features': 'other animals',
            'cleaned_SPO11_fungi_full_features': 'fungi',
            'cleaned_SPO11_plant_full_features': 'plants'
        }
combine_csv_from_zip(zip_filename="SPO11_AA_Features.zip",
                         output_filename='SPO11_combined_features.csv',
                         taxa_map=taxa_map)
"""

'\ntaxa_map = {\n            \'cleaned_SPO11_A_full_features\': \'arthropods\',\n            \'cleaned_SPO11_C_full_features\': \'chordates\',\n            \'cleaned_SPO11_NOTCA_full_features\': \'other animals\',\n            \'cleaned_SPO11_fungi_full_features\': \'fungi\',\n            \'cleaned_SPO11_plant_full_features\': \'plants\'\n        }\ncombine_csv_from_zip(zip_filename="SPO11_AA_Features.zip",  \n                         output_filename=\'SPO11_combined_features.csv\',\n                         taxa_map=taxa_map)\n'

In [None]:
# Colab file-upload step. add_taxa_to_non_meiosis() below reads
# 'non-meiosis_combined_aa_features.csv' by default, so presumably that is the
# file to upload here — TODO confirm.
uploaded = files.upload()
def add_taxa_to_non_meiosis(input_file='non-meiosis_combined_aa_features.csv', output_file='non-meiosis_combined_aa_features_with_taxa.csv'):
    """
    Tag every row of a CSV with the constant taxa value 'non-meiosis sources'.

    Reads `input_file`, sets (or overwrites) its 'taxa' column, writes the
    result to `output_file` without the index, and returns the tagged frame.
    """
    tagged = pd.read_csv(input_file).assign(taxa='non-meiosis sources')
    tagged.to_csv(output_file, index=False)
    print(f"✅ Added 'taxa' column to {input_file}, saved as {output_file}.")
    return tagged

# Example usage: run with the default filenames, i.e. read
# 'non-meiosis_combined_aa_features.csv' and write
# 'non-meiosis_combined_aa_features_with_taxa.csv' (both relative paths).
non_meiosis_df = add_taxa_to_non_meiosis()

✅ Added 'taxa' column to non-meiosis_combined_aa_features.csv, saved as non-meiosis_combined_aa_features_with_taxa.csv.


In [None]:
def combine_embeddings(meiosis_csv, non_meiosis_csv, output_csv, label_meiosis=1, label_non_meiosis=0):
    """
    Combines a meiosis CSV file (e.g., SPO11) and a non-meiosis CSV file, adds labels, and saves as a new CSV.

    Rows of the meiosis file come first, followed by the non-meiosis rows; the
    index is renumbered. A 'label' column is added to (or overwritten in) both inputs.

    Parameters:
    - meiosis_csv: Path to meiosis CSV file (e.g., SPO11_combined_cls_embeddings.csv).
    - non_meiosis_csv: Path to non-meiosis CSV file (e.g., non-meiosis_combined_cls_embeddings.csv).
    - output_csv: Path to save combined CSV file.
    - label_meiosis: Label for meiosis proteins (default 1).
    - label_non_meiosis: Label for non-meiosis proteins (default 0).

    Returns:
    - The combined pandas DataFrame (same content as written to output_csv).
    """

    # Load CSVs
    meiosis_df = pd.read_csv(meiosis_csv)
    non_meiosis_df = pd.read_csv(non_meiosis_csv)

    # pd.concat takes the union of columns and fills the gaps with NaN without
    # complaint, so surface any mismatch before it ends up in the output file.
    only_meiosis = [c for c in meiosis_df.columns if c not in non_meiosis_df.columns]
    only_non_meiosis = [c for c in non_meiosis_df.columns if c not in meiosis_df.columns]
    if only_meiosis or only_non_meiosis:
        print(
            "⚠️ Warning: input columns differ; missing values will be NaN. "
            f"Only in '{meiosis_csv}': {only_meiosis}. "
            f"Only in '{non_meiosis_csv}': {only_non_meiosis}."
        )

    # Add labels
    meiosis_df['label'] = label_meiosis
    non_meiosis_df['label'] = label_non_meiosis

    # Combine the dataframes
    combined_df = pd.concat([meiosis_df, non_meiosis_df], axis=0, ignore_index=True)

    # Check combined shape and class distribution
    print(f"✅ Combined shape: {combined_df.shape}")
    print("Class distribution:\n", combined_df['label'].value_counts())

    # Save combined CSV
    combined_df.to_csv(output_csv, index=False)
    print(f"✅ Combined CSV saved as: {output_csv}")

    # Returned for consistency with the other helpers in this notebook.
    return combined_df

In [None]:
# combine_embeddings("SPO11_combined_features.csv",'non-meiosis_combined_aa_features_with_taxa.csv','non-meiosis_spo11_combined_aa_features_with_taxa.csv')

In [None]:
"""
df = pd.read_csv('non-meiosis_spo11_combined_aa_features_with_taxa.csv')

# Rename first column to 'ID'
first_col = df.columns[0]
df = df.rename(columns={first_col: 'ID'})
df.to_csv('non-meiosis_spo11_combined_aa_features_with_taxa.csv', index=False)
"""

"\ndf = pd.read_csv('non-meiosis_spo11_combined_aa_features_with_taxa.csv')\n\n# Rename first column to 'ID'\nfirst_col = df.columns[0]\ndf = df.rename(columns={first_col: 'ID'})\ndf.to_csv('non-meiosis_spo11_combined_aa_features_with_taxa.csv', index=False)\n"

In [None]:
from google.colab import files
import pandas as pd
from sklearn.preprocessing import StandardScaler

def standardize_features_with_taxa(input_csv, output_csv):
    '''
    Scale the feature columns of a CSV with StandardScaler and write the result.

    Column roles are positional: the first column is the ID, the second-to-last
    is taxa, the last is the label; every column in between is a feature. The
    ID, taxa, and label columns are carried over unscaled.

    Parameters:
        input_csv (str): Path to input CSV file
        output_csv (str): Path for output CSV

    Returns:
        DataFrame laid out as ID, scaled features, taxa, label.
    '''
    table = pd.read_csv(input_csv)
    column_names = table.columns

    # Positional roles of the non-feature columns.
    id_name = column_names[0]
    taxa_name = column_names[-2]
    label_name = column_names[-1]
    feature_names = column_names[1:-2]

    # fit_transform hands back a plain array, so the feature names are re-attached.
    scaled_values = StandardScaler().fit_transform(table[feature_names])
    scaled_block = pd.DataFrame(scaled_values, columns=feature_names)

    # Reassemble in the original column order.
    pieces = [table[id_name], scaled_block, table[taxa_name], table[label_name]]
    standardized = pd.concat(pieces, axis=1)

    standardized.to_csv(output_csv, index=False)
    print(f"✅ Saved standardized data to {output_csv}")

    return standardized

In [None]:
# standardize_features_with_taxa('non-meiosis_spo11_combined_aa_features_with_taxa.csv',
#                                'std_non-meiosis_spo11_combined_aa_features_with_taxa.csv')

In [None]:
# uploaded = files.upload()  # upload spo11_mrmr_selected_aa_50_with_entropy.csv

Saving spo11_mrmr_selected_aa_50_with_entropy.csv to spo11_mrmr_selected_aa_50_with_entropy.csv


In [None]:
# uploaded = files.upload()  # upload spo11_mrmr_selected_aa_50.csv

In [None]:
import pandas as pd

def combine_with_selected_features(
    input_csv='std_non-meiosis_spo11_combined_aa_features_with_taxa.csv',
    selected_features_csv='spo11_mrmr_selected_aa_50_with_entropy.csv',
    output_csv='spo11_mrmr_selected_aa_features_50_with_taxa.csv'
):
    """
    Combines selected features with original ID, taxa, and label columns.

    Rows are paired by position (row i of one file with row i of the other), so
    both files must list the same samples in the same order. Only the row count
    can be verified here; the ordering is the caller's responsibility.

    Parameters:
        input_csv (str): Path to standardized CSV whose first column is the ID,
            second-to-last column is taxa, and last column is the label.
        selected_features_csv (str): Path to the selected-features CSV. Its first
            and last columns are dropped; everything in between is kept as features.
        output_csv (str): Path to save the final combined CSV.

    Returns:
        pandas.DataFrame: ID, selected features, taxa, label (in that order).

    Raises:
        ValueError: If the two CSVs do not have the same number of rows.
    """
    # Load datasets
    df_main = pd.read_csv(input_csv)
    df_selected = pd.read_csv(selected_features_csv)

    # A positional concat of frames with different lengths pads the shorter one
    # with NaN instead of failing, which would corrupt the output silently.
    if len(df_main) != len(df_selected):
        raise ValueError(
            f"Row count mismatch: '{input_csv}' has {len(df_main)} rows but "
            f"'{selected_features_csv}' has {len(df_selected)} rows."
        )

    # Get columns
    id_col = df_main.columns[0]     # First column
    taxa_col = df_main.columns[-2]  # Second last
    label_col = df_main.columns[-1] # Last

    # Extract parts from main
    df_id = df_main[[id_col]]
    df_taxa = df_main[[taxa_col]]
    df_label = df_main[[label_col]]

    # Extract selected features (second column through second-to-last column)
    df_selected_features = df_selected.iloc[:, 1:-1]

    # Combine
    df_final = pd.concat([df_id, df_selected_features, df_taxa, df_label], axis=1)

    # Save
    df_final.to_csv(output_csv, index=False)
    print(f"✅ Combined dataset saved to {output_csv}")
    print(f"Shape: {df_final.shape}")

    return df_final


In [None]:
# combine_with_selected_features()

In [None]:
# files.download('spo11_mrmr_selected_aa_features_50_with_taxa.csv')

In [None]:
"""
combine_with_selected_features(
    input_csv='std_non-meiosis_spo11_combined_aa_features_with_taxa.csv',
    selected_features_csv='spo11_mrmr_selected_aa_50.csv',
    output_csv='spo11_mrmr_selected_aa_features_50_with_taxa1.csv'
)
"""

"\ncombine_with_selected_features(\n    input_csv='std_non-meiosis_spo11_combined_aa_features_with_taxa.csv',\n    selected_features_csv='spo11_mrmr_selected_aa_50.csv',\n    output_csv='spo11_mrmr_selected_aa_features_50_with_taxa1.csv'\n)\n"

In [None]:
# files.download('spo11_mrmr_selected_aa_features_50_with_taxa1.csv')

In [None]:
# uploaded = files.upload()  # upload spo11_mrmr_selected_aa_100_with_entropy.csv

In [None]:
# uploaded = files.upload()  # upload spo11_mrmr_selected_aa_100.csv

In [None]:
"""
combine_with_selected_features(
    input_csv='std_non-meiosis_spo11_combined_aa_features_with_taxa.csv',
    selected_features_csv='spo11_mrmr_selected_aa_100_with_entropy.csv',
    output_csv='spo11_mrmr_selected_aa_features_100_with_taxa.csv'
)
"""

"\ncombine_with_selected_features(\n    input_csv='std_non-meiosis_spo11_combined_aa_features_with_taxa.csv',\n    selected_features_csv='spo11_mrmr_selected_aa_100_with_entropy.csv',\n    output_csv='spo11_mrmr_selected_aa_features_100_with_taxa.csv'\n)\n"

In [None]:
# files.download('spo11_mrmr_selected_aa_features_100_with_taxa.csv')

In [None]:
"""
combine_with_selected_features(
    input_csv='std_non-meiosis_spo11_combined_aa_features_with_taxa.csv',
    selected_features_csv='spo11_mrmr_selected_aa_100.csv',
    output_csv='spo11_mrmr_selected_aa_features_100_with_taxa1.csv'
)
"""

"\ncombine_with_selected_features(\n    input_csv='std_non-meiosis_spo11_combined_aa_features_with_taxa.csv',\n    selected_features_csv='spo11_mrmr_selected_aa_100.csv',\n    output_csv='spo11_mrmr_selected_aa_features_100_with_taxa1.csv'\n)\n"

In [None]:
# files.download('spo11_mrmr_selected_aa_features_100_with_taxa1.csv')