In [29]:
from google.colab import files
import pandas as pd
import re

# Upload and load file (currently disabled: the code below expects `df` to be defined already)
# uploaded = files.upload()
# filename = next(iter(uploaded))
# df = pd.read_csv(filename)

# Parse FASTA-style header
def parse_fasta_header(header):
    """
    Parses a FASTA-style header string to extract:
    - accession: the first whitespace-delimited token (leading '>' removed)
    - gene: the second token, unless that token is a bracketed tag such as
      [organism=...] (in which case there is no gene name and gene is None)
    - organism: value inside [organism=...]
    - gene_id: integer value inside [GeneID=...]

    Parameters:
    header: the raw header value. Non-string values are converted with str().
        Missing values (None, NaN, pd.NA) produce a Series of four missing
        entries instead of being stringified into 'None' / 'nan'.

    Returns:
    A pandas Series with [accession, gene, organism, gene_id]; every field
    that cannot be found is None.
    """
    # A missing cell must not be turned into a fake accession like 'nan'.
    if (
        not isinstance(header, str)
        and pd.api.types.is_scalar(header)
        and pd.isna(header)
    ):
        return pd.Series([None, None, None, None])

    header = str(header).lstrip('>').strip()

    # Split header by whitespace, take the first token as accession
    parts = header.split(maxsplit=1)
    accession = parts[0] if parts else None

    # Extract gene name from the second token if available. A token that
    # opens a [key=value] tag is metadata, not a gene name.
    gene = None
    if len(parts) > 1:
        token = re.search(r'^(\S+)', parts[1])
        if token and not token.group(1).startswith('['):
            gene = token.group(1)

    # Extract organism name enclosed in [organism=...]
    organism = re.search(r'\[organism=([^\]]+)\]', header)

    # Extract NCBI GeneID enclosed in [GeneID=...]
    gene_id = re.search(r'\[GeneID=(\d+)\]', header)

    return pd.Series([
        accession,
        gene,
        organism.group(1).strip() if organism else None,
        int(gene_id.group(1)) if gene_id else None
    ])

# Apply parser to the first column; each parsed field becomes its own column.
header_col = df.columns[0]
parsed_cols = ['accession', 'gene', 'organism', 'gene_id']
df[parsed_cols] = df[header_col].apply(parse_fasta_header)

# Move parsed columns to the front for readability
df = df[parsed_cols + [col for col in df.columns if col not in parsed_cols]]

# Drop redundant or unwanted columns: any pre-existing 'ID' plus the parsed
# 'gene' and 'gene_id'. errors='ignore' skips names that are not present.
df = df.drop(columns=['ID', 'gene', 'gene_id'], errors='ignore')

# Rename 'accession' (now the first column) to 'ID' through the public API.
# Assigning into df.columns.values writes to the Index's backing array
# directly, which bypasses Index immutability and can leave label lookups
# out of sync with the displayed column names.
df = df.rename(columns={'accession': 'ID'})

# Save the cleaned DataFrame to CSV (the download itself is triggered by the
# files.download call in a later cell).
output_file = "parsed_dataset_cleaned.csv"
df.to_csv(output_file, index=False)
# files.download(output_file)

# Preview the resulting DataFrame
print("✅ Saved and downloaded:", output_file)
print("🔍 Sample:")
print(df.head())

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Saved and downloaded: parsed_dataset_cleaned.csv
🔍 Sample:
               ID organism         0         1         2         3         4  \
0  XP_003173707.1     None -0.006135  0.050274  0.013580  0.151005 -0.110469   
1  XP_003868575.1     None -0.166736 -0.026779  0.034778  0.162941 -0.149222   
2  XP_009155451.1     None -0.048188  0.044912 -0.005841  0.136747 -0.109549   
3  XP_012053617.1     None -0.008090 -0.011336  0.013626  0.126519 -0.100549   
4  XP_012053797.1     None -0.019506 -0.037147  0.065685  0.093515 -0.189973   

          5         6         7  ...       632       633       634       635  \
0 -0.026739 -0.060721 -0.081719  ...  0.648811  0.026623  0.204221  0.169833   
1 -0.005653 -0.121203 -0.115645  ...  0.783493 -0.040031  0.269652  0.146528   
2 -0.109485 -0.040130 -0.016120  ...  0.640124  0.040284  0.176221  0.161390   
3 -0.049484 -0.072294 -0.140488  ...  0.810893 -0.043094  0.182306  0.136243   
4  0.005849 -0.127986 -0.047920  ...  0.510138  0.022441  

In [None]:
# Download the CSV written by the previous cell; output_file holds its path.
files.download(output_file)