# In [None]:
import csv
import os
import re
import requests
from Bio import SeqIO
from io import StringIO

# Path of the CSV that main() reads; each data row is expected to hold a clone
# name, a binding target and a cell containing one or more structure URLs.
INPUT_CSV = "files.csv"  
# Path of the CSV that main() writes with the per-clone chain counts and sequences.
OUTPUT_CSV = "output.csv" 
# Directory in which main() stores every downloaded FASTA as "<pdb_id>.fasta".
FASTA_FOLDER = "fastas"   

def get_all_urls(cell_value):
    """
    Extract every URL from a cell that may hold several of them.

    The cell is split on semicolons, commas and whitespace (in any
    combination). Fragments that do not start with 'http' are discarded and
    trailing slashes are removed from the URLs that are kept.

    Args:
        cell_value: Raw text of the cell; may be empty or None.

    Returns:
        A list of cleaned URL strings in their original order. The list is
        empty when the cell contains no URL.
    """
    if not cell_value:
        return []

    # Commas are separators too, as the contract above promises. Without
    # them, "url1,url2" would come back as one bogus URL.
    fragments = re.split(r"[;,\s]+", cell_value.strip())

    # The separators are consumed by the split, so the only trailing
    # punctuation that can remain on a fragment is a slash.
    trimmed = (fragment.rstrip("/") for fragment in fragments)

    # Basic validity check: keep only fragments that look like a URL.
    return [url for url in trimmed if url.startswith("http")]

def extract_pdb_id_from_url(url):
    """
    Extract the PDB ID from an RCSB structure URL.

    The ID is taken to be the last path segment of the URL. Surrounding
    whitespace, a query string, a fragment and trailing slashes are ignored,
    so the function does not depend on the caller having cleaned the URL.

    Example: 'https://www.rcsb.org/structure/7DJZ'      -> '7DJZ'
             'https://www.rcsb.org/structure/7DJZ/?a=1' -> '7DJZ'

    Args:
        url: The structure URL.

    Returns:
        The last path segment of the URL, with its case preserved.
    """
    # Drop the fragment first, then the query string, so neither ends up
    # glued to the ID.
    path = url.strip().split("#", 1)[0].split("?", 1)[0]
    # Without removing trailing slashes the last segment would be empty.
    return path.rstrip("/").rsplit("/", 1)[-1]

def download_fasta(pdb_id, timeout=30):
    """
    Download the FASTA for a given PDB ID from RCSB.

    Args:
        pdb_id: Identifier appended to the RCSB FASTA endpoint.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run forever.

    Returns:
        The FASTA text, or None if the request fails (network error,
        timeout, or a non-200 HTTP status). Failures are reported on stdout.
    """
    url = f"https://www.rcsb.org/fasta/entry/{pdb_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection problems and timeouts are "an error" too: honour the
        # None contract instead of aborting the caller's loop.
        print(f"Error retrieving {pdb_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error retrieving {pdb_id}: HTTP {response.status_code}")
        return None
    return response.text

def parse_fasta_for_chains(fasta_text):
    """
    Sort the records of a FASTA document into heavy and light chains.

    A record is classified by substring search in its lower-cased
    description: 'heavy' or 'h chain' marks a heavy chain; otherwise 'light'
    or 'l chain' marks a light chain. Records matching neither are ignored.

    Returns a tuple (heavy_chains, light_chains) of sequence-string lists.
    """
    heavy_markers = ("heavy", "h chain")
    light_markers = ("light", "l chain")

    heavy_chains, light_chains = [], []

    for entry in SeqIO.parse(StringIO(fasta_text), "fasta"):
        description = entry.description.lower()
        sequence = str(entry.seq)

        # Heavy markers take precedence over light ones.
        if any(marker in description for marker in heavy_markers):
            heavy_chains.append(sequence)
            continue
        if any(marker in description for marker in light_markers):
            light_chains.append(sequence)
        # Anything else is not obviously heavy or light and is dropped.

    return heavy_chains, light_chains

def _collect_chain_sequences(urls):
    """
    Download, save and parse the FASTA behind each structure URL.

    Every successfully downloaded FASTA is written to
    FASTA_FOLDER/<pdb_id>.fasta. URLs whose download fails are skipped.

    Returns a tuple (heavy_sequences, light_sequences) accumulated over all
    URLs, in URL order.
    """
    heavy_sequences = []
    light_sequences = []

    for url in urls:
        pdb_id = extract_pdb_id_from_url(url)
        fasta_text = download_fasta(pdb_id)
        if not fasta_text:
            # Nothing usable for this URL; move on to the next one.
            continue

        # Keep a copy of the raw FASTA on disk.
        fasta_filename = os.path.join(FASTA_FOLDER, f"{pdb_id}.fasta")
        with open(fasta_filename, "w", encoding="utf-8") as f_out:
            f_out.write(fasta_text)

        heavy_list, light_list = parse_fasta_for_chains(fasta_text)
        heavy_sequences.extend(heavy_list)
        light_sequences.extend(light_list)

    return heavy_sequences, light_sequences


def main():
    """
    Build OUTPUT_CSV from INPUT_CSV.

    For every data row of INPUT_CSV (the first row is treated as a header
    and skipped) the structure URLs in the third column are resolved to
    FASTA files, which are saved under FASTA_FOLDER and scanned for heavy and
    light chains. One output row per input row is written with the chain
    counts and the ';'-joined sequences. Rows with fewer than three columns
    are ignored.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(FASTA_FOLDER, exist_ok=True)

    output_rows = []

    with open(INPUT_CSV, "r", newline="", encoding="utf-8") as infile:
        reader = csv.reader(infile)

        # Skip the header row (remove this line if the CSV has no header).
        next(reader, None)

        for row in reader:
            # Expected layout:
            #   row[0] = CloneName
            #   row[1] = BindsTo
            #   row[2] = cell containing one or more URLs
            if len(row) < 3:
                continue

            clone_name = row[0].strip()
            binds_to = row[1].strip()
            cell_with_urls = row[2].strip()

            heavy_sequences, light_sequences = _collect_chain_sequences(
                get_all_urls(cell_with_urls)
            )

            output_rows.append({
                "CloneName": clone_name,
                "BindsTo": binds_to,
                "AllURLs": cell_with_urls,  # Keep the original text of all URLs
                "HeavyChainCount": len(heavy_sequences),
                "LightChainCount": len(light_sequences),
                "HeavyChainSequences": ";".join(heavy_sequences),
                "LightChainSequences": ";".join(light_sequences),
            })

    # Write results to a new CSV
    fieldnames = [
        "CloneName",
        "BindsTo",
        "AllURLs",
        "HeavyChainCount",
        "LightChainCount",
        "HeavyChainSequences",
        "LightChainSequences",
    ]

    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(output_rows)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()