# 1. Extracting information: creating a data frame of protein sequences

In [1]:
import os
import pandas as pd

def parse_fasta(fasta_file):
    """Parse one protein FASTA file into a DataFrame with one row per record.

    Args:
        fasta_file: Path to the FASTA file. The species label is the file's
            base name with its last 12 characters removed (presumably a
            suffix such as ``_protein.faa`` -- confirm against the real
            file names).

    Returns:
        A pandas DataFrame with the columns ``Header`` (the header line
        without the leading ``>``), ``Sequence`` (all sequence lines of the
        record joined together; empty if the record has none) and
        ``Species`` (the same label on every row).
    """
    # Slice the base name rather than the full path so the label does not
    # depend on how long the directory part of the path happens to be.
    species = os.path.basename(str(fasta_file))[:-12]

    headers = []
    sequences = []
    # Sequence lines of the record being read; None until the first header
    # so that any text preceding it is not mistaken for a sequence.
    chunks = None

    with open(fasta_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # blank lines carry no data
            if line.startswith(">"):
                # Close the previous record, even when its sequence is
                # empty, so headers and sequences stay aligned.
                if chunks is not None:
                    sequences.append("".join(chunks))
                headers.append(line[1:])
                chunks = []
            elif chunks is not None:
                chunks.append(line)

    # Close the final record of the file.
    if chunks is not None:
        sequences.append("".join(chunks))

    return pd.DataFrame({
        'Header': headers,
        'Sequence': sequences,
        'Species': [species] * len(headers),
    })

def parse_multiple_fastas(directory):
    """Parse every ``.faa`` file in a directory into one combined DataFrame.

    Args:
        directory: Path of the directory to scan. Only entries whose name
            ends with ``.faa`` are parsed; sub-directories are not searched.

    Returns:
        A pandas DataFrame holding the rows of all parsed files, re-indexed
        from 0. If the directory contains no ``.faa`` files, an empty
        DataFrame with the columns ``Header``, ``Sequence`` and ``Species``
        is returned.
    """
    # os.listdir returns names in arbitrary order; sort them so the row
    # order of the combined table is reproducible between runs and machines.
    fasta_files = sorted(f for f in os.listdir(directory) if f.endswith('.faa'))

    all_dfs = [
        parse_fasta(os.path.join(directory, fasta_file))
        for fasta_file in fasta_files
    ]

    # pd.concat raises ValueError on an empty list, so handle "no files"
    # explicitly and still hand back the expected columns.
    if not all_dfs:
        return pd.DataFrame(columns=['Header', 'Sequence', 'Species'])

    return pd.concat(all_dfs, ignore_index=True)

# Parse every .faa file found in the protein FASTA directory into a single
# DataFrame.
directory = "protein-fasta-files/"
combined_df = parse_multiple_fastas(directory)

# Write the combined table to CSV without the index column, then show the
# first rows (as the last expression of the notebook cell, head() is what
# gets displayed).
combined_df.to_csv("combined_fasta_data.csv", index=False)
combined_df.head()

Unnamed: 0,Header,Sequence,Species
0,XP_005534764.1 L-lactate dehydrogenase [Cyanid...,MDLQHGGAFYSTRIRAAESYEDTAHSAVCIITAGVRQRPGESRLEL...,Cyanidioschyzon
1,XP_005534765.1 similar to cell surface glycopr...,MFRARSLVRGLVVAVCLLLAATFSVSLVAALSPVSTVSWTWIGGET...,Cyanidioschyzon
2,XP_005534766.1 similar to retinoblastoma-bindi...,MNRVLLDRLFPFVRAQVPEQVEFSLSEHGTGRCFAFTPLYGNVLAV...,Cyanidioschyzon
3,XP_005534767.1 cycloartenol synthase [Cyanidio...,MWRLHSDHGRQWWTFSTSDAETNLDDPALAKEAALIEEARRTFHEH...,Cyanidioschyzon
4,XP_005534768.1 hypothetical protein CYME_CMJ01...,MPTESGLQQRWRRGCAGGAGPQWWPPGRKPHGAGGTMPQPPTRDAP...,Cyanidioschyzon
