# 1. Extracting information and creating a data frame of proteins

In [7]:
import os
import pandas as pd
import re

def parse_fasta(fasta_file):
    """Parse a FASTA file into a DataFrame with one row per record.

    Each record starts at a line beginning with ``>``; the following lines,
    up to the next header, are concatenated into that record's sequence.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A ``pandas.DataFrame`` with three columns:

        * ``Header``   - the header line without the leading ``>``.
        * ``Sequence`` - the record's sequence lines joined together; an
          empty string when the header has no sequence lines.
        * ``Organism`` - the text inside the first ``[...]`` of the header,
          or ``"Unknown"`` when the header has no bracketed part.

    Raises:
        OSError: If the file cannot be opened.
    """
    headers = []
    sequences = []
    species_names = []
    # Sequence lines of the record currently being read. ``None`` means no
    # header has been seen yet; an empty list means "record open, no residues
    # so far". Keeping these two states apart is what guarantees exactly one
    # sequence per header, even for records with an empty sequence.
    chunks = None

    with open(fasta_file, 'r') as file:
        for line in file:
            line = line.strip()  # Drop the newline and surrounding whitespace
            if line.startswith(">"):
                # Close the previous record (if any) before starting a new one
                if chunks is not None:
                    sequences.append("".join(chunks))
                chunks = []

                # Store the header without the '>' symbol
                headers.append(line[1:])
                # The organism is the first bracketed part of the header
                match = re.search(r"\[(.*?)\]", line)
                species_names.append(match.group(1) if match else "Unknown")
            elif chunks is not None:
                chunks.append(line)
            # Lines before the first header belong to no record and are
            # skipped; storing them would shift every sequence by one row.

        # Close the last record
        if chunks is not None:
            sequences.append("".join(chunks))

    return pd.DataFrame({
        'Header': headers,
        'Sequence': sequences,
        'Organism': species_names,
    })

def parse_multiple_fastas(directory):
    """Parse every ``.faa`` file in a directory into one combined DataFrame.

    Args:
        directory: Path of the directory to scan. Only regular files whose
            name ends in ``.faa`` and that sit directly in this directory
            are read; subdirectories are not searched.

    Returns:
        A ``pandas.DataFrame`` holding the rows returned by ``parse_fasta``
        for each file, concatenated with a fresh 0..n-1 index. Files are
        processed in sorted name order. When the directory contains no
        ``.faa`` files, an empty DataFrame with the columns ``Header``,
        ``Sequence`` and ``Organism`` is returned.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    # os.listdir returns names in arbitrary order; sorting keeps the row
    # order of the result reproducible between runs and machines.
    fasta_files = sorted(
        name for name in os.listdir(directory)
        if name.endswith('.faa')
        and os.path.isfile(os.path.join(directory, name))
    )

    all_dfs = [
        parse_fasta(os.path.join(directory, name)) for name in fasta_files
    ]

    # pd.concat raises ValueError on an empty list, so return an empty frame
    # with the same columns parse_fasta produces.
    if not all_dfs:
        return pd.DataFrame(columns=['Header', 'Sequence', 'Organism'])

    return pd.concat(all_dfs, ignore_index=True)

# Usage example: parse every .faa file found in `directory` into one DataFrame.
directory = "protein-fasta-files/"
combined_df = parse_multiple_fastas(directory)

# Write the combined table to CSV without the index column, then evaluate
# head() so the first rows are shown as the cell's result.
combined_df.to_csv("combined_fasta_data.csv", index=False)
combined_df.head()

Unnamed: 0,Header,Sequence,Organism
0,NP_001018029.1 Coq21p [Saccharomyces cerevisia...,MRNELYQLWCVASAARGVAKSSFVRANSAMCEYVRTSNVLSRWTRD...,Saccharomyces cerevisiae S288C
1,NP_001018030.1 L-serine/L-threonine ammonia-ly...,MSIVYNKTPLLRQFFPGKASAQFFLKYECLQPSGSFKSRGIGNLIM...,Saccharomyces cerevisiae S288C
2,NP_001018031.2 Adf1p [Saccharomyces cerevisiae...,MGKCSMKKKGVGKNVGVGKKVQKKRSISTAERKRTKLQVEKLNKSS...,Saccharomyces cerevisiae S288C
3,NP_001018032.1 uncharacterized protein YCR095W...,MTVLIKLGLRILHVYKGFFRKVILKYFFFSSEHTKVNKKSSMHAFL...,Saccharomyces cerevisiae S288C
4,NP_001018033.3 uncharacterized protein YGR161W...,MSGYFNHLSSNAHFANIQADQGFIGDATGTSSDHGSSGMVDFALQL...,Saccharomyces cerevisiae S288C
