<a href="https://colab.research.google.com/github/eoinleen/Protein-design-random/blob/main/fatsa-list-to-csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# ===============================================
# FASTA SEQUENCE TO CSV CONVERTER
# ===============================================
#
# This tool converts biological sequence files from FASTA format to CSV
# for easier analysis and manipulation.
#
# FEATURES:
# - Reads FASTA/text files directly from Google Drive
# - Extracts sequence names and sequence data
# - Converts to a clean two-column CSV format
# - Saves output to the same Google Drive directory
# - Provides troubleshooting for common Drive issues
# - Displays preview of the conversion results
#
# INSTRUCTIONS:
# 1. Run this cell
# 2. Enter your Google Drive directory path when prompted
# 3. Enter the name of your sequence file (must end with .txt)
# 4. Review the conversion output and preview
#
# ===============================================



import pandas as pd
import re
import os
import traceback
from google.colab import drive
from IPython.display import display, HTML

# Function to parse FASTA-like format to CSV
def parse_sequences_to_csv(input_file, output_file):
    """
    Parse a FASTA-style text file and write it out as a two-column CSV

    A record starts with a header line whose first non-blank character is
    '>'; the rest of that line is the sequence name. Every following line up
    to the next header is concatenated, with all whitespace removed, to form
    the sequence. Any text that precedes the first header is ignored.

    Parameters:
    input_file (str): Path of the FASTA-style text file to read
    output_file (str): Path of the CSV file to write

    Returns:
    pandas.DataFrame: One row per record, with the columns 'Sequence_Name'
    and 'Sequence' (both columns are present even when no record is found)

    Raises:
    Exception: Any error raised while reading, parsing or writing is printed
    together with its traceback and then re-raised unchanged
    """
    try:
        # Read the input file
        print(f"Reading input file: {input_file}")
        with open(input_file, 'r') as f:
            input_text = f.read()

        print(f"Input file length: {len(input_text)} characters")

        # Split only where '>' opens a line. Splitting on every '>' would cut
        # a header such as ">design A>B" in two and corrupt the record.
        chunks = re.split(r'(?m)^(?=[ \t]*>)', input_text.strip())

        # Keep real records only: this drops the empty leading chunk produced
        # by the split and any preamble before the first header, so the count
        # reported below matches the number of rows written.
        entries = [chunk.strip() for chunk in chunks]
        entries = [entry for entry in entries if entry.startswith('>')]
        print(f"Found {len(entries)} sequence entries")

        # Prepare data for DataFrame
        data = []

        for i, entry in enumerate(entries):
            if i < 5:  # Only print the first few entries for debugging
                print(f"Entry {i}: {entry[:50]}..." if len(entry) > 50 else f"Entry {i}: {entry}")

            # First line is the header, the remaining lines hold the residues
            header, *body = entry.split('\n')
            name = header[1:].strip()  # Remove the '>' character

            # str.split() with no argument discards every kind of whitespace
            # (spaces, tabs, stray carriage returns), not just spaces
            sequence = ''.join(''.join(body).split())

            data.append({'Sequence_Name': name, 'Sequence': sequence})

        # Name the columns explicitly so an input without records still
        # yields a CSV with a header row instead of an empty file
        df = pd.DataFrame(data, columns=['Sequence_Name', 'Sequence'])
        print(f"Created DataFrame with {len(df)} rows and {len(df.columns)} columns")

        # Write to CSV
        print(f"Writing CSV to: {output_file}")
        df.to_csv(output_file, index=False)

        # Verify file was created
        if os.path.exists(output_file):
            print(f"CSV file successfully created at: {output_file}")
            print(f"File size: {os.path.getsize(output_file)} bytes")
        else:
            print(f"ERROR: CSV file was not created at: {output_file}")

        return df
    except Exception as e:
        print(f"ERROR in parse_sequences_to_csv: {str(e)}")
        print(traceback.format_exc())
        raise

# Mount Google Drive with error handling
def mount_drive():
    """
    Mount Google Drive at /content/drive, forcing a remount

    Returns:
    bool: True when the mount call completed, False when it raised (the
    error and its traceback are printed instead of propagated)
    """
    mounted = False
    try:
        drive.mount('/content/drive', force_remount=True)
        print("Google Drive mounted successfully")
        mounted = True
    except Exception as mount_error:
        print(f"ERROR mounting Google Drive: {str(mount_error)}")
        print(traceback.format_exc())
    return mounted

# Convert a .txt sequence file in a Google Drive directory to CSV
def convert_txt_to_csv(drive_dir, input_filename):
    """
    Convert a sequence txt file from Google Drive to CSV format

    The CSV is written next to the input file, using the same name with the
    trailing '.txt' swapped for '.csv'. A second copy is written to
    /content/test_output.csv as a check that CSV writing works at all.

    Parameters:
    drive_dir (str): The directory path in Google Drive
    input_filename (str): Name of the input .txt file (must end with .txt)

    Returns:
    pandas.DataFrame or None: The parsed sequences, or None when anything
    fails (mount failure, bad file name, missing path, parse or write
    error). Failures are printed with a traceback and never raised.
    """
    try:
        # Mount drive if needed
        if not os.path.exists('/content/drive'):
            if not mount_drive():
                return None

        if not input_filename.endswith('.txt'):
            raise ValueError("Input filename must end with .txt")

        # Construct full paths
        input_path = os.path.join(drive_dir, input_filename)
        # Swap the trailing extension only. str.replace would rewrite every
        # '.txt' in the name (e.g. 'run.txt.v2.txt', or a directory component
        # when a full path is supplied) and point the output somewhere else.
        output_filename = input_filename[:-len('.txt')] + '.csv'
        output_path = os.path.join(drive_dir, output_filename)

        # Check if directories exist
        print(f"Checking directory: {drive_dir}")
        if not os.path.exists(drive_dir):
            raise FileNotFoundError(f"Directory not found: {drive_dir}")

        # Check if input file exists
        print(f"Checking input file: {input_path}")
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")

        # Print permissions (last three octal digits of the mode)
        print(f"Directory permissions: {oct(os.stat(drive_dir).st_mode)[-3:]}")
        print(f"File permissions: {oct(os.stat(input_path).st_mode)[-3:]}")

        # Convert the file
        print(f"Converting {input_path} to {output_path}...")
        df = parse_sequences_to_csv(input_path, output_path)

        # Try writing to a different location as a test; this helps tell a
        # Drive-specific write problem apart from a general one
        test_output = '/content/test_output.csv'
        print(f"Testing CSV writing capability to {test_output}...")
        df.to_csv(test_output, index=False)
        if os.path.exists(test_output):
            print(f"Test CSV file created successfully at {test_output}")
        else:
            print(f"Failed to create test CSV file at {test_output}")

        # Display preview
        print("\nPreview of the CSV data:")
        print(df.head())

        # Double check file creation
        if os.path.exists(output_path):
            print(f"\nConversion complete! CSV file saved to: {output_path}")
        else:
            print(f"\nWARNING: The output file does not exist at: {output_path}")
            print("This could be due to permission issues or path problems.")
            print(f"Try using an absolute path or check write permissions in {drive_dir}")

        return df
    except Exception as e:
        print(f"ERROR in convert_txt_to_csv: {str(e)}")
        print(traceback.format_exc())
        return None

# Additional troubleshooting function
def troubleshoot_drive_permissions(drive_dir):
    """
    Print a diagnostic report about the Drive mount and a target directory

    The report covers three things: whether /content/drive exists (a mount
    is attempted when it does not), whether drive_dir exists (its contents
    are listed, or its parent is inspected when drive_dir is missing), and
    whether a small probe file can be written to and removed from drive_dir.

    Parameters:
    drive_dir (str): Directory path to inspect

    Nothing is returned; every finding, including unexpected errors, is
    printed.
    """
    rule = "================================"
    try:
        print("\nTROUBLESHOOTING DRIVE PERMISSIONS")
        print(rule)

        # Mount status
        if os.path.exists('/content/drive'):
            print("Google Drive is mounted")
        else:
            print("Google Drive is not mounted. Attempting to mount...")
            mount_drive()

        # Target directory, falling back to its parent when it is missing
        if os.path.exists(drive_dir):
            print(f"Directory exists: {drive_dir}")
            print(f"Directory permissions: {oct(os.stat(drive_dir).st_mode)[-3:]}")
            print(f"Contents of directory: {os.listdir(drive_dir)}")
        else:
            print(f"Directory does not exist: {drive_dir}")
            parent = os.path.dirname(drive_dir)
            if os.path.exists(parent):
                print(f"Parent directory exists: {parent}")
                print(f"Contents of parent directory: {os.listdir(parent)}")
            else:
                print(f"Parent directory does not exist: {parent}")

        # Write probe: create a throwaway file, then delete it again
        probe_path = os.path.join(drive_dir, 'test_write_permission.txt')
        try:
            with open(probe_path, 'w') as probe:
                probe.write('Test write permission')
            print(f"Successfully wrote test file to {probe_path}")
            os.remove(probe_path)
            print(f"Successfully removed test file from {probe_path}")
        except Exception as probe_error:
            print(f"Failed to write test file: {str(probe_error)}")

        print(rule)
    except Exception as unexpected:
        print(f"Error in troubleshooting: {str(unexpected)}")
        print(traceback.format_exc())

# Main execution function with user input
def main():
    """
    Interactive entry point: prompt for a directory and file, then convert

    Mounts Google Drive when /content/drive is missing, asks for the
    directory and the .txt file name, validates both, runs the conversion
    and displays the first 25 rows of the result twice (as a DataFrame and
    as an HTML table). Returns None in every case; problems are reported
    with printed messages.
    """
    # Mount Google Drive
    if not os.path.exists('/content/drive'):
        print("Mounting Google Drive...")
        if not mount_drive():
            print("Failed to mount Google Drive. Exiting.")
            return

    # Get user input for directory
    print("\nPlease enter the path to your directory in Google Drive.")
    print("Example: /content/drive/MyDrive/Fasta-files/making-a-list")

    # Pasted paths often carry stray leading/trailing whitespace, which
    # would make the existence checks below fail for a valid location
    drive_directory = input("Google Drive directory path: ").strip()

    # Verify directory exists
    if not os.path.exists(drive_directory):
        print(f"Directory not found: {drive_directory}")
        print("Would you like to troubleshoot this directory? (y/n)")
        if input().strip().lower() == 'y':
            troubleshoot_drive_permissions(drive_directory)
        return

    # Get input file name
    print("\nPlease enter the name of your sequence file (must end with .txt):")
    input_file = input("File name: ").strip()

    if not input_file.endswith('.txt'):
        print("Error: File name must end with .txt")
        return

    # Check if file exists
    file_path = os.path.join(drive_directory, input_file)
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    # Run conversion
    print("\nProcessing file...")
    result_df = convert_txt_to_csv(drive_directory, input_file)

    # Display the first 25 lines of the CSV
    if result_df is not None and not result_df.empty:
        preview = result_df.head(25)
        print("\n====== FIRST 25 ROWS OF CSV OUTPUT ======")
        # Show full sequence content for the preview only. The context
        # manager restores the user's previous setting even if rendering
        # fails, instead of resetting the option to the library default.
        with pd.option_context('display.max_colwidth', None):
            display(preview)
            print("========================================")

            # Create HTML output for clearer viewing
            print("\nHere's a more formatted view of the first 25 rows:")
            display(HTML(preview.to_html()))
    else:
        print("No data available to display.")

# Initial instructions
# Introductory text shown each time the cell runs
_INTRO_TEXT = """
# FASTA to CSV Converter for Google Colab

This script will:
1. Read a FASTA-format sequence file (.txt) from your Google Drive
2. Convert it to CSV format with columns for sequence name and sequence
3. Save the CSV in the same Google Drive directory
4. Display the first 25 rows of the output

Just run this cell and follow the prompts.
"""
print(_INTRO_TEXT)

# Start the interactive workflow only when this code is executed directly,
# not when it is imported as a module
if __name__ == "__main__":
    main()


# FASTA to CSV Converter for Google Colab

This script will:
1. Read a FASTA-format sequence file (.txt) from your Google Drive
2. Convert it to CSV format with columns for sequence name and sequence
3. Save the CSV in the same Google Drive directory
4. Display the first 25 rows of the output

Just run this cell and follow the prompts.


Please enter the path to your directory in Google Drive.
Example: /content/drive/MyDrive/Fasta-files/making-a-list
Google Drive directory path: /content/drive/MyDrive/Fasta-files/making-a-list

Please enter the name of your sequence file (must end with .txt):
File name: /content/drive/MyDrive/Fasta-files/making-a-list/all_sequences.txt

Processing file...
Checking directory: /content/drive/MyDrive/Fasta-files/making-a-list
Checking input file: /content/drive/MyDrive/Fasta-files/making-a-list/all_sequences.txt
Directory permissions: 700
File permissions: 600
Converting /content/drive/MyDrive/Fasta-files/making-a-list/all_sequences.txt to /content/drive

Unnamed: 0,Sequence_Name,Sequence
0,296_32,APPMTLEEFRKYVEDVKDFVAELGKLVAAGDPEFASPEEVAENILEVVYAVLVEHRELYLAHPEEVEALLRELIRTILEEFGLPVDDARIEALVARARELVENPE
1,164418_mpnn2_model1,SYEELAEKVINEFKKEIEKFKKKIENSDLPEGIKNFLIEMLDDLEHFMIFFIQEFFKQLKKEGIDPKEDPEKFLEKFEEWAKDVMMVMAWFMEENLTEYIKKNYPDMPEEEIEKIKEELRNFFLNVATPTLINTAREYIKENGY
2,1010_24,SMEALAKAVVDAANAKFDEFKKEIAESDWPEGIKKKIIELLDDFRVFITYYIYEFFETLEEKGIDPAKDPAAFLAEWTDWAESSMAVIAYFAKEQLEKYIAKEYPDMPEEEKQAIFDKIDDFYANVLAPTMVDTAKAYIAANGW
3,311379_mpnn1_model1,SEEEIAEKLFEVMEELAEVVEEFLIDKDSEEVEEEMEEFIEELEEFLELTNFTFLPKSIEELYEDIKSPEELLEKMKEIVKGFKENTELSEKLEKLVEMWEDLLVYLQFLSEEAKDKSPEQIRDMFETLKAVLEYRFNEIMEE
4,451607_mpnn5_model2,GEVRMREFETELTDEEGNKYKVVVYLFEETKVAFINKTSLEKDEEETWVQDVEEVDGKFEVQIGKNTFVVFSPDKVELVEK
5,4106_21,AELEAFLKRFEIAVAFQKEVGDEEALKVSLEVKEEYLKYVEA
6,215645_mpnn10_model2,SLDDLERENDKLTAGLPEEERRRMEQLVTYRMIVEMALTFLIDMFKGHDKKKEEKLEELLKKFKEKFLEGIKDPSKLIEAVEEVKKAIDFVKEELADDPFVKDFFTKILENTYKEMKKIVE
7,993298_mpnn1_model2,SKKKELEDEAAKIYEEMKEWAKKKAKEVAEEIKKNPEKAEELYMKLFDELEEKLKKKFEELASKDPEFAKKFPAAWESSQDAVDFFRHDLLHKLKDIKDPEERAKVAIDKFDTYLNFMTWLIVEKIYE
8,21,KEREEFLRRFDEAKRFNEEIGDEERLAVSKEVLEEYLRYVEA
9,150447_mpnn3_model2,SEEEIMKLRHEAQDFAWETAHRINFLIDQLKHSTDEVPEGTIEKLEEAREKIKKLLKELQDNEELSKEEILELMKVVVEEAEVLVEALEQHKNFPFPEMVESLKKEVEYWKKLVESS



Here's a more formatted view of the first 25 rows:


Unnamed: 0,Sequence_Name,Sequence
0,296_32,APPMTLEEFRKYVEDVKDFVAELGKLVAAGDPEFASPEEVAENILEVVYAVLVEHRELYLAHPEEVEALLRELIRTILEEFGLPVDDARIEALVARARELVENPE
1,164418_mpnn2_model1,SYEELAEKVINEFKKEIEKFKKKIENSDLPEGIKNFLIEMLDDLEHFMIFFIQEFFKQLKKEGIDPKEDPEKFLEKFEEWAKDVMMVMAWFMEENLTEYIKKNYPDMPEEEIEKIKEELRNFFLNVATPTLINTAREYIKENGY
2,1010_24,SMEALAKAVVDAANAKFDEFKKEIAESDWPEGIKKKIIELLDDFRVFITYYIYEFFETLEEKGIDPAKDPAAFLAEWTDWAESSMAVIAYFAKEQLEKYIAKEYPDMPEEEKQAIFDKIDDFYANVLAPTMVDTAKAYIAANGW
3,311379_mpnn1_model1,SEEEIAEKLFEVMEELAEVVEEFLIDKDSEEVEEEMEEFIEELEEFLELTNFTFLPKSIEELYEDIKSPEELLEKMKEIVKGFKENTELSEKLEKLVEMWEDLLVYLQFLSEEAKDKSPEQIRDMFETLKAVLEYRFNEIMEE
4,451607_mpnn5_model2,GEVRMREFETELTDEEGNKYKVVVYLFEETKVAFINKTSLEKDEEETWVQDVEEVDGKFEVQIGKNTFVVFSPDKVELVEK
5,4106_21,AELEAFLKRFEIAVAFQKEVGDEEALKVSLEVKEEYLKYVEA
6,215645_mpnn10_model2,SLDDLERENDKLTAGLPEEERRRMEQLVTYRMIVEMALTFLIDMFKGHDKKKEEKLEELLKKFKEKFLEGIKDPSKLIEAVEEVKKAIDFVKEELADDPFVKDFFTKILENTYKEMKKIVE
7,993298_mpnn1_model2,SKKKELEDEAAKIYEEMKEWAKKKAKEVAEEIKKNPEKAEELYMKLFDELEEKLKKKFEELASKDPEFAKKFPAAWESSQDAVDFFRHDLLHKLKDIKDPEERAKVAIDKFDTYLNFMTWLIVEKIYE
8,21,KEREEFLRRFDEAKRFNEEIGDEERLAVSKEVLEEYLRYVEA
9,150447_mpnn3_model2,SEEEIMKLRHEAQDFAWETAHRINFLIDQLKHSTDEVPEGTIEKLEEAREKIKKLLKELQDNEELSKEEILELMKVVVEEAEVLVEALEQHKNFPFPEMVESLKKEVEYWKKLVESS
