In [1]:
# Print the multi-line ASCII-art startup banner.
# The literal is a normal (non-raw) string: each '\n' is a line break and each
# '\\' is emitted as a single backslash.
# NOTE(review): sequences such as '\ ', '\_' and '\/' are not recognised escape
# sequences; Python keeps the backslash as-is, but newer interpreters emit a
# warning for them.
print('   ___ _  _     _                         _       \n  / _ \ || |   /_\  ___ ___  __ _ ___ ___(_)_ __  \n / /_\/ || |_ //_\\/ __/ __|/ _` / __/ __| | _ \ \n/ /_\\|__   _/  _  \__ \__ \ (_| \__ \__ \ | | | |\n\____/   |_| \_/ \_/___/___/\___|___/___/_|_| |_|\n\n')

   ___ _  _     _                         _       
  / _ \ || |   /_\  ___ ___  __ _ ___ ___(_)_ __  
 / /_\/ || |_ //_\/ __/ __|/ _` / __/ __| | _ \ 
/ /_\|__   _/  _  \__ \__ \ (_| \__ \__ \ | | | |
\____/   |_| \_/ \_/___/___/\___|___/___/_|_| |_|




In [2]:
import pandas as pd
import os
import glob
from Bio import SeqIO
import subprocess
# Input paths
fastaFile = 'Genomes/DOUBLETESTER.fasta'  # single-file input; not used by the batch functions in this notebook
Mito = 'Mitochondria_NC_012920_1.fasta'  # not used by the batch functions in this notebook
GENOMES = 'FINAL/'  # root directory handed to assassinate(), searched recursively for FASTA files

# G4Hunter numeric parameters: runG4() passes these as the -w and -s arguments
WINDOWVAL = 25
THRESHOLDVAL = 1.2


# Path of the G4Hunter script that runG4() launches with python3
G4HUNTER = 'G4HunterEDITED.py'


# Keys of the per-file summary dict built by overallStats() and completed by
# assassinate(); they become the column names of the saved CSV
WINDOW = 'Window'
THRESHOLD = 'Threshold'
NPQS = 'Number of PQS'
BP = 'Base Pairs'
NGC = 'Number of GCs'
PGC = 'Percentage of GCs'
FRQ = 'Frequency of PQS'

# Column names of the per-hit DataFrame built by process_txt_file() from a
# G4Hunter result .txt file (START also identifies the header line to skip)
SEQ_ID = 'Sequence_ID'
START = 'Start'
END = 'End'
SEQUENCE = 'Sequence'
LENGTH = 'Length'
SCORE = 'Score'

In [3]:
# Functions
def process_txt_file(file_path):
    """Parse a G4Hunter result text file into a table of hits.

    The file is read line by line:

    * a line starting with '>' sets the current sequence identifier
      (the text after the '>');
    * blank lines and lines starting with the ``START`` column name
      (the column-header line) are skipped;
    * a whitespace-split line with 5 fields is read as
      start, end, sequence, length, score;
    * a line with 6 fields is read the same way, and its extra trailing
      integer is added to the running PQS total;
    * lines with any other number of fields are ignored.

    Parameters
    ----------
    file_path : str
        Path of the result text file to read.

    Returns
    -------
    tuple
        ``(df, PQS)`` where ``df`` is a DataFrame with the columns
        SEQ_ID, START, END, SEQUENCE, LENGTH, SCORE (present even when the
        file contains no hits) and ``PQS`` is the sum of the sixth-field
        integers.

    Raises
    ------
    ValueError
        If a numeric field of an accepted line cannot be converted.
    """
    columns = [SEQ_ID, START, END, SEQUENCE, LENGTH, SCORE]
    data = []
    current_sequence = None
    PQS = 0

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue

            # A '>' line opens a new sequence section
            if line.startswith('>'):
                current_sequence = line[1:]
                continue

            # Skip the column-header line repeated in each section
            if line.startswith(START):
                continue

            parts = line.split()
            if len(parts) not in (5, 6):
                continue  # not a hit row

            start, end, sequence, length, score = parts[:5]
            data.append({
                SEQ_ID: current_sequence,
                START: int(start),
                END: int(end),
                SEQUENCE: sequence,
                LENGTH: int(length),
                SCORE: float(score)
            })

            # Only 6-field rows carry the count that feeds the PQS total
            if len(parts) == 6:
                PQS += int(parts[5])

    # Passing the columns explicitly keeps the schema stable for files with no hits
    df = pd.DataFrame(data, columns=columns)
    return df, PQS

def runG4(inputFile):
    """Run the G4Hunter script on one input file and return what it printed.

    The script named by ``G4HUNTER`` is launched with ``python3`` using
    ``-i inputFile -o Results -w WINDOWVAL -s THRESHOLDVAL``. Its stdout is
    captured and returned with surrounding whitespace stripped; the caller
    in this notebook uses that text as the path of the generated result file.

    Parameters
    ----------
    inputFile : str
        Path of the file to analyse.

    Returns
    -------
    str
        The stripped stdout of the script.

    Raises
    ------
    RuntimeError
        If the script exits with a non-zero status, or exits successfully
        but prints nothing (so there is no result path to hand back).
    """
    print('Running G4Hunter:', inputFile)
    command = ['python3', G4HUNTER, '-i', inputFile, '-o', 'Results',
               '-w', str(WINDOWVAL), '-s', str(THRESHOLDVAL)]
    result = subprocess.run(command, capture_output=True, text=True)

    # Fail here, with the script's own error text, rather than letting an
    # empty "path" surface later as an unrelated file-open error.
    if result.returncode != 0:
        raise RuntimeError(
            f"G4Hunter failed for {inputFile} "
            f"(exit code {result.returncode}): {result.stderr.strip()}"
        )

    generated_path = result.stdout.strip()
    if not generated_path:
        raise RuntimeError(
            f"G4Hunter produced no output path for {inputFile}: {result.stderr.strip()}"
        )
    return generated_path


def overallStats(filePath):
    """Tally total length and G/C count over every record of a FASTA file.

    Each record's sequence is upper-cased before counting, so lower-case
    bases are included. The PQS count, GC value and PQS frequency entries
    are returned as zero placeholders; the window and threshold entries
    carry the current ``WINDOWVAL`` / ``THRESHOLDVAL`` settings.

    Parameters
    ----------
    filePath : str
        Path of the FASTA file to scan.

    Returns
    -------
    dict
        Summary keyed by NPQS, BP, NGC, PGC, FRQ, WINDOW, THRESHOLD
        (in that order).
    """
    total_bases = 0
    total_gc = 0

    for rec in SeqIO.parse(filePath, "fasta"):
        upper_seq = rec.seq.upper()
        total_bases += len(upper_seq)
        total_gc += upper_seq.count("G") + upper_seq.count("C")

    return {
        NPQS: 0,
        BP: total_bases,
        NGC: total_gc,
        PGC: 0.0,
        FRQ: 0.0,
        WINDOW: WINDOWVAL,
        THRESHOLD: THRESHOLDVAL,
    }

def findFastaFiles(directory):
    """Collect FASTA-like files under ``directory``, including subdirectories.

    Files are matched by extension (.fasta, .fa, .fna, .ffn, .faa, .frn).
    Results are grouped by extension in that order; within each group the
    order is whatever ``glob`` yields.

    Parameters
    ----------
    directory : str
        Root directory to search.

    Returns
    -------
    list of str
        Paths of all matching files (empty if none are found).
    """
    suffix_patterns = ('*.fasta', '*.fa', '*.fna', '*.ffn', '*.faa', '*.frn')

    # '**' with recursive=True also matches files directly inside `directory`
    return [
        match
        for pattern in suffix_patterns
        for match in glob.glob(os.path.join(directory, '**', pattern), recursive=True)
    ]

def main(filePath):
    """Run G4Hunter on one file, then compute its sequence statistics.

    Returns a tuple ``(runG4 result, overallStats result)`` for ``filePath``.
    """
    g4_output = runG4(filePath)
    sequence_stats = overallStats(filePath)
    return g4_output, sequence_stats




# Main Function

def assassinate(filePath):
    """Process every FASTA file under ``filePath`` and save one summary CSV each.

    For each file returned by ``findFastaFiles``:

    1. ``main`` runs G4Hunter and computes the base/GC totals;
    2. the PQS total is read from the generated result file;
    3. the GC and PQS ratios (relative to total bases) are filled in;
    4. the summary is written as a one-row CSV next to the generated result
       file, using the same base name with a ``.csv`` extension.

    Parameters
    ----------
    filePath : str
        Root directory to search for FASTA files.

    Returns
    -------
    None
    """
    for file in findFastaFiles(filePath):
        print('Processing:', file)
        # Named `stats` so the local does not shadow the overallStats function
        generatedFilepath, stats = main(file)
        stats[NPQS] = process_txt_file(generatedFilepath)[1]

        # A FASTA with no bases must not abort the whole batch with a
        # ZeroDivisionError; report zero ratios for it instead.
        total_bp = stats[BP]
        if total_bp:
            # NOTE(review): the PGC label says "Percentage" but the stored
            # value is the fraction NGC/BP (0-1), not multiplied by 100.
            stats[PGC] = stats[NGC] / total_bp
            stats[FRQ] = stats[NPQS] / total_bp
        else:
            stats[PGC] = 0.0
            stats[FRQ] = 0.0

        # Save output data beside the generated result file
        directory = os.path.dirname(generatedFilepath)
        base_name = os.path.splitext(os.path.basename(generatedFilepath))[0]
        csv_file_path = os.path.join(directory, f"{base_name}.csv")
        pd.DataFrame([stats]).to_csv(csv_file_path, index=False)
        print('Saved to', csv_file_path)

In [4]:
# Run the full pipeline over every FASTA file found under GENOMES.
# NOTE: each file launches a G4Hunter subprocess, so this cell can take a long time.
assassinate(GENOMES)

Processing: FINAL/Leishmania/GCA_963920605.1/GCA_963920605.1_Parent1_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_963920605.1/GCA_963920605.1_Parent1_genomic.fna
Saved to Results/Results_GCA_963920605/GCA_963920605-Merged.csv
Processing: FINAL/Leishmania/GCF_000002725.2/GCF_000002725.2_ASM272v2_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCF_000002725.2/GCF_000002725.2_ASM272v2_genomic.fna
Saved to Results/Results_GCF_000002725/GCF_000002725-Merged.csv
Processing: FINAL/Leishmania/GCA_037178345.1/GCA_037178345.1_ASM3717834v1_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_037178345.1/GCA_037178345.1_ASM3717834v1_genomic.fna
Saved to Results/Results_GCA_037178345/GCA_037178345-Merged.csv
Processing: FINAL/Leishmania/GCA_037177955.1/GCA_037177955.1_ASM3717795v1_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_037177955.1/GCA_037177955.1_ASM3717795v1_genomic.fna
Saved to Results/Results_GCA_037177955/GCA_037177955-Merged.csv
Processing: FINAL/Leishmania/GCA_037178005.1/GCA_0

Saved to Results/Results_GCA_000409445/GCA_000409445-Merged.csv
Processing: FINAL/Leishmania/GCA_040363675.1/GCA_040363675.1_ASM4036367v1_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_040363675.1/GCA_040363675.1_ASM4036367v1_genomic.fna
Saved to Results/Results_GCA_040363675/GCA_040363675-Merged.csv
Processing: FINAL/Leishmania/GCA_037177945.1/GCA_037177945.1_ASM3717794v1_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_037177945.1/GCA_037177945.1_ASM3717794v1_genomic.fna
Saved to Results/Results_GCA_037177945/GCA_037177945-Merged.csv
Processing: FINAL/Leishmania/GCA_037178015.1/GCA_037178015.1_ASM3717801v1_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_037178015.1/GCA_037178015.1_ASM3717801v1_genomic.fna
Saved to Results/Results_GCA_037178015/GCA_037178015-Merged.csv
Processing: FINAL/Leishmania/GCA_037178505.1/GCA_037178505.1_ASM3717850v1_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_037178505.1/GCA_037178505.1_ASM3717850v1_genomic.fna
Saved to Results/Results_GCA_037

Saved to Results/Results_GCA_000331345/GCA_000331345-Merged.csv
Processing: FINAL/Leishmania/GCA_900537975.2/GCA_900537975.2_LBRM_annotationDEFINITIVO_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_900537975.2/GCA_900537975.2_LBRM_annotationDEFINITIVO_genomic.fna
Saved to Results/Results_GCA_900537975/GCA_900537975-Merged.csv
Processing: FINAL/Leishmania/GCA_003992445.1/GCA_003992445.1_ASM399244v1_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_003992445.1/GCA_003992445.1_ASM399244v1_genomic.fna
Saved to Results/Results_GCA_003992445/GCA_003992445-Merged.csv
Processing: FINAL/Leishmania/GCA_916722125.1/GCA_916722125.1_LMJFC_annotationDEFINITIVO_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_916722125.1/GCA_916722125.1_LMJFC_annotationDEFINITIVO_genomic.fna
Saved to Results/Results_GCA_916722125/GCA_916722125-Merged.csv
Processing: FINAL/Leishmania/GCA_037178585.1/GCA_037178585.1_ASM3717858v1_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_037178585.1/GCA_037178585.1_ASM37

Saved to Results/Results_GCA_040363505/GCA_040363505-Merged.csv
Processing: FINAL/Leishmania/GCA_024505685.1/GCA_024505685.1_BA788_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_024505685.1/GCA_024505685.1_BA788_genomic.fna
Saved to Results/Results_GCA_024505685/GCA_024505685-Merged.csv
Processing: FINAL/Leishmania/GCA_000002845.2/GCA_000002845.2_ASM284v2_genomic.fna
Running G4Hunter: FINAL/Leishmania/GCA_000002845.2/GCA_000002845.2_ASM284v2_genomic.fna


KeyboardInterrupt: 