In [2]:
import os
import pandas as pd

In [11]:
# Root folder scanned by the loading cell: one sub-folder per gene, each holding
# RNAfold ``.out`` files. Set the RNAFOLD_BASE_DIR environment variable to run
# the notebook on another machine; the original location stays the default.
base_dir = os.environ.get("RNAFOLD_BASE_DIR", "/mnt/d/ebola/RNAfold0")

In [12]:
def _parse_structure_line(line, label):
    """Split one structure line into ``(structure, energy)``.

    The line is expected to hold a structure string, whitespace, and then a
    bracketed annotation whose first number is the energy, e.g.
    ``"((..)) (-12.30)"``, ``"((..)) [ -1.50]"`` or ``"((..)) { -1.20 d=0.50}"``.
    Any amount of padding inside the bracket is accepted.

    Raises:
        ValueError: if the line has no annotation or the energy is not a number.
    """
    parts = line.split(None, 1)
    if len(parts) != 2:
        raise ValueError(f"Malformed {label} line: {line!r}")
    structure, annotation = parts

    # Peel off the enclosing bracket pair first; the energy is then simply the
    # first whitespace-separated field, whether or not it was space-padded.
    fields = annotation.strip().strip("()[]{}").split()
    if not fields:
        raise ValueError(f"No energy value found in {label} line: {line!r}")
    return structure, float(fields[0])


def split_sequence(sequence):
    """Parse one record (the text following a ``>``) of an RNAfold output file.

    The record must contain at least six newline-separated lines: header,
    nucleotide sequence, optimal (MFE) structure, thermodynamic-ensemble
    structure, centroid structure, and a summary line of the form
    ``"<text> <frequency>; <text> <diversity>"``.

    Args:
        sequence: Record text with the leading ``>`` already removed.

    Returns:
        A 10-tuple ``(header, sequence, oss, tde, css, oss_mfe, tde_fe,
        css_mfe, frequency, ensemble_diversity)``; the first five items are
        strings and the last five are floats.

    Raises:
        ValueError: if the record is too short or a line cannot be parsed.
    """
    lines = sequence.split("\n")
    if len(lines) < 6:
        raise ValueError(
            f"Expected at least 6 lines in an RNAfold record, got {len(lines)}"
        )

    header = lines[0]
    nucleotides = lines[1]

    oss, oss_mfe = _parse_structure_line(lines[2], "optimal structure")
    tde, tde_fe = _parse_structure_line(lines[3], "thermodynamic ensemble")
    # Only the first number is the energy; the trailing "d=..." field is ignored.
    css, css_mfe = _parse_structure_line(lines[4], "centroid structure")

    # Each half of the summary line ends with the number of interest.
    summary = lines[5].strip().split("; ")
    frequency = float(summary[0].split(" ")[-1])
    ensemble_diversity = float(summary[1].split(" ")[-1])

    return (
        header,
        nucleotides,
        oss,
        tde,
        css,
        oss_mfe,
        tde_fe,
        css_mfe,
        frequency,
        ensemble_diversity,
    )

In [13]:
# One parsed record per ``.out`` file goes into each list. The lists are reset
# here so re-running the cell does not duplicate rows.
cds_data = []
utr3_data = []
utr5_data = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the resulting tables (and the exported CSVs) reproducible across runs.
for gene_folder in sorted(os.listdir(base_dir)):
    gene_folder_path = os.path.join(base_dir, gene_folder)
    if not os.path.isdir(gene_folder_path):
        continue

    for file_name in sorted(os.listdir(gene_folder_path)):
        if not file_name.endswith(".out"):
            continue

        out_file_path = os.path.join(gene_folder_path, file_name)
        with open(out_file_path, "r") as file:
            content = file.read()

        # Element 0 is whatever precedes the first ">" and is not a record.
        sequences = content.split(">")
        if len(sequences) < 4:
            raise ValueError(
                f"{out_file_path}: expected 3 '>' records, "
                f"found {len(sequences) - 1}"
            )

        # Records are taken positionally: 1st -> cds, 2nd -> utr3, 3rd -> utr5.
        # NOTE(review): this assumes every file lists them in that order.
        # All three are parsed before any append so that a parse failure
        # cannot leave the three lists with different lengths.
        seq1 = split_sequence(sequences[1])
        seq2 = split_sequence(sequences[2])
        seq3 = split_sequence(sequences[3])

        cds_data.append(seq1)
        utr3_data.append(seq2)
        utr5_data.append(seq3)

In [14]:
# Column labels, in the same order as the tuple returned by split_sequence().
columns = [
    "gene",
    "sequence",
    "optimal_secondary_structure",
    "thermodynamic_ensemble",
    "centroid_secondary_structure",
    "optimal_secondary_structure_mfe",
    "thermodynamic_ensemble_fe",
    "centroid_secondary_structure_mfe",
    "frequency_of_mfe_structure",
    "ensemble_diversity",
]

# Build one table per region and index each by its record header ("gene").
cds_df, utr3_df, utr5_df = (
    pd.DataFrame(rows, columns=columns).set_index("gene")
    for rows in (cds_data, utr3_data, utr5_data)
)

In [15]:
# Write each region's table to a CSV file in the current working directory.
for csv_name, frame in (
    ("cds_RNAfold.csv", cds_df),
    ("utr3_RNAfold.csv", utr3_df),
    ("utr5_RNAfold.csv", utr5_df),
):
    frame.to_csv(csv_name)

In [16]:
# Bare last expression: the notebook renders the utr3 table as this cell's
# output, for a visual check of the parsed values.
utr3_df

Unnamed: 0_level_0,sequence,optimal_secondary_structure,thermodynamic_ensemble,centroid_secondary_structure,optimal_secondary_structure_mfe,thermodynamic_ensemble_fe,centroid_secondary_structure_mfe,frequency_of_mfe_structure,ensemble_diversity
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AKT3-217 utr3:protein_coding,GUCUCUUUCAUUCUGCUACUUCACUGUCAUCUUCAAUUUAUUACUG...,.............((..((((.((((((((.((((.........))...,.............{(..((((.((((((((.((((.........))...,.............((..((((.((((((((.((((.........))...,-1483.6,-1578.76,-1158.74,8.837210000000001e-68,1389.46
CD9-201 utr3:protein_coding,AGUCAGCUUACAUCCCUGAGCAGGAAAGUUUACCCAUGAAGAUUGG...,..((((...((((....((((((((((.((((((.((....)).))...,"..,(((,..({{....,((((((((((.,,,,(({((....,,.})...",.................(((((((((.......................,-88.4,-99.43,-44.9,1.67752e-08,149.21
CLIP1-203 utr3:protein_coding,UGAAGCCUCCAGUGGAGAACUGGGCUUGCUCAGACGCACUCGCAUU...,...((((.(((((.....)))))((.(((......)))...))......,".{{((((.{((((,,,,.}||||(,.{{|,,,,..}},,,,}),,,...",....(((.(((((.....)))))..........................,-294.1,-323.18,-217.6,3.24295e-21,349.61
DCP1A-206 utr3:protein_coding,CUGGAGCAGAAUAAGUCUAAAGGCAGAGCCCCAGCCUCAGAGACAA...,..((((((((((..((((..((((.........))))...)))).....,",{(({(((({,.,,((({..{(({,,,,,,.,,}}}|.,,||}}.....",..............((((...(((.........)))....)))).....,-1185.1,-1257.95,-737.3,4.65276e-52,1169.14
EHD4-201 utr3:protein_coding,GGGGUGGGCUGCAGAACGGGGUGGGAACUGGGGGACCUGGGCCUCA...,.((((((((((.....((((((((((..((((((((((((((((.....,",(((.(((((({((..((((((((((..((((((((((((((((.....",.....(((((.(((..((((((((((..((((((((((((((((.....,-1642.28,-1723.3,-1245.56,8.08387e-58,1130.33
EIF2AK2-201 utr3:protein_coding,AGCCCUUCUGAAAAAGUAUCCUGCUUCUGAUAUGCAGUUUUCCUUA...,.....((((((...(((.((((((((((((.((((((((.....((...,".....((((((...{{{.((((((((((,.,,,((((,,,.........",.....((((((...(((.(((((((........................,-2613.6,-2738.45,-2306.17,1.05915e-88,1378.77
HSPH1-209 utr3:protein_coding,AUAACCUUAAAUUGGCCUAUUCCUUCAAUUAAUAAAAUAUUUUUGC...,(((((.......((((............................))...,"....({.......||.{{{{(((.,.,....,.,{{{{{,,,,,.....",..................(((((..........................,-141.4,-157.8,-94.57,2.77177e-12,199.08
IARS1-204 utr3:nonsense_mediated_decay,CAAUAGGUUGGACCUUUUAAAGCUGAAGAGUGUUGUCACUAGCAUU...,.((((((..(((((.......(((((((((((....))))((((((...,"....{((,.{{{|||,{,..,(((,{{{,(((((,{{{(,{,({.....",.................................................,-252.7,-270.86,-195.08,1.60827e-13,267.29
LAP3-210 utr3:protein_coding,UUCAGAUACUCAAAAAUGUCUUCACUCUGUCUUAAAUUGGACAGUU...,(((((...((((....(((((((...((((((......)))))).....,(((((...((((....(((((((...((((((......)))))).....,(((((...((((....(((((((...((((((......)))))).....,-66.2,-72.28,-49.56,5.19887e-05,53.81
MAP3K5-201 utr3:protein_coding,CUGUUGCUCAAUCUAAUCUUCGAUGGAAAUUCUAAAAAUUAAUACA...,.((((((((..(((..((((((((.....(((((....((((...(...,".((((({((..(({..{{{{((((,|,..,,{{...,,{{{{,,,{...",.((((((((..(((..((((((((.........................,-153.8,-165.9,-111.7,2.99882e-09,206.17
