In [1]:
import os
import pandas as pd

In [2]:
# Root directory scanned by the loading cell: each sub-directory is walked and
# every file ending in ".out" inside it is parsed.
base_dir = "/mnt/d/ebola/data/RNAfold/RNAfold_found_salt_corrected"

In [3]:
def _parse_structure_line(line):
    """Split one structure line into ``(structure, energy)``.

    The line is expected to look like ``<structure> <open><energy>...<close>``
    where the bracket pair is ``()``, ``[]`` or ``{}`` and the energy may be
    left-padded with spaces (``( -1.20)``) or followed by another field
    (``{-12.30 d=1.23}``).

    Raises:
        ValueError: if no energy follows the structure or it is not a number.
    """
    # The structure string never contains a space, so the first space
    # separates it from the bracketed annotation.
    structure, _, annotation = line.strip().partition(" ")
    # Remove the enclosing brackets first and only then split on whitespace:
    # this keeps the sign of the energy no matter how many padding spaces
    # sit between the opening bracket and the number.
    fields = annotation.strip().strip("()[]{}").split()
    if not fields:
        raise ValueError("no energy found in structure line: {!r}".format(line))
    return structure, float(fields[0])


def split_sequence(sequence):
    """Parse one record of RNAfold output into a flat tuple.

    ``sequence`` is the text of a single record without its leading ``>``
    (presumably produced by ``RNAfold -p``; confirm against the pipeline).
    Its lines are read by position:

    0. header
    1. nucleotide sequence
    2. optimal structure and its energy in ``(...)``
    3. ensemble structure and its free energy in ``[...]``
    4. centroid structure and its energy in ``{... d=...}``
    5. ``... <frequency>; ... <ensemble diversity>``

    Returns:
        tuple: ``(header, sequence, oss, tde, css, oss_mfe, tde_fe, css_mfe,
        frequency, ensemble_diversity)`` where the first five items are
        strings and the last five are floats.

    Raises:
        IndexError: if the record has fewer than six lines.
        ValueError: if an energy or summary value cannot be parsed.
    """
    lines = sequence.split("\n")
    header = lines[0]
    nucleotides = lines[1]

    oss, oss_mfe = _parse_structure_line(lines[2])
    tde, tde_fe = _parse_structure_line(lines[3])
    css, css_mfe = _parse_structure_line(lines[4])

    # Both summary values are the last whitespace-separated token of their
    # "; "-delimited half of the line.
    summary = lines[5].strip().split("; ")
    frequency = float(summary[0].split()[-1])
    ensemble_diversity = float(summary[1].split()[-1])

    return (
        header,
        nucleotides,
        oss,
        tde,
        css,
        oss_mfe,
        tde_fe,
        css_mfe,
        frequency,
        ensemble_diversity,
    )

In [9]:
# Parsed records, one tuple per sequence as returned by split_sequence().
cds_data = []
utr3_data = []
utr5_data = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the resulting tables reproducible across runs and machines.
for gene_folder in sorted(os.listdir(base_dir)):
    gene_folder_path = os.path.join(base_dir, gene_folder)
    if not os.path.isdir(gene_folder_path):
        continue

    for file_name in sorted(os.listdir(gene_folder_path)):
        if not file_name.endswith(".out"):
            continue

        out_file_path = os.path.join(gene_folder_path, file_name)
        # Read everything first so the handle is closed before parsing starts.
        with open(out_file_path, "r") as file:
            content = file.read()

        # The piece before the first ">" is element 0, so a file holding
        # N records yields N + 1 pieces.
        sequences = content.split(">")
        if len(sequences) < 2:
            raise ValueError(
                "no '>' record found in {}".format(out_file_path)
            )

        if len(sequences) != 4:
            # NOTE(review): any file that does not hold exactly three records
            # contributes only its first record; further records are ignored.
            seq1 = split_sequence(sequences[1])
            cds_data.append(seq1)
        else:
            # Records are assigned by position: 1st -> cds, 2nd -> utr3,
            # 3rd -> utr5. NOTE(review): confirm this order in the .out files.
            seq1 = split_sequence(sequences[1])
            seq2 = split_sequence(sequences[2])
            seq3 = split_sequence(sequences[3])

            cds_data.append(seq1)
            utr3_data.append(seq2)
            utr5_data.append(seq3)

In [10]:
# Column names, in the same order as the fields of each parsed record tuple.
columns = [
    "gene",
    "sequence",
    "optimal_secondary_structure",
    "thermodynamic_ensemble",
    "centroid_secondary_structure",
    "optimal_secondary_structure_mfe",
    "thermodynamic_ensemble_fe",
    "centroid_secondary_structure_mfe",
    "frequency_of_mfe_structure",
    "ensemble_diversity",
]

# One frame per record list, each indexed by the record header ("gene").
cds_df, utr3_df, utr5_df = (
    pd.DataFrame(records, columns=columns).set_index("gene")
    for records in (cds_data, utr3_data, utr5_data)
)

In [11]:
# Write each table to a CSV file in the current working directory.
for frame, csv_name in (
    (cds_df, "cds_RNAfold.csv"),
    (utr3_df, "utr3_RNAfold.csv"),
    (utr5_df, "utr5_RNAfold.csv"),
):
    frame.to_csv(csv_name)

In [12]:
# Display utr3_df as the cell output for a visual check of the parsed records.
utr3_df

Unnamed: 0_level_0,sequence,optimal_secondary_structure,thermodynamic_ensemble,centroid_secondary_structure,optimal_secondary_structure_mfe,thermodynamic_ensemble_fe,centroid_secondary_structure_mfe,frequency_of_mfe_structure,ensemble_diversity
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AKT3-217 utr3:protein_coding,GUCUCUUUCAUUCUGCUACUUCACUGUCAUCUUCAAUUUAUUACUG...,.............((..((((.((((((((.((((.........))...,".............{(,,((({.,,,(((((.((((.........))...",.........................(((((.((((.........))...,-1292.15,-1338.85,-985.44,1.22931e-33,1364.99
CD9-201 utr3:protein_coding,AGUCAGCUUACAUCCCUGAGCAGGAAAGUUUACCCAUGAAGAUUGG...,..((((.........)))).(((((.............((((((((...,"..,(((,..{,,....,{(((((((({.,,,,((,((....,,.})...",..................(((((((........................,-75.34,-81.47,-32.29,4.80853e-05,140.58
CLIP1-203 utr3:protein_coding,UGAAGCCUCCAGUGGAGAACUGGGCUUGCUCAGACGCACUCGCAUU...,..(((((((((((.....)))))....(((((((.((....))......,"..,((((.{((((,,,,.}|||||,.,{|,||,..||,,,.|}|,....",....(((.(((((.....)))))..........................,-246.5,-262.93,-156.51,2.64193e-12,383.0
EHD4-201 utr3:protein_coding,GGGGUGGGCUGCAGAACGGGGUGGGAACUGGGGGACCUGGGCCUCA...,.(((.(((((((((..((((((((((..((((((((((((((((.....,.(((.(((((((((..((((((((((..((((((((((((((((.....,.(((.(((((((((..((((((((((..((((((((((((((((.....,-1474.57,-1516.38,-1197.71,3.48161e-30,959.59
EIF2AK2-201 utr3:protein_coding,AGCCCUUCUGAAAAAGUAUCCUGCUUCUGAUAUGCAGUUUUCCUUA...,.....((((((...(((.((((((((((((.((((((((.....((...,".....((((((...{{{.((((((((((,.,,,((((,...........",.....((((((....((.((((((.........................,-2359.79,-2411.34,-2040.68,4.70474e-37,1312.43
HSPH1-209 utr3:protein_coding,AUAACCUUAAAUUGGCCUAUUCCUUCAAUUAAUAAAAUAUUUUUGC...,................(((((((..........................,"....,.......,||,{{(((((.............,,,..,,,,....",................(((((((..........................,-119.95,-129.95,-92.9,9.05371e-08,164.26
IARS1-204 utr3:nonsense_mediated_decay,CAAUAGGUUGGACCUUUUAAAGCUGAAGAGUGUUGUCACUAGCAUU...,....((((...)))).....((((.((((((((((....)))))))...,"....{{((,,,|}||,,,..,(((,{{{{((((({,,,,}|}||}}...",..............................(((........))).....,-214.02,-224.04,-149.85,8.67956e-08,255.85
LAP3-210 utr3:protein_coding,UUCAGAUACUCAAAAAUGUCUUCACUCUGUCUUAAAUUGGACAGUU...,(((((...((((....(((((((...((((((......)))))).....,(((((...((((....(((((((...((((((......)))))).....,(((((...((((....(((((((...((((((......)))))).....,-54.49,-57.93,-41.19,0.00375133,52.72
MAP3K5-201 utr3:protein_coding,CUGUUGCUCAAUCUAAUCUUCGAUGGAAAUUCUAAAAAUUAAUACA...,.((((((((..(((..((((((((.........................,".(((((,,,..{{,..,,,,(((({{{.........{{{{{{{{{(...",.(((((..............................((((((((((...,-129.29,-135.85,-80.08,2.39854e-05,204.42
MEX3A-202 utr3:protein_coding,GCCCCGUGCCCCAUGCCUCCGGGGCCCACUCCACUGGGCCCACCCU...,..(((...(((((((((((..(((((((......)))))))........,".,(({...(((((((((((..(((((((......))))))).,,,,...",..(((...(((((((((((..(((((((......)))))))........,-1522.65,-1559.49,-1196.59,1.1060799999999999e-26,-89553550000000.0
