In [1]:
import os
import pandas as pd

In [2]:
# Root directory scanned by the loading cell: each immediate sub-directory is
# searched for files ending in ".out".
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
base_dir = "/mnt/d/ebola/RNAfold1"

In [3]:
def _parse_structure_line(line, label):
    """Split one structure line into ``(structure, energy)``.

    The line is expected to be ``<structure> <bracketed annotation>`` where the
    annotation starts with the energy, e.g. ``(-81.20)``, ``[ -11.53]`` or
    ``{ -0.80 d=16.92}``. Padding spaces inside the brackets are tolerated.

    Args:
        line: One text line of the record.
        label: Human-readable name of the line, used only in error messages.

    Returns:
        Tuple ``(structure, energy)`` with ``energy`` as a float.

    Raises:
        ValueError: If the line has no annotation or the energy is not numeric.
    """
    fields = line.split(None, 1)
    if len(fields) != 2:
        raise ValueError(
            "{} line has no energy annotation: {!r}".format(label, line)
        )
    structure, annotation = fields

    # Remove the enclosing bracket pair first and only then tokenise, so the
    # sign and leading digit of the energy are never sliced off, no matter how
    # many padding spaces follow the opening bracket.
    tokens = annotation.strip().strip("()[]{}").split()
    if not tokens:
        raise ValueError(
            "{} line has an empty energy annotation: {!r}".format(label, line)
        )
    return structure, float(tokens[0])


def split_sequence(sequence):
    """Parse one ``>``-delimited record of a folding output file.

    The record (text following a ``>``) must contain at least six lines:
    header, nucleotide sequence, optimal structure with ``(energy)``,
    ensemble structure with ``[energy]``, centroid structure with
    ``{energy d=...}``, and a statistics line of the form
    ``"... <frequency>; ... <diversity>"``.
    NOTE(review): this looks like RNAfold partition-function output — confirm.

    Args:
        sequence: Text of a single record, without the leading ``>``.

    Returns:
        A 10-tuple ``(header, sequence, oss, tde, css, oss_mfe, tde_fe,
        css_mfe, frequency, ensemble_diversity)``; the first five items are
        strings and the last five are floats.

    Raises:
        ValueError: If the record has too few lines or a numeric field cannot
            be parsed.
    """
    split = sequence.split("\n")
    if len(split) < 6:
        raise ValueError(
            "record has {} line(s), expected at least 6: {!r}".format(
                len(split), split[0]
            )
        )

    header = split[0]
    nucleotides = split[1]

    oss, oss_mfe = _parse_structure_line(split[2], "optimal structure")
    tde, tde_fe = _parse_structure_line(split[3], "thermodynamic ensemble")
    css, css_mfe = _parse_structure_line(split[4], "centroid structure")

    # Statistics line: two "; "-separated clauses, each ending in its value.
    stats = split[5].strip().split("; ")
    if len(stats) < 2:
        raise ValueError(
            "statistics line is not of the form '<...>; <...>': {!r}".format(split[5])
        )
    frequency = float(stats[0].split()[-1])
    ensemble_diversity = float(stats[1].split()[-1])

    return (
        header,
        nucleotides,
        oss,
        tde,
        css,
        oss_mfe,
        tde_fe,
        css_mfe,
        frequency,
        ensemble_diversity,
    )

In [4]:
# Parsed records, one list per region; each entry is the tuple returned by
# split_sequence for one ".out" file.
cds_data = []
utr3_data = []
utr5_data = []

# os.listdir returns entries in arbitrary order, so both listings are sorted to
# make the row order of the resulting tables reproducible across runs/machines.
for gene_folder in sorted(os.listdir(base_dir)):
    gene_folder_path = os.path.join(base_dir, gene_folder)
    if not os.path.isdir(gene_folder_path):
        continue

    for file_name in sorted(os.listdir(gene_folder_path)):
        if not file_name.endswith(".out"):
            continue

        out_file_path = os.path.join(gene_folder_path, file_name)
        # Read the whole file, then close it before parsing.
        with open(out_file_path, "r") as file:
            content = file.read()

        # Element 0 is whatever precedes the first ">"; records start at 1.
        sequences = content.split(">")
        if len(sequences) < 4:
            raise ValueError(
                "{}: expected at least 3 '>' records, found {}".format(
                    out_file_path, len(sequences) - 1
                )
            )

        # Records are assigned by position: 1st -> cds, 2nd -> utr3, 3rd -> utr5.
        # NOTE(review): the record header is not checked against the region —
        # confirm every file really uses this order.
        seq1 = split_sequence(sequences[1])
        seq2 = split_sequence(sequences[2])
        seq3 = split_sequence(sequences[3])

        cds_data.append(seq1)
        utr3_data.append(seq2)
        utr5_data.append(seq3)

In [5]:
# Column labels, in the same order as the fields of each parsed record tuple.
columns = [
    "gene", "sequence",
    "optimal_secondary_structure",
    "thermodynamic_ensemble",
    "centroid_secondary_structure",
    "optimal_secondary_structure_mfe",
    "thermodynamic_ensemble_fe",
    "centroid_secondary_structure_mfe",
    "frequency_of_mfe_structure",
    "ensemble_diversity",
]

# One table per region, indexed by the record header (the "gene" column).
cds_df = pd.DataFrame(cds_data, columns=columns).set_index("gene")
utr3_df = pd.DataFrame(utr3_data, columns=columns).set_index("gene")
utr5_df = pd.DataFrame(utr5_data, columns=columns).set_index("gene")

In [6]:
# Write each region's table to its own CSV file (paths are relative, so the
# files land in the current working directory).
for frame, csv_path in (
    (cds_df, "cds_RNAfold_not_found.csv"),
    (utr3_df, "utr3_RNAfold_not_found.csv"),
    (utr5_df, "utr5_RNAfold_not_found.csv"),
):
    frame.to_csv(csv_path)

In [7]:
# Display the table built from utr3_data (second record of each .out file).
utr3_df

Unnamed: 0_level_0,sequence,optimal_secondary_structure,thermodynamic_ensemble,centroid_secondary_structure,optimal_secondary_structure_mfe,thermodynamic_ensemble_fe,centroid_secondary_structure_mfe,frequency_of_mfe_structure,ensemble_diversity
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BCL7A-201 utr3:protein_coding,ACGAUGCUUUAAAGCCUCCGAUCCAUGUUCCAUGGAAGGUACAUCA...,..((((((....)))......((((((...))))))((((.(((((...,"..{,((((.....((((....((((((...)))))))))).....}...",.....(((.....((((....((((((...)))))))))).....)...,-969.88,-1017.75,-728.45,1.8627399999999999e-34,750.89
CCND3-203 utr3:protein_coding,CCCUGGAGAGGCCCUCUGGAGUGGCCACUAAGCAGAGGAGGGGCCG...,.(((((((.((((((....)).)))).....((((.((((((((.....,.(((((((.((((((....)).)))).....((((.((((((((.....,.(((((((.((((((....)).)))).....((((.((((((((.....,-358.5,-373.56,-287.2,2.42506e-11,265.57
SAMHD1-221 utr3:protein_coding,AUGUCUGUAGUCAGUUGUUUACAAACUCCCUCUCCUGCACAAUUCA...,...((((.(((((((((((.......(((((..(((....(((((....,"..,,,{{,,(.((((((((.......(((((..(((....(((((....",...........((((((((.......(((((..(((....(((((....,-918.51,-965.71,-733.81,5.47125e-34,766.85
STAT1-201 utr3:protein_coding,AGCAUGAAUUUUUUUCAUCUUCUCUGGCGACAGUUUUCCUUCUCAU...,.((((((..........(((((....((..((((((((((((.......,".((((((.......,,.(((((...,{{..((((((((((((.......",.((((((...................((..((((((((((((.......,-362.67,-393.06,-287.54,3.8276000000000003e-22,374.71
UBXN1-202 utr3:protein_coding,GGGCCUUUGUCCCAUUGUCCCUCUGUGACCCCUUCAUCUUUGAUAA...,((((....))))...((((.....(((......(((....)))......,"((((....},)),..{{{{.....(((......(((....)))......",...(....).......((......(((......(((....)))......,-9.9,-11.53,0.8,0.0711062,16.92
ZC3HAV1-201 utr3:protein_coding,AACCGAUGAAUACAGCGUCAGAAGGAUGCCAUAACCAUUCUGUUCC...,..((((((.((.(((.((..((((.((((........((((((......,"..((((((,((,(((,((,.((((,{(((.,......{{((((......",..(((((.....(((.((....((.((((.........(((((......,-1043.2,-1118.79,-766.66,5.39152e-54,1091.04


In [8]:
# Display the table built from cds_data (first record of each .out file).
cds_df

Unnamed: 0_level_0,sequence,optimal_secondary_structure,thermodynamic_ensemble,centroid_secondary_structure,optimal_secondary_structure_mfe,thermodynamic_ensemble_fe,centroid_secondary_structure_mfe,frequency_of_mfe_structure,ensemble_diversity
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BCL7A-201 cds:protein_coding,AUGUCGGGCAGGUCGGUUCGAGCCGAGACGAGGAGCCGGGCCAAAG...,..(((.(((.((.....))..)))..))).....((((((((...(...,"{((((..((.((((((((((((((.{((({....((((((((...,...",(((((..((.((((((((((((((.(((((...................,-195.0,-203.9,-145.65,5.31484e-07,185.04
CCND3-203 cds:protein_coding,AUGGAGCUGCUGUGUUGCGAAGGCACCCGGCACGCGCCCCGGGCCG...,..((((.(((((.(((((...(((.((.(((..((((((((((..(...,..((((.(((((.(((((...(((.((.(((..((((((((((..(...,..((((.(((((.(((((...(((.((.(((..((((((((((..(...,-366.5,-378.57,-318.21,3.10714e-09,181.02
SAMHD1-221 cds:protein_coding,AUGCAGCGAGCCGAUUCCGAGCAGCCCUCCAAGCGUCCCCGUUGCG...,.....((((((....((((((.(((.(((((((.(((.((.(((((...,"..,{,((,,{{,,..,,{{((|((({((((..,.((({.,{{((((...",.........................(((((...............(...,-506.1,-539.09,-338.39,5.6502899999999996e-24,495.43
STAT1-201 cds:protein_coding,AUGUCUCAGUGGUACGAACUUCAGCAGCUUGACUCAAAAUUCCUGG...,.(((((((...((.((((((.(((.(((((.(((..((((((((((...,".{((((((...((.((((((.(((.(((((.{(((,((((((((((...",.((.((((...((.((((((.(((.(((((.((.(..(((((((((...,-667.0,-705.03,-478.98,1.5981300000000001e-27,544.2
UBXN1-202 cds:protein_coding,AUGGCGGAGCUGACGGCUCUUGAGAGUCUCAUCGAGAUGGGCUUCC...,...((((((.(((.(((((....))))))))......((.((.(((...,"..,{.{{((((,..{{((({((,{{{{|||...}}}.,{|((|({{...",.................................................,-340.41,-353.24,-225.1,9.11696e-10,254.09
ZC3HAV1-201 cds:protein_coding,AUGGCGGACCCGGAGGUGUGCUGCUUCAUCACCAAAAUCCUGUGCG...,(((((((((((((.((((((......)).)))).......((((((...,"{((((((((((((.(((({(......}}.)))).....,(((((((...",(((((((((((((.((((((......)).))))................,-842.0,-886.49,-697.97,4.4856e-32,514.57


In [9]:
# Display the table built from utr5_data (third record of each .out file).
utr5_df

Unnamed: 0_level_0,sequence,optimal_secondary_structure,thermodynamic_ensemble,centroid_secondary_structure,optimal_secondary_structure_mfe,thermodynamic_ensemble_fe,centroid_secondary_structure_mfe,frequency_of_mfe_structure,ensemble_diversity
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BCL7A-201 utr5:protein_coding,GCACUGGGCCAGGCGCGCGGCGGCCCCGGGCUUUGUGUGUGUGUGU...,((((...((((.((((((((.((((...)))))))))))).)).))...,"((((.{.((((.((((((({.{((,.{{,||,.,,,))}),}),}}...",.................................................,-81.2,-85.18,-44.8,0.001556,80.2
CCND3-203 utr5:protein_coding,GUCAGGGAAGCGGCGCGCGCGCGCGGGCGGCGGGCGGGCUGGGGAU...,.....((..((((((.(((.(((.(((((((((((((((((((((....,"..,.{((..((((((.(((.(((.(((((((((((((((((((((....",....((...((((((.(((.(((.(((((((((((((((((((((....,-79.6,-81.79,-74.3,0.02879,37.65
SAMHD1-221 utr5:protein_coding,AGUGCGCCUGCGCGCGGGUCCGGCGCCGAGGUUCUUGACUGCUGUG...,..((((((((.((....(((((((((...((((...))))...)))...,"..((((((((.((,...{((((((((...((((...))))...)))...",..((((((((.((....(((((((((...((((...))))...)))...,-33.4,-34.29,-33.4,0.237647,6.6
STAT1-201 utr5:protein_coding,GUUUCGCUUUCCUGCGCAGAGUCUGCGGAGGGGCUCGGCUGCACCG...,.....(((((.((((((..((....((((((((((.(((..........,".....{{{{,.{{((((((.(((..((((((((((.(((......,...",.............((((((.(((..((((((((((.(((..........,-129.0,-134.66,-109.1,0.000102,55.1
UBXN1-202 utr5:protein_coding,CUUCUCGUCGGUGUUCCCGGCUGCUAUAGAGCCGGGUGAGAGAGCG...,.........(((((..((((((.......))))))(((((((((.....,".....,,,,(((((.,((((((.......))))))(((((((({.,...",.........(((((..((((((.......))))))((((((((......,-67.9,-70.05,-65.2,0.030581,22.78
ZC3HAV1-201 utr5:protein_coding,GCUUUUAGUUUCUCUUCUUUCUAAAGAAGGCUCGCGGAGCCCGGCU...,((............(((((....)))))(((((((((.((...(((...,"{{,..........,(((({....)))))|(({(((((.((...(((...",..............(((((....)))))(((((((((.((...(((...,-146.1,-151.44,-137.4,0.000173,60.45
