# In [15]:
from pathlib import Path
import pandas as pd

# In [18]:
# Directory that holds one sub-directory per structure "type". Each type
# directory is scanned below for *.tsv score tables and for per-gene folders.
root = Path("../output_yeast_post_threshold/structures/")

# Path.iterdir() yields entries in arbitrary order; sort the names so that the
# row order of the combined table is reproducible across runs and machines.
types = sorted(path.name for path in root.iterdir() if path.is_dir())

# Maps the values found in the "structure" column (the non-"gene" column
# names of the score tables) to the folder name used under <gene>/vienna/.
struct_aliases = {
    "VS": "insilico",
    "VC": "consensus",
    "VG": "consensus_guided",
}

# Load every tab-separated score table of every structure type and reshape it
# to long format: one row per (gene, structure) with columns
# gene / structure / score / type.
frames = []
for struct_type in types:
    # Sorted so the row order does not depend on the filesystem listing order.
    for table_path in sorted((root / struct_type).iterdir()):
        if table_path.suffix != ".tsv":
            continue
        table = pd.read_csv(table_path, sep="\t").melt(
            id_vars=["gene"],
            var_name="structure",
            value_name="score",
        )
        table["type"] = struct_type
        frames.append(table)

# Concatenate once: growing the frame inside the loop recopies all previous
# rows on every iteration. pd.concat raises on an empty list, so fall back to
# an empty frame when no table was found. ignore_index gives unique row labels.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Both derived columns start at 0.0; "IA" is filled in below, "CS" is left as
# a placeholder in this step.
df["IA"] = 0.0
df["CS"] = 0.0

# IA = number of "x" characters in the constraint line divided by the length
# of the sequence line, read from <root>/<type>/<gene>/constrained.fasta.
for struct_type in types:
    for gene_dir in (root / struct_type).iterdir():
        # Only non-file entries (the per-gene folders) are processed.
        if gene_dir.is_file():
            continue
        with open(gene_dir / "constrained.fasta") as f:
            lines = f.readlines()
        # The first line is skipped (presumably the FASTA header -- confirm);
        # the second line is the sequence and the third the constraint string.
        seq = lines[1].strip()
        con = lines[2].strip()
        ia = con.count("x") / len(seq)
        # The constraint file belongs to one specific type directory, so only
        # rows of that type receive its value. Matching on the gene name alone
        # would let the last type visited overwrite the value for every type.
        mask = (df["gene"] == gene_dir.name) & (df["type"] == struct_type)
        df.loc[mask, "IA"] = ia

def _read_constraint(fasta_path):
    """Return the (sequence, constraint) lines of a constrained FASTA file.

    The second and third lines of the file are returned with surrounding
    whitespace stripped; the first line is skipped (presumably the header).
    """
    with open(fasta_path) as f:
        lines = f.readlines()
    return lines[1].strip(), lines[2].strip()


def _read_mfe_structure(mfe_path):
    """Return the structure string stored in an ``MFEs.txt`` file.

    Lines starting with ">" are dropped; of the remaining lines the second one
    is used, and only the text before its first space is kept.
    """
    with open(mfe_path) as f:
        body = [line for line in f if not line.startswith(">")]
    return body[1].split(" ")[0]


def _constraint_score(seq, con, mfe):
    """Return the CS value for one sequence / constraint / structure triple.

    CS = (nu * nc) / (n_ac * (nu + 1)) where, after recoding the constraint
    ("." -> "?", then "x" -> "."):
      nu   = number of "." characters in the recoded constraint,
      nc   = number of positions where the recoded constraint and the
             structure string carry the same character,
      n_ac = number of "A" and "C" characters in the sequence.

    Returns NaN when the denominator is zero (no "A"/"C" in the sequence),
    since the score is undefined there.
    """
    # Order matters: original "." must become "?" before "x" becomes ".".
    recoded = con.replace(".", "?").replace("x", ".")
    n_unconstrained = recoded.count(".")
    # zip() stops at the shorter string, so only overlapping positions count.
    n_match = sum(c == m for c, m in zip(recoded, mfe))
    n_ac = seq.count("C") + seq.count("A")
    denominator = n_ac * (n_unconstrained + 1)
    if denominator == 0:
        return float("nan")
    return (n_unconstrained * n_match) / denominator


# Compute CS for every row. The constraint file depends only on (type, gene),
# so it is read once and reused for each structure of that gene.
constraint_cache = {}
cs_values = []
for row in df.itertuples(index=False):
    gene_dir = root / row.type / row.gene
    key = (row.type, row.gene)
    if key not in constraint_cache:
        constraint_cache[key] = _read_constraint(gene_dir / "constrained.fasta")
    seq, con = constraint_cache[key]
    mfe = _read_mfe_structure(
        gene_dir / "vienna" / struct_aliases[row.structure] / "MFEs.txt"
    )
    cs_values.append(_constraint_score(seq, con, mfe))

# Assign the whole column at once instead of rebuilding the frame row by row.
df["CS"] = cs_values
df.to_csv(root / "combined.csv", index=False)