In [6]:
#----------GMMA00---------------

import numpy as np
import pandas as pd

# Set options
pd.set_option("display.width", 200)
pd.set_option("display.precision", 4)

# Read data from file. The file's own header line is skipped and replaced by
# the fixed column names below.
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
file = "/Users/liza/Desktop/Bioinfo Project/amino_acid_genotypes_to_brightness.tsv"
print("\nReading", file)
d = pd.read_csv(file, sep="\t", header=None, skiprows=1, names=["seq", "N", "signal", "sd"])

# Split mutations. Row 0 is treated as the wild type and is left out of the parsing.
mut_list = d["seq"].iloc[1:].str.split(":")
# Drop the first character of every mutation token (the leading "S" of each substitution)
mut_list = mut_list.apply(lambda tokens: [m[1:] for m in tokens])

# Combine substitutions into a single string again, now without the leading
# character in front of each mutation
subst = mut_list.apply(":".join)
# Put the wild-type label explicitly at row 0 so that the variant column lines
# up with d row by row, instead of relying on index sorting when the frame is built
subst = pd.concat([pd.Series(["WT"], index=d.index[:1]), subst])

# Create data frame (column order here defines the column order of the output file)
df = pd.DataFrame({"variant": subst, "bright": d["signal"], "n_syn": d["N"], "std": d["sd"]})

# Write data to file (no header line, no index column)
outfile = "amino_acid_genotypes_to_brightness_parsed.tsv"
df.to_csv(outfile, sep="\t", index=False, header=False)

# Report what was actually written: one wild-type row plus one row per variant
print(f"Dumped {outfile} with {len(mut_list)} variants plus the wild-type row")

# Summary: prepares the original file by relabelling the first row (WT),
# stripping the leading character from every mutation and building a new data frame.


Reading /Users/liza/Desktop/Bioinfo Project/amino_acid_genotypes_to_brightness.tsv
Dumped amino_acid_genotypes_to_brightness_parsed.tsv with 54025 substitutions and 54024 variants


In [8]:
#----------GMMA01---------------

# Continue with the output file written by the GMMA00 cell
file = "amino_acid_genotypes_to_brightness_parsed.tsv"

print("Reading", file)

# Read data
try:
    d = pd.read_table(file, header=None)
except FileNotFoundError:
    print("Error: File not found.")
    # Re-raise instead of exit(): nothing below can run without the data, and
    # exit() inside a notebook shuts the kernel down instead of showing the error
    raise

# Column names in the order GMMA00 writes them: variant, brightness, count, std
d.columns = ["seq", "signal", "N", "sd"]

# The first row is either a wild-type label ("WT", as written by GMMA00) or,
# NOTE(review): presumably, the wild-type sequence itself -- confirm the intended
# input format. A label or a missing value yields an empty sequence.
first_entry = d.iloc[0, 0]
if isinstance(first_entry, str) and first_entry != "WT":
    wt_seq = first_entry.lower()
else:
    wt_seq = ""

# Wild-type average log brightness, number of WT measurements, and standard deviation.
# Taken from the first row BEFORE that row is removed from the variant table.
wt = d.iloc[0].copy()
wt["seq"] = ""

# Remove first row (wild type or label); the remaining rows keep their labels 1..N
d = d.iloc[1:]

print("Making data structures")

# Settings
settings = {}
# Number of trailing characters of a substitution name that encode the target amino acid
settings["taa_letters"] = 1

# Per mutant data
def split_mut_list(seq):
    """Return the substitutions of one variant as a list.

    A value without a ":" separator is returned as a one-element list
    holding the value itself; otherwise the value is split on ":".
    """
    if ":" not in seq:
        return [seq]
    return seq.split(":")

# One list of substitution names per variant (missing names become [""])
mut_list = d["seq"].fillna('').apply(split_mut_list)

# Per-variant table, labelled mut00001, mut00002, ...
# Every column is passed as a plain array/list so that values are placed row by
# row. Assigning a Series that still carries d's row labels to a frame labelled
# "mutNNNNN" would align on the index and fill the column with NaN.
mutant = pd.DataFrame(
    {
        "i": range(1, len(mut_list) + 1),
        "signal": d["signal"].to_numpy(),
        "N_sub": [len(sub) for sub in mut_list],
        "N_obs": d["N"].to_numpy(),
        "sd_obs": d["sd"].to_numpy(),
        "subst": mut_list.str.join(":").to_numpy(),
    },
    index=["mut{:05d}".format(i) for i in range(1, len(mut_list) + 1)],
)

# Per substitution data: count in how many variants each substitution name occurs
subst_table = mut_list.explode().value_counts()
nsubst = len(subst_table)

# Make data structure of substitutions, one row per unique substitution name
subst = pd.DataFrame({"subst_table": subst_table})
subst["resi"] = 0
subst["taa"] = ""
subst["obs"] = subst["subst_table"]
subst["signal"] = pd.NA
subst.index.name = "Var1"
# Sort by name and renumber the rows 0..nsubst-1 so that row label and row
# position agree; label-based and position-based access then hit the same row
subst = subst.reset_index().sort_values("Var1").reset_index(drop=True)
subst["i"] = range(1, nsubst + 1)

# Assign residue and taa values.
# A name is read as <first character><residue number><taa_letters characters>.
# Names whose middle part is not a number (e.g. an empty name) keep resi 0 and taa "".
taa_letters = settings["taa_letters"]
names = subst["Var1"].astype(str)
resi_parsed = pd.to_numeric(names.str[1:-taa_letters], errors="coerce")
# Stored as integers so that comparisons against integer residue numbers work
subst["resi"] = resi_parsed.fillna(0).astype(int)
subst["taa"] = names.str[-taa_letters:].where(resi_parsed.notna(), "")
# Per residue data: one row per character of the wild-type sequence.
# The default 0-based row labels are kept, so row k describes residue number k + 1.
residue = pd.DataFrame(list(wt_seq), columns=["wt"])
residue["subst"] = ""
residue["N_mut"] = pd.NA

# Assign substitutions: for every residue number collect the target amino acids
# observed at that residue and the total number of observations.
# Residue numbers are 1-based while the rows are 0-based, hence the separate row counter.
for row, ri in enumerate(range(1, len(residue) + 1)):
    at_residue = subst["resi"] == ri
    residue.at[row, "subst"] = subst.loc[at_residue, "taa"].str.cat(sep="")
    residue.at[row, "N_mut"] = subst.loc[at_residue, "obs"].sum()

# Build index translation lists between data frames.
# Everything is derived from mut_list (exact substitution names per variant) in a
# single pass. Substring patterns such as ":K3R:" would miss the first and last
# substitution of every variant and cannot express "any substitution at residue 3".
print("")
print("Building subst_mut_indices")

taa_letters = settings["taa_letters"]
muts_by_subst = {}  # substitution name -> ascending variant positions carrying it
muts_by_resi = {}   # residue number    -> set of variant positions mutated there
for pos, tokens in enumerate(mut_list):
    for tok in set(tokens):
        muts_by_subst.setdefault(tok, []).append(pos)
        resi_txt = tok[1:-taa_letters]
        if resi_txt.isdigit():
            muts_by_resi.setdefault(int(resi_txt), set()).add(pos)

# For every residue (1-based number, stored at list position number - 1):
# labels of the variants that carry a substitution at that residue
res_mut_indices = [
    mutant.index[np.asarray(sorted(muts_by_resi.get(ri, ())), dtype=np.intp)]
    for ri in range(1, len(residue) + 1)
]

# For every variant: row labels of its substitutions in subst, in subst row order
row_of_subst = {name: row for row, name in enumerate(subst["Var1"])}
mut_subst_indices = [
    subst.index[np.asarray(sorted({row_of_subst[t] for t in tokens if t in row_of_subst}), dtype=np.intp)]
    for tokens in mut_list
]

# For every substitution (looked up by row label 0..nsubst-1 of subst):
# 0-based positions of the variants that contain exactly this substitution
name_of_subst = subst["Var1"].to_dict()
subst_mut_indices = [
    np.asarray(muts_by_subst.get(name_of_subst[si], []), dtype=np.intp)
    for si in range(nsubst)
]

print("done building subst_mut_indices")

print("Successfully generated indexed data structures of {} variants carrying {} unique substitutions".format(
    len(mut_list), nsubst))

# Author's note: this appears to have worked -- 54024 variants (the input file has
# one header line and one WT line) with 1879 substitutions, which matches expectation.

Reading amino_acid_genotypes_to_brightness_parsed.tsv
Making data structures

Building subst_mut_indices
done building subst_mut_indices
Successfully generated indexed data structures of 54024 variants carrying 1879 unique substitutions


In [None]:
#----------GMMA02---------------