<a href="https://colab.research.google.com/github/dyl4nm4rsh4ll/funsae/blob/master/data_prep_cont.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# settings

In [None]:
import copy, itertools, json, os, pickle, sys, time

In [None]:
from matplotlib import cm, colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import special
from scipy import signal as sig
from scipy.cluster.hierarchy import linkage, dendrogram, fclusterdata
from scipy.spatial.distance import jensenshannon, pdist, squareform, hamming
import scipy.stats as stats
import scipy.io as sio

In [None]:
# amino acid lookup tables: one-letter / three-letter alphabets (gap last)
aa_babel = {
  "alphabet": "ARNDCQEGHILKMFPSTWYV-",
  "tri_alphabet": (
    "ALA ARG ASN ASP CYS GLN GLU GLY "
    "HIS ILE LEU LYS MET PHE PRO SER "
    "THR TRP TYR VAL GAP"
  ).split(" ")
}
# one-letter code <-> position in the alphabet
aa_babel["a2i"] = dict(zip(aa_babel["alphabet"], range(len(aa_babel["alphabet"]))))
aa_babel["i2a"] = dict(enumerate(aa_babel["alphabet"]))
# three-letter code -> one-letter code / position
aa_babel["tri2a"] = {tri: one for tri, one in zip(aa_babel["tri_alphabet"], aa_babel["alphabet"])}
aa_babel["tri2i"] = dict(zip(aa_babel["tri_alphabet"], range(len(aa_babel["tri_alphabet"]))))

In [None]:
plt.style.use("default")

# methods

In [None]:
def con_mtx(f, chains, tri2a, rs, rs_ng, X_ng, diff_max):
  """confind output to contact matrix

  Args:
    f: path to a tab-separated, header-less contact listing read into the
      columns kind, i, j, con, ia, ja; "i"/"j" are parsed as
      "<chain>,<residue number>" and "ia"/"ja" are looked up in tri2a.
    chains: chain identifier(s); with one entry only contacts inside that
      chain are kept, with two entries contacts involving the second chain
      are kept as well.
    tri2a: three-letter -> one-letter amino acid lookup.
    rs: reference sequence (iterable of characters).
    rs_ng: positions of the reference to keep (applied first).
    X_ng: positions to keep inside the rs_ng selection (applied second).
    diff_max: mismatch tolerance forwarded to map_idx.

  Returns:
    None when the file cannot be read, otherwise a dict with
      "con": symmetric contact matrix restricted to rs_ng, then X_ng
      "con2X_ng": 2-column string array pairing each structure
        "index+residue" label with its renumbered label after both selections.
  """
  # measured contacts dataframe (best effort: unreadable file -> no structure)
  try:
    df = pd.read_csv(f, sep="\t", header=None, names=["kind", "i", "j", "con", "ia", "ja"])
  except Exception:
    return None

  def _resnum(label):
    """residue number from a "<chain>,<number>" label"""
    return int(label.split(",")[1])

    # wrt selected chains
  i_c0, j_c0 = df["i"].str.contains(chains[0]), df["j"].str.contains(chains[0])
  df_c0 = df[i_c0 & j_c0].copy()
  df_c0["i"] = df_c0["i"].apply(_resnum)
  df_c0["j"] = df_c0["j"].apply(_resnum)
  # homo- and hetero- dimers
  if len(chains) == 2:
    # NOTE(review): this block used to compute
    #   c0c1_offset = df_c0[["i", "j"]].astype("int").max().max()
    # without ever applying it, so residue numbers of the two chains are not
    # shifted apart and may collide -- confirm whether an offset is intended.
    i_c1, j_c1 = df["i"].str.contains(chains[1]), df["j"].str.contains(chains[1])
    df_c1 = df[(i_c1 & j_c0) | (i_c0 & j_c1) | (i_c1 & j_c1)].copy()
    df_c1["i"] = df_c1["i"].apply(_resnum)
    df_c1["j"] = df_c1["j"].apply(_resnum)
    # axis must be passed by keyword (positional form is rejected by pandas 2)
    df = pd.concat([df_c0, df_c1], axis=0)
  else: df = df_c0
  # clean
  df["ia"] = df["ia"].apply(lambda x: tri2a[x]).astype("str")
  df["ja"] = df["ja"].apply(lambda x: tri2a[x]).astype("str")

  # collate sequences / indices
    # reference indices / sequence
  rs_a = np.array(list("".join(rs).upper()))
  rs_i = np.arange(len(rs_a))
    # x-tal indices / sequence
  xt_i_a = {**dict(zip(df["i"], df["ia"])), **dict(zip(df["j"], df["ja"]))}
  xt_i = np.sort(list(xt_i_a.keys()))
    # missing indices (filled with gaps), sort
  xt_present = np.array(xt_i)
  xt_missing = np.setdiff1d(np.arange(np.max([np.max(xt_i), np.max(rs_i)])+1), xt_present)
  xt_i_a.update({i: "-" for i in xt_missing})
  xt_i_a = {i: xt_i_a[i] for i in np.sort(list(xt_i_a.keys()))}
    # x-tal clean
  xt_a = np.array(list(xt_i_a.values()))
  xt_i = np.sort(np.array(list(xt_i_a.keys())))
    # map
  xi2ri, xia2ria = map_idx(xt_a, xt_i, xt_missing, rs_a, rs_i, diff_max)

  # measured contacts, accumulated symmetrically on reference indices
  con = np.zeros((len(rs_a), len(rs_a)))
  df["i"], df["j"] = df["i"].map(xi2ri), df["j"].map(xi2ri)
  # residues without a reference position map to NaN and are dropped
  df = df[df["i"].notnull() & df["j"].notnull()]
  for i, j, val in zip(df["i"], df["j"], df["con"]):
    con[int(i), int(j)] += val
    con[int(j), int(i)] += val

  # collate, collect
  con = con[rs_ng, :][:, rs_ng][:, X_ng][X_ng, :]
    # index matching: keep labels surviving each selection, then renumber them
  con2rs = np.array(list(xia2ria.items()))
  con2rs_ng = con2rs[np.isin(np.array([int(x[:-1]) for x in con2rs[:, 1]]), rs_ng)]
  con2rs_ng[:, 1] = [str(i)+a[-1] for i, a in enumerate(con2rs_ng[:, 1])]
  con2X_ng = con2rs_ng[np.isin(np.array([int(x[:-1]) for x in con2rs_ng[:, 1]]), X_ng)]
  con2X_ng[:, 1] = [str(i)+a[-1] for i, a in enumerate(con2X_ng[:, 1])]
  return {"con2X_ng": con2X_ng, "con": con}

In [None]:
def ds_dms(rs, rs_ng, X_ng, dY, a2i, diff_max):
  """DMS data wrt DeepSequence

  Args:
    rs: reference sequence (iterable of characters).
    rs_ng: positions of the reference to keep (applied first).
    X_ng: positions to keep inside the rs_ng selection (applied second).
    dY: dataframe with a "mutant" column of ":"-separated substitutions
      written as <wild type><index><mutant>; DeepSequence prediction columns
      are shortened to "ds.*", every other column is prefixed with "dms.".
    a2i: one-letter amino acid -> column of the one-hot encoding.
    diff_max: maximum number of substitutions per mutant kept; also the
      mismatch tolerance forwarded to map_idx.

  Returns:
    dict with
      "dX": one-hot mutant tensor (mutants, len(X_ng), len(a2i))
      "dY": cleaned dataframe with added "seq_err" (wild-type disagreements
        between DMS and reference) and "hamm" (one-hot distance to reference)
      "dX2X_ng": 2-column string array pairing each DMS "index+residue" label
        with its renumbered label after both selections.
  """
  # clean
  dY_cols = {
    "mutant": "muts",
    "mutation_effect_prediction_independent": "ds.sw",
    "mutation_effect_prediction_pairwise": "ds.pw",
    "mutation_effect_prediction_vae_1": "ds.v1",
    "mutation_effect_prediction_vae_2": "ds.v2",
    "mutation_effect_prediction_vae_3": "ds.v3",
    "mutation_effect_prediction_vae_4": "ds.v4",
    "mutation_effect_prediction_vae_5": "ds.v5",
    "mutation_effect_prediction_vae_ensemble": "ds.vE"
  }
  dY_cols.update({c: "dms."+c for c in dY.columns if c not in dY_cols})
  dY = dY[[c for c in dY.columns if "Unnamed" not in c]].rename(columns=dY_cols)
  dY = dY[dY["muts"].str.count(":")+1 <= diff_max]
  # explicit copy: columns are added below, which must not target a slice
  dY = dY[~dY["muts"].str.contains("wt")].copy()

  # reference sequence / indices / tensor
  rs_a = np.array(list("".join(rs).upper()))
  rs_i = np.arange(len(rs_a))
  rX = np.zeros((len(rs_a), len(a2i)))
  for i, a in enumerate(rs_a): rX[i, a2i[a.upper()]] = 1

  # collate dms: every "<wild type><index>" seen in any mutant
  muts = dY["muts"]
  Nw_muts = np.concatenate(muts.apply(lambda x: [y[:-1] for y in x.split(":")]).values)
  dms_i_a = {int(x[1:]): x[0] for x in Nw_muts}
  dms_i = np.sort(list(dms_i_a.keys()))
    # missing indices (filled with gaps), sort
  dms_present = np.array(dms_i)
  dms_missing = np.setdiff1d(np.arange(1, np.max(dms_i)+1), dms_present)
  dms_i_a.update({i: "-" for i in dms_missing})
  dms_i_a = {i: dms_i_a[i] for i in np.sort(list(dms_i_a.keys()))}
    # sequence / indices / index mapping
  dms_i = np.sort(list(dms_i_a.keys()))
  dms_a = np.array([dms_i_a[i] for i in dms_i])
  di2ri, dia2ria = map_idx(dms_a, dms_i, dms_missing, rs_a, rs_i, diff_max)

  # dms tensors, rs vs dms mismatches
  dX, errors = [], []
  for mut in muts:
    tmp, error = rX.copy(), 0
    # N-wise order
    for m in mut.split(":"):
      i = di2ri[int(m[1:-1])]
      # wildtypes: count positions where DMS and reference disagree
      rs_wt_j = a2i[rs_a[i]]
      dms_wt_j = a2i[m[0]]
      if rs_wt_j != dms_wt_j: error += 1
      mut_j = a2i[m[-1]]
      tmp[i, rs_wt_j], tmp[i, mut_j] = 0, 1
    dX.append(tmp)
    errors.append(error)

  # collate
  dX = np.stack(dX)[:, rs_ng, :][:, X_ng, :]
  dY["seq_err"] = errors
  # each substitution flips two one-hot entries, hence the division by 2
  dY["hamm"] = np.abs(rX[rs_ng, :][None, X_ng, :]-dX).sum((1, 2))/2
    # index matching: keep labels surviving each selection, then renumber them
  dX2rs = np.array(list(dia2ria.items()))
  dX2rs_ng = dX2rs[np.isin(np.array([int(x[:-1]) for x in dX2rs[:, 1]]), rs_ng)]
  dX2rs_ng[:, 1] = [str(i)+a[-1] for i, a in enumerate(dX2rs_ng[:, 1])]
  dX2X_ng = dX2rs_ng[np.isin(np.array([int(x[:-1]) for x in dX2rs_ng[:, 1]]), X_ng)]
  dX2X_ng[:, 1] = [str(i)+a[-1] for i, a in enumerate(dX2X_ng[:, 1])]
  return {"dX2X_ng": dX2X_ng, "dX": dX, "dY": dY}

In [None]:
def get_depth(depth, chains, con2X_ng, tri2a):
  """depth of amino acids in structure

  Args:
    depth: path to a tab-separated residue depth table with the columns
      "# chain:residue" and "all-atom"; the row index is read from the file.
    chains: unused; kept so existing calls keep working.
    con2X_ng: 2-column array pairing structure "index+residue" labels with
      their alignment labels (as returned by con_mtx).
    tri2a: three-letter -> one-letter amino acid lookup.

  Returns:
    dict with "X_ng2depth": 2-column array of (alignment label, depth);
    labels without a depth entry get NaN.
  """
  raw = pd.read_csv(depth, sep="\t")
  # presumably the index holds "<one-letter chain>:<residue number>" labels,
  # hence the first two characters are skipped -- verify against the depth files
  res_num = pd.Series(raw.index.values).apply(lambda x: int(x[2:])).astype("str")
  res_aa = raw["# chain:residue"].map(tri2a).astype("str")
  # "index+residue" label -> all-atom depth
  ia2d = dict(zip([i+a for i, a in zip(res_num, res_aa)], raw["all-atom"]))
  X_ng2depth = {
    X_ng_ia: ia2d.get(con_ia, np.nan)
      for con_ia, X_ng_ia in dict(con2X_ng).items()
  }
  return {"X_ng2depth": np.array(list(X_ng2depth.items()))}

In [None]:
def h2f(x, fcns, out=""):
  """function from header

  Collects every entry of fcns that occurs in x (appended to out, one per
  line) and returns "other" when nothing was collected, "multiple" when more
  than one line was collected, otherwise the single collected text.
  """
  out += "".join(name+"\n" for name in fcns if name in x)
  if not out: return "other"
  if out.count("\n") > 1: return "multiple"
  return out.rstrip("\n")

In [None]:
def make_msa(seqs, a2i, gap=.5, Neff=.8):
  """construct Multiple Sequence Alignment (MSA)

  Args:
    seqs: equal-length sequences over the keys of a2i.
    a2i: character -> integer code; must contain the gap character "-".
    gap: columns whose gap fraction is below this value are kept.
    Neff: sequences at least this identical (on the kept columns) count as
      neighbours when weighting.

  Returns:
    dict with
      "X_ng": indices of the kept (non-gapped) columns
      "X": one-hot encoding of all columns, (sequences, columns, len(a2i))
      "W": per-sequence weight, 1 / number of neighbours (self included)
  """
  raw = np.array([[a2i[a] for a in seq] for seq in seqs])
  # builtin float: the np.float alias no longer exists in current NumPy
  gap_fraction = np.mean((raw == a2i["-"]).astype(float), 0)
  non_gap = np.where(gap_fraction < gap)[0]
  # pairwise identity calculated wrt "clean" indices
  identity = 1.0-squareform(pdist(raw[:, non_gap], "hamming"))
  # non-gapped columns, raw msa, effective sequence weights
  return {
    "X_ng": non_gap,
    "X": np.eye(len(a2i))[raw],
    "W": 1.0/np.sum(1.0*(identity >= Neff), -1)
  }

In [None]:
def map_idx(za, zi, missing, ra, ri, diff_max):
  """map a.a. (za) & indices (zi) to reference (ra, ri)

  Slides a window of the shorter sequence's length along the longer one and
  accepts the first offset whose weighted mismatch count is <= diff_max.

  Args:
    za, zi: residues (array of one-letter codes, "-" for unknown) and their
      indices for the sequence to be mapped.
    missing: unused; kept so existing calls keep working.
    ra, ri: residues and indices of the reference.
    diff_max: largest accepted mismatch count; windows shorter than this end
      the search.

  Returns:
    (zi2ri, zia2ria): index -> reference index, and "index+residue" label ->
    reference "index+residue" label, for the accepted window.

  Raises:
    AssertionError: no offset satisfies diff_max (message shows both sequences).
  """

  def _z2r(zi_tmp, ri_tmp, za_tmp, ra_tmp):
    """map seq to ref"""
    zia = [str(i)+a for i, a in zip(zi_tmp, za_tmp)]
    ria = [str(i)+a for i, a in zip(ri_tmp, ra_tmp)]
    return dict(zip(zi_tmp, ri_tmp)), dict(zip(zia, ria))

  def _hamm(length, ra_tmp, za_tmp):
    """hamming distance"""
    weights = np.ones(length)
    za_missing = np.where(za_tmp == "-")[0]
    # a window that is mostly unknown residues can never be accepted
    if len(za_missing) > len(za_tmp)/2: return np.inf
    # unknown residues barely contribute to the distance
    weights[za_missing] = 1e-7
    return int(length*hamming(ra_tmp, za_tmp, weights))

  # z subset r: window moves along the reference;
  # r subset z (or equal lengths): window moves along z
  z_shorter = len(za) < len(ra)
  width, n_starts = sorted((len(za), len(ra)))
  for start in range(n_starts):
    stop = start+width
    if z_shorter:
      ra_tmp, ri_tmp = ra[start:stop], ri[start:stop]
      length = len(ra_tmp)
      za_tmp, zi_tmp = za[:length], zi[:length]
    else:
      za_tmp, zi_tmp = za[start:stop], zi[start:stop]
      length = len(za_tmp)
      ra_tmp, ri_tmp = ra[:length], ri[:length]
    # the window shrinks near the end; stop once it is shorter than diff_max
    if length < diff_max: break
    if _hamm(length, ra_tmp, za_tmp) <= diff_max:
      return _z2r(zi_tmp, ri_tmp, za_tmp, ra_tmp)
  # ensure validity: raised explicitly so the check survives `python -O`
  raise AssertionError("\nra='"+"".join(ra)+"'\nza='"+"".join(za)+"'")

In [None]:
def parse_af(fname, alphabet, h_count=0, rs=""):
  """clean / parse an alignment file to headers & sequences

  Args:
    fname: path to a FASTA-like alignment (">" header lines followed by one
      or more sequence lines).
    alphabet: characters kept in the cleaned sequences.
    h_count: starting value of the header counter.
    rs: text the reference sequence is appended to.

  Returns:
    dict with
      "h": headers of the sequences whose cleaned length equals the most
        common cleaned length
      "s": the matching cleaned sequences
      "rs": characters of the unfiltered sequence lines of the first record
      "rs_ng": positions of "rs" whose character is in alphabet
  """
  # define, open
  headers, chunks = [], []
  with open(fname, "r") as lines:
    # parse
    for L in lines:
      L = L.rstrip()
      # header
      if L.startswith(">"):
        h_count += 1
        headers.append(L[1:])
        chunks.append([])
        continue
      # sequence text ahead of the first header belongs to no record
      if not chunks: continue
      # sequence
      chunks[-1].append("".join([c for c in L if c in alphabet]))
      # the first record is the reference; keep its lines unfiltered
      if h_count == 1: rs += L
  # collate, indices w/ length == mode lengths
  seqs = np.array(["".join(s) for s in chunks])
  seq_lengths = np.array([len(s) for s in seqs], dtype=int)
  if seq_lengths.size:
    # argmax of the counts = most common length (smallest one on ties)
    idx = np.where(seq_lengths == np.bincount(seq_lengths).argmax())[0]
  else:
    idx = np.array([], dtype=int)
  # collate
  return {
    # headers, sequences
    "h": np.array(headers)[idx], "s": seqs[idx],
    # ref. seq., non_gapped ref. seq.
    "rs": np.array(list(rs)),
    "rs_ng": np.array([i for i, c in enumerate(rs) if c in alphabet])
  }

# data prep

In [None]:
# settings
F_dir = "/home/jupyter-dylan/data/"

In [None]:
# # supplementals
# ds_supp = {
#   1: pd.read_excel(F_dir+"/supplementals/41592_2018_138_MOESM3_ESM.xlsx", None),
#   2: pd.read_excel(F_dir+"/supplementals/41592_2018_138_MOESM4_ESM.xlsx", None),
#   3: pd.read_excel(F_dir+"/supplementals/41592_2018_138_MOESM5_ESM.xlsx", None),
#   4: pd.read_excel(F_dir+"/supplementals/41592_2018_138_MOESM6_ESM.xlsx", None),
#   5: pd.read_excel(F_dir+"/supplementals/41592_2018_138_MOESM7_ESM.xlsx", None),
#   6: pd.read_excel(F_dir+"/supplementals/41592_2018_138_MOESM8_ESM.xlsx", None),
#   7: pd.read_csv(F_dir+"/supplementals/41592_2018_138_MOESM9_ESM.csv"),
#   8: pd.read_excel(F_dir+"/supplementals/41592_2018_138_MOESM10_ESM.xlsx", None),
#   9: pd.read_excel(F_dir+"/supplementals/41592_2018_138_MOESM11_ESM.xlsx", None)
# }
# np.save(F_dir+"supplementals/ds_supp", ds_supp)

In [None]:
ds_supp = np.load(F_dir+"DeepSeq/supplementals/ds_supp.npy", allow_pickle=True).item()

In [None]:
[print(b["mutant"].str.contains(":").sum(), "\t", len(b), "\t", a) for a, b in ds_supp[2].items()];

In [None]:
[x for x in list(ds_supp[2].keys()) if "BLAT" in x]

In [None]:
mut_data = ds_supp[2]["BLAT_ECOLX_Tenaillon2013"]

In [None]:
# check
mut_data.head()

In [None]:
# check
mut_data.tail()

In [None]:
mut_data.describe()

In [None]:
mut_data[mut_data["mutant"].str.contains("M184")]

In [None]:
# mutants
muts = mut_data["mutant"]
# drop rows whose mutant label mentions the wild type
muts = muts[~muts.str.contains("wt")]
print("N-wise\t#")
# number of mutants per order (":" separates the substitutions of one mutant);
# the trailing ";" keeps the notebook from echoing the list of None
[print(str(a+1)+"\t"+str(b)) for a, b in zip(*np.unique(muts.str.count(":").values, return_counts=True))];
# index, amino acid (wild-type letter per position, from single mutants)
i_a = {int(x[1:]): x[0] for x in muts[~muts.str.contains(":")].apply(lambda x: x[:-1]).values.astype("str")}
# NOTE(review): positions from multi-site mutants are shifted by -1 while
# positions from single mutants above are used as-is -- confirm this offset
# is intended for this dataset
if muts.str.contains(":").sum() > 0:
  i_a.update({int(x[1:-1])-1: x[0] for x in ":".join(muts[muts.str.contains(":")].values).split(":")})
# missing indices (from 0 up to the largest position) are shown as gaps
i_a.update({i: "-" for i in np.setdiff1d(np.arange(np.max(list(i_a.keys()))+1), list(i_a.keys()))})
# sort
i_a = {i: i_a[i] for i in np.sort(list(i_a.keys()))}
# DMS sequence
print("\nDMS seq:\n"+"".join(list(i_a.values())))

**only structured proteins**

In [None]:
# DeepSeq Fig 3
ds_supp[3]["S3_all_predictions_to_statitics"]\
  [ds_supp[3]["S3_all_predictions_to_statitics"]["present_fig3"] == "Yes"]\
    [["dataset_name", "publication_name", "experiment_name"]]#.tail(10)

Unnamed: 0,dataset_name,publication_name,experiment_name
1,AMIE_PSEAE,Aliphatic amide hydrolase: Whitehead 2017,isobutyramide_normalized_fitness
4,B3VI55_LIPSTSTABLE,Levoglucosan kinase: Whitehead 2015,SelectionTwo
5,BF520_ENV,HIV env protein (BF520): Bloom 2018,fitness
6,BG_STRSQ,beta-glucosidase: Abate 2015,enrichment
7,BG505_ENV,HIV env protein (BG505): Bloom 2018,fitness
14,BLAT_ECOLX_Ranganathan2015,beta-lactamase: Ranganathan 2015,2500
30,BRCA_HUMAN_BRCT,BRCA (BRCT domain): Shendure 2018,function_score
31,BRCA_HUMAN_RING,BRCA (Ring domain): Shendure 2018,function_score
32,CALM1_HUMAN,Calmodulin-1: Roth 2017,screenscore
33,DLG,DLG: Ranganathan 2012,CRIPT


In [None]:
# DeepSeq dataset name(s):
  # alignment file name
  # Pfam
  # (sub)Family
  # PDB
  # chain(s), always a tuple (note the trailing comma for a single chain)
  # Paperpile link
DS_info = {
  "BLAT_ECOLX_Ranganathan2015":  ["BLAT_ECOLX_1_b0.5.i90c80", "PF13354", "BLAT_ECOLX", "1ERO", ("A",),
    "https://paperpile.com/app/p/62172c11-f4a9-0343-82e7-1b97710eb5fc"],
  "MTH3_HAEAESTABILIZED_Tawfik2015": ["MTH3_HAEAESTABILIZED_1_b0.5.i90c80", "PF00145", "MTH3_HAEAE", "1DCT", ("A",),
    "https://paperpile.com/app/p/abda7505-edad-0aee-88b4-6a2643668220"],
  "PABP_YEAST_Fields2013":  ["PABP_YEAST_1_b0.5.i90c80", "PF00076", "PABP_YEAST", "6R5K", ("D",),
    "https://paperpile.com/app/p/201e25f8-a2b5-082a-88bc-0fe1d3a42fde"],
  "BG_STRSQ_hmmerbit": ["BG_STRSQ_1_b0.5.i90c80", "PF00232", "Q59976_STRSQ", "1GNX", ("A",),
    "https://paperpile.com/app/p/013946b3-c72b-0f72-9079-a806f0e89bc4"],
  "KKA2_KLEPN_Mikkelsen2014": ["KKA2_KLEPN_1_b0.3.i90c80", "PF01636", "KKA2_KLEPN", "1ND4", ("A",),
    "https://paperpile.com/app/p/12f370fc-1c06-02cb-9abe-096b65911ea1"],
  "YAP1_HUMAN_Fields2012": ["YAP1_HUMAN_1_b0.5.i90c80", "PF00397", "YAP1_HUMAN", "1K9Q", ("A",),
    "https://paperpile.com/app/p/e4d9eb89-b4d1-055a-bc36-3c411c6cb26e"],
  "parEparD_Laub2015_all": ["parEparD_3.i90c80", "PF05016,PF03693", "F7YBW7_MESOW,F7YBW8_MESOW", "5CEG", ("B", "A"),
    "https://paperpile.com/app/p/a9c1859f-03db-0c92-a069-b801de40d2db"],
  "AMIE_PSEAE_Whitehead": ["AMIE_PSEAE_1_b0.3.i90c80", "PF00795", "AMIE_PSEAE", "2UXY", ("A",),
    "https://paperpile.com/app/p/095f9ea7-9cdc-0048-bf1e-e873b2d40851"],
  "DLG4_RAT_Ranganathan2012": ["DLG4_RAT_2_b0.45.i90c80", "PF00595", "DLG4_RAT", "1TP3", ("A",),
    "https://paperpile.com/app/p/92f14073-00c8-0c2a-9639-11b9bad5de82"]
}
# # TODO! indexing issues
# todo_DS_info = {
#   "HSP82_YEAST_Bolon2016": ["HSP82_YEAST_1_b0.5.i90c80", "PF02518", "HATPase_c", "2CG9", 
#     "https://paperpile.com/app/p/889ff43d-64a8-090d-947a-5f42c90ce416"],
#   "PA_FLU_Sun2015": ["PA_FLU_1_b0.5.i90c80", "PF00603", "PA_I34A1", "2ZNL", 
#     "https://paperpile.com/app/p/f258e0fb-3326-0a94-b871-682ae7a64721"],
#   "YAP1_HUMAN_Fields2012-singles": ["YAP1_HUMAN_1_b0.5.i90c80", "PF00397", "YAP1_HUMAN", "1K9Q", 
#     "https://paperpile.com/app/p/e4d9eb89-b4d1-055a-bc36-3c411c6cb26e"],
# }
# # not as important
# todo_later = {
#   "GAL4_YEAST_Shendure2015": ["GAL4_YEAST_1_b0.6.i90c80", "PF00172", "GAL4_YEAST", "1D66", ("A"),
#     "https://paperpile.com/app/p/d92b65b9-5d7f-0147-9b3e-32ba377e5286"],
#   "HG_FLU_Bloom2016": ["HG_FLU_1_b0.5.i90c80", "PF00509", "HEMA_I34A1", "1RVX", ("A"),
#     "https://paperpile.com/app/p/41dcf6d4-1d32-065a-be45-b22d99ac1724"],
#   "HIS7_YEAST_Kondrashov2017":  ["HIS7_YEAST_1_b0.5.i90c80", "PF00475", "HIS7_YEAST", "6EZM", ("A"),
#     "https://paperpile.com/app/p/38397f51-0b78-020b-8dcf-dd674dd09899"],
#   "UBE4B_MOUSE_Klevit2013": ["UBE4B_MOUSE_1_b0.45.i90c80", "PF04564", "UBE4B_MOUSE", "2KR4", ("A"),
#     "https://paperpile.com/app/p/3fa26562-2ea9-0e55-8bb5-90391edea684"],
# }

**structure**

In [None]:
for ds, PDB in DS_info.items():
  PDB = PDB[3]
  PDB_in = F_dir+"PDBs/"+PDB+".pdb"
  PDB_out = F_dir+"cleaned/"+PDB+".con.txt"
  if not os.path.isfile(PDB_in) and not os.path.isfile(PDB_out):
    print(ds, PDB)
    # get
    !wget -q -nc "https://files.rcsb.org/view/"$PDB".pdb" -O $PDB_in
    # x-tal to contacts
    !./tools/confind --p $PDB_in --rLib ./rotlibs | grep "contact" > $PDB_out

# MSA

**generate**

In [None]:
print("\n".join(list(DS_info.keys())))

BLAT_ECOLX_Ranganathan2015
MTH3_HAEAESTABILIZED_Tawfik2015
PABP_YEAST_Fields2013
BG_STRSQ_hmmerbit
KKA2_KLEPN_Mikkelsen2014
YAP1_HUMAN_Fields2012
parEparD_Laub2015_all
AMIE_PSEAE_Whitehead
DLG4_RAT_Ranganathan2012


In [None]:
# choose
ds = "DLG4_RAT_Ranganathan2012"
print(DS_info[ds])
# alignment file name (first DS_info field)
af = DS_info[ds][0]
# every supplementary table whose name contains the dataset name, stacked row-wise
dms = pd.concat([y for x, y in ds_supp[2].items() if ds in x], axis=0, sort=False)
# contact listing written by the structure step for this dataset's PDB id
pdb = F_dir+"cleaned/"+DS_info[ds][3]+".con.txt"
# residue depth table, named after the lower-cased PDB id
depth = F_dir+"DEPTH/pdb"+DS_info[ds][3].lower()+".ent-residue.depth"
# chain identifier(s) of the structure
chains = DS_info[ds][4]
# base name of the collated output file
out = ds+"-"+af

['DLG4_RAT_2_b0.45.i90c80', 'PF00595', 'DLG4_RAT', '1TP3', 'A', 'https://paperpile.com/app/p/92f14073-00c8-0c2a-9639-11b9bad5de82']


In [None]:
# alignment file
paf = parse_af(af.join([F_dir+"cleaned/", ".a2m"]), aa_babel["alphabet"])

In [None]:
# multiple sequence alignment
DATA = make_msa(paf["s"], aa_babel["a2i"])
DATA.update({x: y for x, y in paf.items() if "rs" in x})

In [None]:
# DMS data
DATA.update(**ds_dms(DATA["rs"], DATA["rs_ng"], DATA["X_ng"], dms, aa_babel["a2i"], 5))

In [None]:
# structure
DATA.update(**con_mtx(pdb, chains, aa_babel["tri2a"], DATA["rs"], DATA["rs_ng"], DATA["X_ng"], 5))

In [None]:
# amino acid depth
DATA.update(**get_depth(depth, chains, DATA["con2X_ng"], aa_babel["tri2a"]))

In [None]:
# check
for k, v in DATA.items(): print(k, v.shape)

X_ng (82,)
X (13568, 82, 21)
W (13568,)
rs (101,)
rs_ng (82,)
dX2X_ng (82, 2)
dX (1659, 82, 21)
dY (1659, 13)
con2X_ng (82, 2)
con (82, 82)
X_ng2depth (82, 2)


In [None]:
###### save data
print(f"saving {out}.npy")
np.save(F_dir+"collated/"+out, DATA)

saving DLG4_RAT_Ranganathan2012-DLG4_RAT_2_b0.45.i90c80.npy


# variants

Beta Lac w/ metagenomes

In [None]:
# # alignment file
# af = "/home/jupyter-dylan/data/BLAT_ECOLX_PF13354/BLAT_ECOLX_1.meta.i90c75.fas"
# paf = parse_af(af, aa_babel["alphabet"])
# out = ds+"-BLAT_ECOLX_1.meta.i90c75"

Beta Lac M184T

In [None]:
# dms = pd.concat([dms, pd.DataFrame({
#   x: y.apply(lambda z: "M180T:"+z).values if x == "mutant" else np.tile(np.nan, len(y))
#     for x, y in dms.items()
# })], 0, sort=False, ignore_index=True)
# out = ds+"-M180T-"+af

Chorismate Mutase

In [None]:
# # alignment file
#   # full sequence
# up_pf = "CMPDT_ECOLI_PF01817"
#   # only chorismate mutase domain
# # up_pf = "CMPDT_ECOLI_PF01817"
# pdb = F_dir+"cleaned/1ECM.con.txt"
# out = up_pf
# af = "/home/jupyter-dylan/data/"+up_pf.split(".")[0]+"/"+up_pf+".i90c80.a3m"
# paf = parse_af(af, aa_babel["alphabet"])
# # multiple sequence alignment
# DATA = make_msa(paf["s"], aa_babel["a2i"])
# DATA.update({x: y for x, y in paf.items() if "rs" in x})
# # structure
# DATA.update(**con_mtx(pdb, ("A", "B"), aa_babel["tri2a"], DATA["rs"], DATA["rs_ng"], DATA["X_ng"], 5))

parEparD_PF05016_PF03693

In [None]:
# # alignment file
# af = "/home/jupyter-dylan/data/parEparD_PF05016_PF03693/parEparD_PF05016_PF03693.a3m"
# paf = parse_af(af, aa_babel["alphabet"])
# out = ds+"-parEparD_PF05016_PF03693"

In [None]:
# DATA["rs_ng"] = np.arange(len(DATA["rs"]))
# DATA["X_ng"] = np.arange(len(DATA["rs"]))

PABP_RRM2

In [None]:
# af = "PABP_RRM2_PF00076.i90c80"
# paf = parse_af(af.join([F_dir+"cleaned/", ".a3m"]), aa_babel["alphabet"])
# out = ds+"-"+af

YAP1 from MaveDB

In [None]:
# ################################################################################
# def _YAP1_muts(x, tri2a=aa_babel["tri2a"]):
#   tri_muts = x.lstrip("p.").lstrip("\[").rstrip("\]").upper().split(";")
#   muts = []
#   for x in tri_muts:
#     wt, i, mut = x[:3], x[3:-3], x[-3:]
#     if wt not in tri2a.keys() or mut not in tri2a.keys(): return "error"
#     muts.append(tri2a[wt]+i+tri2a[mut])
#   return ":".join(muts)
# ################################################################################
# YAP1_MaveDB_in = pd.read_csv(F_dir+"/WW_domain_P46937/urn_mavedb_00000002-a-2_scores.csv", header=4)
# YAP1_MaveDB_in = YAP1_MaveDB_in[YAP1_MaveDB_in["hgvs_pro"].str.count("p.") > 0]
# YAP1_MaveDB_in["hgvs_pro"] = YAP1_MaveDB_in["hgvs_pro"].apply(lambda x: _YAP1_muts(x))
# YAP1_MaveDB_in = YAP1_MaveDB_in[~YAP1_MaveDB_in["hgvs_pro"].str.contains("error")]
# dms_MaveDB = pd.DataFrame({
#   "mutant": YAP1_MaveDB_in["hgvs_pro"].values,
#   **{x: YAP1_MaveDB_in[x].values for x in
#     ["score", "SE", "epsilon", "SE_101208", "score_101208", "SE_110307", "score_110307"]}
# })
# ################################################################################
# _dms = ds_dms(DATA["rs"], DATA["rs_ng"], DATA["X_ng"], dms_MaveDB, aa_babel["a2i"], 5)
# DATA["mDB_dX2X_ng"] = _dms["dX2X_ng"]
# DATA["mDB_dX"] = _dms["dX"]
# DATA["mDB_dY"] = _dms["dY"]
# DATA["mDB_dY"]["mDB"] = np.tile([True], len(DATA["mDB_dY"]))
# ################################################################################

In [None]:
# combine DMS
# NOTE(review): dX, dY, dX2X_ng and msa are not defined in the cells above
# (the MaveDB cell stores its results in DATA and is commented out) --
# confirm where these names come from before running this cell
dX = np.concatenate([dX, msa["mDB_dX"]])
dY = pd.concat([dY, msa["mDB_dY"].copy()], axis=0)
# "index+residue" label -> alignment label, for both DMS sources
mDB_aln = dict(np.concatenate([dX2X_ng, msa["mDB_dX2X_ng"]]))
# look up each substitution by the same "<index><wild type>" key that the
# alignment labels use; the previous membership test used "<wild type><index>",
# which never matched and turned every entry into "error"
dY["mDB_muts"] = dY["muts"].apply(
  lambda x: [mDB_aln.get(y[1:-1]+y[0], "error") for y in x.split(":")]
)