<a href="https://colab.research.google.com/github/dyl4nm4rsh4ll/funsae/blob/master/data_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### imports / settings

In [None]:
# native libs
import itertools, os, pickle, re, sys, time, urllib
from functools import reduce
from io import StringIO
# external libs
from ete3 import NCBITaxa
ncbi = NCBITaxa()
import h5py
import matplotlib
from matplotlib import cm
from matplotlib import colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import signal as sig
from scipy import special
import scipy.stats as stats
from scipy.spatial.distance import jensenshannon, pdist, squareform, hamming
# settings
sns.set_style("ticks")

# MSA generation

**parameters**

In [None]:
# primary: alphabet (20 amino acids + gap) and project directories
_1_params = dict(
  alphabet="ARNDCQEGHILKMFPSTWYV-",
  alignments_dir="data/alignments/",
  collated_dir="data/collated/",
  DMS_dir="data/DMS/",
  fig_dir="figures/",
  pk_data_dir="data/peter_koo/50_synthetic_40/",
  deep_seq_supp="data/deep_seq_supp/",
  predictions="results/predictions/",
  weights_EZ="results/weights/EZ/",
  weights_EZD="results/weights/EZD/",
)
# secondary: lookups derived from the alphabet (letter <-> integer index)
_2_params = dict(
  a2i=dict(zip(_1_params["alphabet"], range(len(_1_params["alphabet"])))),
  i2a=dict(enumerate(_1_params["alphabet"])),
)

**methods**

In [None]:
def collate_dms(dms_data, wrt, considered="v_", dms_info=[
  "mut", "x", "y", "ind", "pw", "v_μ", "v_1", "v_2", "v_3", "v_4", "v_5"
]):
  """stack per-mutant DMS records into arrays, one array per field
    dms_data := {experiment: {mutant: {field: value}}},
    wrt := field that must be finite for a mutant to be kept,
    considered := substring selecting the experiments whose < wrt > is checked,
    dms_info := fields to stack for every experiment
    returns {experiment: {field: stacked array over the kept mutants}}"""

  # drop mutants without an encoding ("x" is None); copy the surviving records
  viable = {}
  for name, mutants in dms_data.items():
    viable[name] = {
      mut: dict(record) for mut, record in mutants.items()
        if record["x"] is not None
    }
  # experiments whose < wrt > value gates a mutant
  gate = [name for name in dms_data if considered in name]

  def _finite_everywhere(mut):
    """True when < wrt > is finite (no inf / nan) in every gating experiment"""
    return all([np.isfinite(viable[name][mut][wrt]) for name in gate])

  collated = {}
  for name in dms_data:
    fields = {}
    for field in dms_info:
      fields[field] = np.stack([
        record[field] for mut, record in viable[name].items()
          if _finite_everywhere(mut)
      ])
    collated[name] = fields
  return collated

In [None]:
def clean_alignment(f_pref, f_suff, ret=False, num=0, aa=_1_params["alphabet"]):
  """clean an alignment file := keep only uppercase characters found in < aa >
    (lowercase letters, periods and every other symbol are dropped, not converted),
    keep the sequences whose cleaned length equals the most common cleaned length,
    and write them to < f_pref + f_suff + "_cleaned" >
    f_pref / f_suff := alignment file prefix / suffix,
    ret := return the kept records for debugging purposes,
    num := initial value of the header counter (kept for backward compatibility;
      it no longer drives any indexing),
    aa := considered amino acids (incl. the gap symbol)
    returns (only if ret) {"head", "seqs", "idx"}; "idx" lists, per kept sequence,
      the positions of the retained characters within the concatenated raw
      lines of that record (line breaks included)"""
  # read: one header plus its raw sequence lines per record
  heads, chunks = [], []
  with open(f_pref + f_suff, "r") as f_i:
    for line in f_i:
      if line.startswith(">"):
        num += 1
        heads.append(line.rstrip())
        chunks.append([])
      elif chunks:
        chunks[-1].append(line)
      # text before the first header belongs to no record and is skipped
  # clean: remember where each retained character came from
  seqs, idx = [], []
  for chunk in chunks:
    kept = [
      (i, x) for i, x in enumerate("".join(chunk))
        if x == x.upper() and x in aa
    ]
    idx.append([i for i, _ in kept])
    seqs.append("".join(x for _, x in kept))
  result = {"head": [], "seqs": [], "idx": []}
  if seqs:
    # dominant length (mode); ties resolve to the smallest length because
    # np.unique sorts ascending and argmax returns the first maximum
    lengths, counts = np.unique([len(s) for s in seqs], return_counts=True)
    exp_length = lengths[np.argmax(counts)]
    for h, s, i in zip(heads, seqs, idx):
      if len(s) == exp_length:
        result["head"].append(h)
        result["seqs"].append(s)
        result["idx"].append(i)
  # write header / sequence pairs, one per line, no trailing newline
  with open(f_pref + f_suff + "_cleaned", "w") as f_o:
    f_o.write("\n".join(
      "\n".join(pair) for pair in zip(result["head"], result["seqs"])
    ))
  # considered indices
  if ret: return result

In [None]:
def get_phyla(head, ncbi=ncbi):
  """MSA header taxonomy ID to phyla
    head := MSA headers; the first one is labelled "SOURCE",
    ncbi := NCBI Taxonomy dictionary (ete3 NCBITaxa)
    returns a list with exactly one label per header, in header order;
      headers without an "OX=" field, or whose lookup fails, map to "other" """

  def _get_phyla(h):
    """map phylum to header (h)"""
    if "OX=" not in h: return "other"
    try:
      taxa = int(h.split("OX=")[1].split(" ")[0])
      lineage = ncbi.get_lineage(taxa)
      ranks = ncbi.get_rank(lineage)
      names = ncbi.get_taxid_translator(lineage)
    # best effort: malformed / unknown taxonomy IDs are labelled "other"
    except Exception: return "other"
    # pair rank and name through the shared taxid instead of dict ordering
    for node, rank in ranks.items():
      if rank == "phylum" and node in names: return names[node]
    return "other"

  if len(head) == 0: return []
  # cache per distinct header, but emit one label per row so that duplicated
  # headers cannot shorten the result relative to the MSA
  seen = {}
  phyla = ["SOURCE"]
  for h in head[1:]:
    if h not in seen: seen[h] = _get_phyla(h)
    phyla.append(seen[h])
  return phyla

In [None]:
def load_pkl(fname):
  """unpickle < fname >; names that do not contain "pkl" are refused (prints a hint, returns None)"""
  if "pkl" not in fname:
    print("check file")
    return None
  # NOTE: unpickling executes arbitrary code; only open files you trust
  with open(fname, "rb") as handle:
    return pickle.load(handle)

In [None]:
def make_msa(seqs, thresh={"gap": 0.5, "eff": 0.8}, a2i=_2_params["a2i"]):
  """converts list of sequences to multiple sequence alignment (MSA)
    seqs := cleaned / filtered MSA sequences (equal length),
    thresh := "gap": keep columns whose gap fraction is below this value /
      "eff": identity cut-off used when counting neighbours for the weights,
    a2i := amino acid to alphabet index
    returns {"raw": one-hot MSA over all columns, "non_gap": kept column indices,
      "clean": one-hot MSA over kept columns, "weights": 1 / neighbour count}"""

  def _check_AA(AA):
    """uppercase a residue; anything outside the alphabet becomes a gap"""
    AA = AA.upper()
    return AA if AA in a2i else "-"

  # raw msa (integer encoded)
  raw = np.array([[a2i[_check_AA(AA)] for AA in seq] for seq in seqs])
  # non-gapped columns wrt gap threshold
  # (builtin float: the np.float alias was removed in NumPy 1.24)
  gap_frac = np.mean((raw == a2i["-"]).astype(float), 0)
  non_gap = np.where(gap_frac < thresh["gap"])[0]
  # pairwise identity over all raw columns; every sequence counts itself
  identity = 1.0 - squareform(pdist(raw, "hamming"))
  one_hot = np.eye(len(a2i))
  # raw msa, non-gapped columns, cleaned msa, sequence weights
  return {
    "raw": one_hot[raw],
    "non_gap": non_gap,
    "clean": one_hot[raw[:, non_gap]],
    "weights": 1.0 / np.sum(1.0 * (identity >= thresh["eff"]), -1)
  }

In [None]:
def make_mut_msa(ref, mut_info, non_gap, a2i=_2_params["a2i"], cols=[
  "x", "y", "ind", "pw", "v_μ", "v_1", "v_2", "v_3", "v_4", "v_5"
]):
  """converts list of mutations to multiple sequence alignment (MSA)
    ref := reference sequence, one-hot encoded over the cleaned MSA columns,
    mut_info := mutation information / measured & predicted (column access
      via < mut_info[col].values >),
    non_gap := raw MSA column index of every cleaned MSA column,
    a2i := amino acid to integer,
    cols := columns of < mut_info >; the first holds the mutant string
      (e.g. "A24C", 1-based position), the rest are cast to float
    returns {mutant: {"mut": mutant, "x": one-hot mutant or None, <col>: float}}"""

  def _ref2mut(_ref, _mut):
    """create one-hot encoded mutant from reference; None when the position is
      not a kept MSA column or the mutation is synonymous"""
    # decompose mutant amino acid context (1-based position -> 0-based column)
    _aa_idx, i, j = int(_mut[1:-1]) - 1, a2i[_mut[0]], a2i[_mut[-1]]
    # valid MSA column
    if _aa_idx not in non_gap: return None
    elif i == j: return None
    else: aa_idx = np.where(non_gap == _aa_idx)[0][0]
    m = _ref.copy()
    # sanity: the reference must carry the wild-type residue at this column
    assert(m[aa_idx, i] == 1), f"{_aa_idx}, {aa_idx}, {i}, {j}"
    assert(m[aa_idx, j] == 0), f"{_aa_idx}, {aa_idx}, {i}, {j}"
    m[aa_idx, i], m[aa_idx, j] = 0, 1
    return m

  # info per mutation; rows are unpacked generically so that any < cols >
  # list works, not only the ten default columns
  info = {}
  for row in zip(*[mut_info[col].values for col in cols]):
    mut, values = row[0], row[1:]
    info[mut] = {
      **{"mut": mut, "x": _ref2mut(ref, mut)},
      **{col: float(val) for col, val in zip(cols[1:], values)}
    }
  return info

In [None]:
def parse_fasta(fname):
  """disentangle FASTA headers and sequences
    fname := filename
    returns {"head": array of headers without the leading ">",
      "seqs": array of sequences, multi-line records joined}"""
  heads, chunks = [], []
  # context manager closes the file even if reading fails
  with open(fname, "r") as lines:
    for line in lines:
      line = line.rstrip()
      if line.startswith(">"):
        heads.append(line[1:])
        chunks.append([])
      elif chunks:
        chunks[-1].append(line)
      # lines before the first header belong to no record and are skipped
  # (headers, sequences)
  return {
    "head": np.array(heads),
    "seqs": np.array(["".join(chunk) for chunk in chunks])
  }

In [None]:
def sequence_identity(u, v):
  """calculate sequence identity for two sequences
    u, v := one-hot encoded sequences of equal length (one row per position)
    returns the fraction of matching residues over positions where neither
      sequence has a gap; nan when no such position exists"""
  assert len(u) == len(v), f"sequence lengths differ: {len(u)} != {len(v)}"
  i2a = _2_params["i2a"]
  # decode one-hot rows back to letters
  U = np.array([i2a[np.argmax(i)] for i in u])
  V = np.array([i2a[np.argmax(i)] for i in v])
  if len(U) == 0: return float("nan")
  # positions that are ungapped in both sequences
  idx = (U != "-") & (V != "-")
  # nothing to compare; return nan explicitly instead of via a mean-of-empty warning
  if not idx.any(): return float("nan")
  return 1 - hamming(U[idx], V[idx])

In [None]:
# Mutant Data Analysis

In [None]:
# supplementary Excel workbook from the deep_seq_supp directory; individual
# sheets are parsed in later cells
_supp_2 = pd.ExcelFile(
  _1_params["deep_seq_supp"] + "supp_2_41592_2018_138_MOESM4_ESM.xlsx"
)

**reference**

###### 0. search for homologous sequences

Run HHblits on the protein family's query sequence to collect homologous sequences:

```bash
/home/jupyter-dylan/HH_SUITE/hh3_4x/bin/hhblits -id 100 -cov 10 -diff 0 -noaddfilter -maxmem 80 -contxt /home/UNICLUST/hh-suite/data/context_data.crf -d /home/UNICLUST/uniclust30_2018_08/uniclust30_2018_08 -n 8 -e 1 -cpu 16 -o /dev/null -i "protein alignment".fasta -oa3m "protein alignment".a3m
```

In [None]:
# 1. context, protein family
# file suffixes appended to < file_pref >, one per pipeline stage
file_suff = {
  "raw": "a3m",
  "clean": "a3m_cleaned",
  "filt": "a3m_cleaned_filt",
  "meta_msa": "pkl",
  "dms": "DMS.pkl"
}
# define alignment file (DMS dir, DMS, DMS file prefix)
file_pref = _1_params["DMS_dir"] + "beta_lactamase/new_BLAT."

In [None]:
# 2. clean alignment file (writes < file_pref + "a3m_cleaned" > next to the raw a3m)
clean_alignment(file_pref, file_suff["raw"])

###### 3. filter alignment file

Run hhfilter with an 80 % coverage threshold:
```bash
/home/jupyter-dylan/HH_SUITE/hh3_4x/bin/hhfilter -cov 80 -M a3m -i "protein alignment".a3m_cleaned -o "protein alignment".a3m_cleaned_filt
```

In [None]:
# 4. headers / sequences from cleaned and filtered alignment file
# (the first record is used as the reference sequence in later cells)
parsed = parse_fasta(file_pref + file_suff["filt"])

In [None]:
# 5. define MSA from parsed sequences
# (dict with "raw", "non_gap", "clean" and "weights"; later cells add more keys)
meta_msa = make_msa(parsed["seqs"])

In [None]:
%%capture
# 6. phylum label for every MSA header, stored alongside the alignment
meta_msa["phyla"] = np.array(get_phyla(parsed["head"]))

In [None]:
# 7. identity of every cleaned MSA row to the reference (row 0)
meta_msa["seq_id"] = np.array([
  sequence_identity(meta_msa["clean"][0], row) for row in meta_msa["clean"]
])

In [None]:
# histogram of sequence identity to the reference, log-scaled counts
F, A = plt.subplots(figsize=(12, 4))
A.hist(meta_msa["seq_id"], log=True, bins=128)
A.set_title("my new msa seq. id %")
# ticks every 5 % across the full [0, 1] range
A.set_xticks(np.linspace(0, 1, num=21))
A.set_xlim([0, 1])
plt.tight_layout()
plt.show()

In [None]:
# shape of every array stored in meta_msa
[(a, b.shape) for a, b in meta_msa.items()]

In [None]:
# same sequence-identity histogram as the cell above (repeated cell)
F, A = plt.subplots(figsize=(12, 4))
A.hist(meta_msa["seq_id"], log=True, bins=128)
A.set_title("my new msa seq. id %")
# ticks every 5 % across the full [0, 1] range
A.set_xticks(np.linspace(0, 1, num=21))
A.set_xlim([0, 1])
plt.tight_layout()
plt.show()

In [None]:
# shape of every array stored in meta_msa (repeated cell)
[(a, b.shape) for a, b in meta_msa.items()]

In [None]:
# 8. save
# NOTE: the handle name `f` stays bound (closed) at notebook scope after this cell
with open(file_pref + file_suff["meta_msa"], "wb") as f:
  pickle.dump(meta_msa, f, protocol=pickle.HIGHEST_PROTOCOL)

*Beta-lactamase, E. coli*

In [None]:
# afterwards: reload the pickled MSA written by the save cell
meta_msa = load_pkl(file_pref + file_suff["meta_msa"])

In [None]:
# evaluate
# parse every workbook sheet whose name starts with "BLAT" into a DataFrame
supp_2_blat = {
  x: _supp_2.parse(x)
    for x in _supp_2.sheet_names if x.startswith("BLAT")
}

In [None]:
# check 
# short tag -> sheet column, for the Ranganathan2015 sheet:
# meas_* are read from the "2500*" columns, ind / pw / DS_* from the
# "mutation_effect_prediction_*" columns
check_blat_dms = pd.DataFrame({
  tag: supp_2_blat["BLAT_ECOLX_Ranganathan2015"][col].values
    for tag, col in {
      "meas_1": "2500_1",
      "meas_2": "2500_2",
      "meas_μ": "2500",
      "ind": "mutation_effect_prediction_independent",
      "pw": "mutation_effect_prediction_pairwise",
      "DS_1": "mutation_effect_prediction_vae_1",
      "DS_2": "mutation_effect_prediction_vae_2",
      "DS_3": "mutation_effect_prediction_vae_3",
      "DS_4": "mutation_effect_prediction_vae_4",
      "DS_5": "mutation_effect_prediction_vae_5",
      "DS_μ": "mutation_effect_prediction_vae_ensemble"
    }.items()
})
print(f"DeepSeq for Ranganathan2015:\n  {check_blat_dms.shape}\n")
# Spearman correlation between all columns, rows with any NaN removed first
check_blat_dms.dropna().corr("spearman").round(4)

# collate other DMS data

In [None]:
# visualize correlations
# pairwise scatter of the mean measurement against the three predictors;
# rows with any NaN are dropped, diagonals show KDEs
sns.pairplot(
  check_blat_dms[["meas_μ", "ind", "pw", "DS_μ"]].dropna(),
  diag_kind="kde", plot_kws={"s": 4, "alpha": 0.7}
);

In [None]:
# dms effects dataframes: one frame per (sheet, measured-effect column) pair;
# keys are the text after the last "_" of the tag, the sheet name is the tag
# up to the first "."
blat_dms_effects_dfs = {
  dms.split("_")[-1]: pd.DataFrame({
    "x": sheet["mutant"],
    "y": sheet[effect],
    "ind": sheet["mutation_effect_prediction_independent"],
    "pw": sheet["mutation_effect_prediction_pairwise"],
    "v_μ": sheet["mutation_effect_prediction_vae_ensemble"],
    **{
      "v_" + str(i): sheet["mutation_effect_prediction_vae_" + str(i)]
        for i in range(1, 6)
    }
  })
  for dms, effect in {
    "BLAT_ECOLX_Ranganathan2015.1": "2500_1",
    "BLAT_ECOLX_Ranganathan2015.2": "2500_2",
    "BLAT_ECOLX_Ranganathan2015.μ": "2500",
    "BLAT_ECOLX_Palzkill2012": "ddG_stat",
    "BLAT_ECOLX_Tenaillon2013": "MIC_score",
    "BLAT_ECOLX_Ostermeier2014": "linear"
  }.items()
  # bind the sheet once per entry instead of repeating the lookup
  for sheet in [supp_2_blat[dms.split(".")[0]]]
}

In [None]:
# check viability: compare the positions covered by each DMS sheet with the
# MSA columns that survived gap filtering
ref_seq = parsed["seqs"][0]
print("ref_seq:\n" + ref_seq + "\n\n")
# non_gap holds 0-based raw MSA columns whereas DMS mutants use 1-based
# positions (make_mut_msa subtracts 1), so shift before comparing
msa_pos = meta_msa["non_gap"] + 1
for x, y in supp_2_blat.items():
  # consecutive mutants at the same site collapse to one "<wild type><position>"
  z = [c for c, C in itertools.groupby([i[:-1] for i in y["mutant"].values])]
  z_aa = "".join([c[0] for c in z])
  dms_pos = np.array([int(c[1:]) for c in z])
  z_idx_diff = ",".join(np.setdiff1d(dms_pos, msa_pos).astype("str"))
  z_idx_diff_inv = ",".join(np.setdiff1d(msa_pos, dms_pos).astype("str"))
  print(x + "\n" + "\n".join([
    z_aa,
    "in DMS not in MSA ---> " + z_idx_diff,
    "in MSA not in DMS ---> " + z_idx_diff_inv
  ]) + "\n")

In [None]:
# create DMS MSA dictionary
# one entry per effects dataframe; every mutant is encoded against the
# reference row of the cleaned MSA
blat_dms_msa = {
  dms: make_mut_msa(
    ref=meta_msa["clean"][0],
    mut_info=mut_info,
    non_gap=meta_msa["non_gap"]
  ) for dms, mut_info in blat_dms_effects_dfs.items()
}

In [None]:
# save
# NOTE: the handle name `f` stays bound (closed) at notebook scope after this cell
with open(file_pref + file_suff["dms"], "wb") as f:
  pickle.dump(blat_dms_msa, f, protocol=pickle.HIGHEST_PROTOCOL)

# etc

In [None]:
# substitution scores as raw text: "aa" lists the 24 symbols, "log_odds"
# holds 24 x 24 whitespace-separated integers
# (presumably BLOSUM62, judging by the variable name; verify against the source table)
b62_raw = {
  "aa": np.array([
    "A", "R", "N", "D", "C", "Q", "E", "G",
    "H", "I", "L", "K", "M", "F", "P", "S",
    "T", "W", "Y", "V", "B", "Z", "X", "-"
  ]),
  "log_odds": """
    4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4 
    -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4 
    -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4 
    -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4 
    0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 
    -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4 
    -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4 
    0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4 
    -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4 
    -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4 
    -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4 
    -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4 
    -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4 
    -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4 
    -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4 
    1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4 
    0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4 
    -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4 
    -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4 
    0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4 
    -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4 
    -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4 
    0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4 
    -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
  """
}
# 24 x 24 score table restricted to the symbols of the working alphabet:
# columns are selected by letter, rows by position (the row index stays integer)
b62 = pd.DataFrame(dict(zip(
  b62_raw["aa"],
  np.array(b62_raw["log_odds"].split()).reshape((24, 24))
)))[
  [c for c in b62_raw["aa"] if c in _1_params["alphabet"]]
].iloc[
  [i for i, c in enumerate(b62_raw["aa"]) if c in _1_params["alphabet"]]
].astype("int")

### DeepSeq β-Lactamase

In [None]:
# path to the BLAT_ECOLX alignment (.fas) inside the beta_lactamase DMS directory
ds = _1_params["DMS_dir"] + "beta_lactamase/BLAT_ECOLX_hmmerbit_plmc_n5_m30_f50_t0.2_r24-286_id100_b105.fas"

In [None]:
# display the alignment path
ds

In [None]:
!head -n 10 data/DMS/beta_lactamase/BLAT_ECOLX_hmmerbit_plmc_n5_m30_f50_t0.2_r24-286_id100_b105.fas

In [None]:
def parse_ds(fname):
  """disentangle FASTA headers and sequences, uppercasing every sequence line
    fname := filename
    returns {"head": array of headers without the leading ">",
      "seqs": array of uppercased sequences, multi-line records joined}"""
  heads, chunks = [], []
  # context manager closes the file even if reading fails
  with open(fname, "r") as lines:
    for line in lines:
      line = line.rstrip()
      if line.startswith(">"):
        heads.append(line[1:])
        chunks.append([])
      elif chunks:
        chunks[-1].append(line.upper())
      # lines before the first header belong to no record and are skipped
  # (headers, sequences)
  return {
    "head": np.array(heads),
    "seqs": np.array(["".join(chunk) for chunk in chunks])
  }

In [None]:
def make_msa_ds(seqs, thresholds={"gap": 0.5, "eff": 0.8}, a2i=_2_params["a2i"]):
  """converts list of sequences to multiple sequence alignment (MSA)
    seqs := aligned sequences (equal length); characters outside < a2i >
      (including lowercase letters) are encoded as gaps,
    thresholds := "gap": keep columns whose gap fraction is below this value /
      "eff": identity cut-off used when counting neighbours for the weights,
    a2i := amino acid to alphabet index
    returns {"raw": one-hot MSA over all columns, "non_gap": kept column indices,
      "clean": one-hot MSA over kept columns, "weights": 1 / neighbour count}"""
  gap = a2i["-"]
  # raw msa (integer encoded)
  raw = np.array([[a2i.get(AA, gap) for AA in seq] for seq in seqs])
  # non-gapped columns wrt gap threshold
  # (builtin float: the np.float alias was removed in NumPy 1.24)
  gap_frac = np.mean((raw == gap).astype(float), 0)
  non_gap = np.where(gap_frac < thresholds["gap"])[0]
  # effective weights: inverse count of sequences at least < eff > identical
  # over all raw columns (every sequence counts itself)
  identity = 1.0 - squareform(pdist(raw, "hamming"))
  weights = 1.0 / np.sum((identity >= thresholds["eff"]).astype(float), -1)
  one_hot = np.eye(len(a2i))
  # raw msa, non-gapped columns, cleaned msa, sequence weights
  return {
    "raw": one_hot[raw],
    "non_gap": non_gap,
    "clean": one_hot[raw[:, non_gap]],
    "weights": weights
  }

In [None]:
# parse the alignment at < ds >; `f` is only the closed file handle left over
# from the pickle-saving cells and is not a path
parsed_ds = parse_ds(ds)

In [None]:
# one-hot MSA, kept columns and sequence weights for the parsed alignment
meta_msa_ds = make_msa_ds(parsed_ds["seqs"])

In [None]:
# NOTE(review): `b` is first assigned in a later cell (make_msa on the new a3m),
# so this display only works when cells are run out of order
b


In [None]:
# identity of every cleaned alignment row to the reference (row 0)
meta_msa_ds["seq_id"] = np.array([
  sequence_identity(meta_msa_ds["clean"][0], row) for row in meta_msa_ds["clean"]
])

In [None]:
# display the per-sequence identities
meta_msa_ds["seq_id"]

### New β-Lactamase a3m

In [None]:
# path to the new a3m alignment inside the beta_lactamase DMS directory
new = _1_params["DMS_dir"] + "beta_lactamase/new_beta_lactamase_P62593.a3m"
print(new)

In [None]:
!head -n 5 data/DMS/beta_lactamase/new_beta_lactamase_P62593.a3m

In [None]:
# headers / sequences of the raw a3m (not cleaned, so case is left as-is)
a = parse_fasta(new)

In [None]:
# display the parsed sequences
a["seqs"]

In [None]:
# NOTE(review): make_msa is redefined in the next cell; which version runs here
# depends on cell execution order
b = make_msa(a["seqs"])

In [None]:
def make_msa(seqs, thresholds={"gap": 0.5, "eff": 0.8}, a2i=_2_params["a2i"]):
  """converts list of sequences to multiple sequence alignment (MSA)
    seqs := cleaned / filtered MSA sequences (equal length); characters outside
      < a2i > (including lowercase letters) are encoded as gaps,
    thresholds := ignore MSA columns wrt gap / effective seq. weight wrt eff,
    a2i := amino acid to alphabet index
    returns {"raw": one-hot MSA over all columns, "non_gap": kept column indices,
      "clean": one-hot MSA over kept columns, "weights": 1 / neighbour count}"""

  gap = a2i["-"]
  # raw msa (integer encoded)
  raw = np.array([[a2i.get(AA, gap) for AA in seq] for seq in seqs])
  # non-gapped columns wrt gap threshold
  # (builtin float: the np.float alias was removed in NumPy 1.24)
  gap_frac = np.mean((raw == gap).astype(float), 0)
  non_gap = np.where(gap_frac < thresholds["gap"])[0]
  # effective weights: inverse count of sequences at least < eff > identical
  # over all raw columns (every sequence counts itself)
  identity = 1.0 - squareform(pdist(raw, "hamming"))
  weights = 1.0 / np.sum((identity >= thresholds["eff"]).astype(float), -1)
  one_hot = np.eye(len(a2i))
  # raw msa, non-gapped columns, cleaned msa, sequence weights
  return {
    "raw": one_hot[raw],
    "non_gap": non_gap,
    "clean": one_hot[raw[:, non_gap]],
    "weights": weights
  }

In [None]:
# NOTE(review): `b` is the dict returned by make_msa, so this tallies the
# lengths of its key names; presumably a["seqs"] was intended — confirm
np.unique(np.array([len(x) for x in b]), return_counts=True)

**--------------------------------------------------------------------------------------------------------------**
# *_DEPRECATED_*
**--------------------------------------------------------------------------------------------------------------**

## methods

In [None]:
# def scrape_ids(x, head, ncbi=ncbi, batch_size=2000, head_split="/", head_split_idx=0):
#   """taxonomy ID from UniProt ID, DeepSequence MSA headers
#     x := MSA dataset name
#     head := MSA dataset
#     ncbi := NCBI Taxonomy dictionary
#     batch_size := number headers scraped from UniProt"""

#   def _get_phyla(taxa):
#     """map phylum"""
#     try: return dict(zip(
#       ncbi.get_rank(ncbi.get_lineage(int(taxa))).values(),
#       ncbi.get_taxid_translator(ncbi.get_lineage(int(taxa))).values()
#     ))["phylum"]
#     except: return "other"

#   def _scrape(_uniprot_id):
#     """scrape info from UniProt"""
#     try:
#       scrape = str(urllib.request.urlopen(
#         urllib.request.Request(
#           "https://www.uniprot.org/uploadlists/",
#           urllib.parse.urlencode({
#             "from": "NF100", "to": "NF100",
#             "columns": "id,commontaxonid", "format": "tab",
#             "query": " ".join(_uniprot_id),
#           }).encode("utf-8")
#       )).read().decode("utf-8"))
#       # batch validity
#       if len(scrape) > 0: return pd.read_csv(StringIO(scrape), sep="\t")
#       else: print("invalid batch"); return None
#     # UniProt servers too weak
#     except:
#       print("bounced, waiting..."); time.sleep(4)
#       return _scrape(_uniprot_id)

#   # initialize
#   head = [x.split(head_split)[head_split_idx] for x in head]
#   phyla = {head[0]: "SOURCE"}
#   uniprot_id = np.unique([head[1:]])
#   to_scrape = np.setdiff1d(uniprot_id, list(phyla.keys()))
#   print("scraping:", x)
#   # collate
#   while len(to_scrape) > 0:
#     # UniRef100 ID
#     batch = np.random.permutation(to_scrape)[:batch_size]
#     # scrape info from UniRef100 ID
#     info = _scrape(batch)
#     if info is None: continue
#     info = info[info["Common taxon ID"].notnull()]
#     # map UniRef100 to phylum
#     phyla.update({
#       cluster: _get_phyla(taxa) for cluster, taxa in dict(zip(
#         info["Cluster ID"].values,
#         info["Common taxon ID"].values
#       )).items()
#     })
#     # update for all headers
#     phyla.update({x: "other" for x in np.setdiff1d(
#       batch, list(phyla.keys())
#     )})
#     # update remaining headers to be scraped
#     to_scrape = np.setdiff1d(uniprot_id, list(phyla.keys()))
#   print("  unique head, phyla:", len(np.unique(head)), len(phyla))
#   return phyla

*clean / filter data*

In [None]:
# %%time
# # clean
# cleaned_a2ms = {
#   x[:-4]: clean_a2m(_1_params["alignments_dir"] + x)
#     for x in os.listdir(_1_params["alignments_dir"])
#       if x.endswith(".a2m")
# }

In [None]:
# %%bash
# # HHfilter
# dir="data/alignments/"
# export HHLIB=hhsuite-2.0.16-linux-x86_64
# for i in $(ls "$dir"); do
#   if [[ $i == *".a2m_cleaned"* ]]; then
#     hhsuite-2.0.16-linux-x86_64/bin/hhfilter -i "$dir"$i -id 99 -o "$dir"$i"_filt"
#   fi
# done

*create MSAs and MSA labels*

In [None]:
# %%time
# # hhfiltered alignments
# parsed_fastas = {
#   y[1:]: {
#     x.split(y)[0]: parse_fasta(_1_params["alignments_dir"] + x)
#       for x in os.listdir(_1_params["alignments_dir"]) if x.endswith(y)
#   } for y in [".a2m", ".a2m_cleaned", ".a2m_cleaned_filt"]
# }

In [None]:
# %%time
# # define MSA
# meta_msa = {
#   x: make_msa(parsed_fastas["a2m_cleaned_filt"][x]["seqs"])
#     for x in parsed_fastas["a2m_cleaned_filt"].keys()
# }

In [None]:
# %%time
# # scrape taxa
# meta_taxa = {
#   x: scrape_ids(x, parsed_fastas["a2m_cleaned_filt"][x]["head"])
#     for x in parsed_fastas["a2m_cleaned_filt"].keys()
# }

In [None]:
# %%time
# # map MSA headers to taxa labels
# meta_labels = {
#   x: {
#     head: meta_taxa[x][head.split("/")[0]]
#       for head in parsed_fastas["a2m_cleaned_filt"][x]["head"]
#   } for x in parsed_fastas["a2m_cleaned_filt"].keys()
# }

In [None]:
# %%time
# # map MSA headers with sequence indices remaining from original
# meta_orig_idx = {
#   x: {
#     head: cleaned_a2ms[x][head]
#       for head in parsed_fastas["a2m_cleaned_filt"][x]["head"]
#   } for x in parsed_fastas["a2m_cleaned_filt"].keys()
# }

In [None]:
# %%time
# # update meta_msa with labels
# for x, y in meta_labels.items(): meta_msa[x].update({"labels": y})
# # update meta_msa with original sequence indices
# for x, y in meta_orig_idx.items(): meta_msa[x].update({"idx": y})

In [None]:
# %%time
# # save MSAs
# for x in meta_msa.keys():
#   fname = _1_params["collated_dir"] + x + ".pkl"
#   print("pickling", fname)
#   with open(fname, "wb") as f:
#     pickle.dump(meta_msa[x], f, protocol=pickle.HIGHEST_PROTOCOL)

*mutant DMS*

In [None]:
# supp_2 = {
#   x: _supp_2.parse(x)#.dropna()
#     for x in _supp_2.sheet_names# if x.startswith("BLAT")
# }

In [None]:
# pkl = "BLAT_ECOLX_1_b0.5.pkl"
# msa = load_pkl(_1_params["collated_dir"] + pkl)
# ref = {
#   "head": list(msa["labels"].keys())[0],
#   "seq": msa["msa_clean"][0]
# }
# # valid columns indices
# assert all(
#   np.average(list(msa["idx"].values()), 0).astype("int") ==\
#   msa["idx"][ref["head"]]
# )
# # valid indices accounting for offset
# v_idx = np.array(msa["idx"][ref["head"]]) + int(ref["head"].split("/")[1].split("-")[0])
# # mutations, mutation effect
# MUT = {
#   "muts": supp_2["BLAT_ECOLX_Ranganathan2015"]["mutant"].values,
#   "vals": supp_2["BLAT_ECOLX_Ranganathan2015"]["2500"].values
# }