In [1]:
# Some constants that will be required
# ALWAYS RUN THIS CODE CELL
import os
from glob import glob

# Root of the scPDB data, resolved to an absolute path relative to the
# notebook's working directory; every directory below hangs off it.
data_dir = os.path.abspath("../data/scPDB")
# One sub-directory per scPDB entry (holds downloaded.pdb, sequence.fasta, ...)
raw_dir = os.path.join(data_dir, "raw")
# "<entry>/<chain>.pssm" files read by get_pssm
pssm_dir = os.path.join(data_dir, "pssm")
# Not referenced in the visible cells; presumably the cross-validation splits - TODO confirm
splits_dir = os.path.join(data_dir, "splits")
# Per-chain .npz files plus per-entry features.npy / labels.npy written by this notebook
preprocessed_dir = os.path.join(data_dir, "preprocessed")

In [None]:
# Download the scPDB dataset and extract the dataset in "data/scPDB/raw"
# !aria2c -c -x 8 -s 8 -d "../data/scPDB" http://bioinfo-pharma.u-strasbg.fr/scPDB/ressources/2016/scPDB.tar.gz --out 'scPDB.tar.gz'
# !tar xvzf ../data/scPDB/scPDB.tar.gz -C ../data/scPDB/raw/

In [None]:
# For 10-fold Cross Validation, we will use the splits that were generated by https://arxiv.org/abs/1904.06517
# !aria2c -c -x 8 -s 8 -d "../data/scPDB" https://gitlab.com/cheminfIBB/kalasanty/-/archive/master/kalasanty-master.tar.gz?path=data --out 'kalasanty-master-data.tar.gz'
# !tar xvzf ../data/scPDB/kalasanty-master-data.tar.gz -C ../data/scPDB/

In [None]:
# For sequence-based prediction, we need to use RCSB FASTA files and since scPDB has only mol2 files for the proteins, we will download the FASTA and PDB files of the given proteins ourselves. This allows for proper calculation of the labels
# Note that the downloaded PDB automatically has all the structures of a particular PDB ID
# Hence, we just use the first structure instead of all
# Download sequence and PDB files from RCSB for easier matching of labels
# A bare `import urllib` does not reliably expose the `urllib.request` submodule
import urllib.request


def _download(url, destination, label):
    """Fetch `url` into `destination`, reporting failures instead of raising.

    A partially written file is removed on failure so that the
    "already downloaded" check below retries it on the next run.
    Returns True on success, False otherwise.
    """
    try:
        urllib.request.urlretrieve(url, destination)
    except Exception as err:
        print("Err: " + label, "(%s)" % err)
        if os.path.exists(destination):
            os.remove(destination)
        return False
    return True


for file in sorted(os.listdir(raw_dir)):
    # Entry folders are named "<pdb id>_<n>"; the first four characters are the PDB id
    pdb_id = file[:4]
    print(pdb_id)

    pdb_save = os.path.join(raw_dir, file, "downloaded.pdb")
    if not os.path.exists(pdb_save):
        _download(
            "http://files.rcsb.org/download/" + pdb_id + ".pdb",
            pdb_save,
            "pdb " + pdb_id,
        )

    fasta_save = os.path.join(raw_dir, file, "sequence.fasta")
    if not os.path.exists(fasta_save):
        # NOTE(review): this is the legacy RCSB FASTA endpoint. The chain-splitting
        # cell depends on its header layout, so the URL is left unchanged here -
        # confirm it is still served before re-running.
        _download(
            "https://www.rcsb.org/pdb/download/downloadFastaFiles.do?structureIdList="
            + pdb_id
            + "&compressionType=uncompressed",
            fasta_save,
            "fasta " + pdb_id,
        )

In [None]:
# Check whether downloaded PDB and sequence files are correct
# A few of the PDB files have been obsoleted and hence will have to be downloaded manually
# I wrote down the manual mapping of the PDB IDs but seem to have lost it
# But they can easily be found from the data/obseleted.txt file
import subprocess


def last_line(f):
    """Return the last line of the file at path `f`, stripped of whitespace.

    Mirrors `tail -n 1`: one trailing newline terminates the last line rather
    than starting a new empty one. An empty file yields "".

    The file is read backwards in fixed-size chunks, so large PDB files are not
    loaded in full and no external `tail` process is needed.

    Raises:
        OSError: if the file cannot be opened.
    """
    chunk_size = 4096
    tail = b""
    with open(f, "rb") as handle:
        handle.seek(0, os.SEEK_END)
        position = handle.tell()
        while position > 0:
            step = min(chunk_size, position)
            position -= step
            handle.seek(position)
            tail = handle.read(step) + tail
            # Ignore the final byte: it may be the newline terminating the last
            # line. Any earlier newline means the whole last line is buffered.
            if b"\n" in tail[:-1]:
                break
    if tail.endswith(b"\n"):
        tail = tail[:-1]
    return tail.rsplit(b"\n", 1)[-1].decode("utf-8").strip()


# Report every entry whose downloaded PDB does not end with an "END" record or
# whose FASTA file does not start with a ">" header.
for file in sorted(os.listdir(raw_dir)):
    pdb_id = file[:4]
    pdb_save = os.path.join(raw_dir, file, "downloaded.pdb")
    # A failed download may leave no file at all; report it instead of crashing
    if not os.path.isfile(pdb_save) or last_line(pdb_save) != "END":
        print("Err: PDB " + file)
    fasta_save = os.path.join(raw_dir, file, "sequence.fasta")
    if not os.path.isfile(fasta_save):
        print("Err: FASTA " + file)
        continue
    with open(fasta_save, "r") as f:
        # startswith() also copes with an empty file, where line[0] would raise
        if not f.readline().startswith(">"):
            print("Err: FASTA " + file)

In [None]:
# We need to generate MSAs for the protein sequences in the dataset
# For that, we need to split the sequence.fasta file into respective chain.fasta files
# Also, we need to remove the fasta files of DNA/RNA sequences

for file in sorted(os.listdir(raw_dir)):
    file = file.strip()
    pre = os.path.join(raw_dir, file)

    # Read SEQRES entries in PDB file to determine whether a chain
    # has a protein sequence or not
    pdb_file = os.path.join(pre, "downloaded.pdb")
    do_not_include = set()
    with open(pdb_file, "r") as f:
        # Iterating the file (instead of looping on readline()) ends at EOF,
        # so a PDB without any SEQRES record no longer spins forever.
        in_seqres = False
        for line in f:
            if line[:6] == "SEQRES":
                in_seqres = True
                chain_id = line[11]
                residue = line[19:22]
                # Generally DNA/RNA have 1 or 2-letter codes
                if " " in residue:
                    do_not_include.add(chain_id)
            elif in_seqres:
                # The SEQRES records form one contiguous run; stop after it
                break

    # Collect (header, [sequence pieces]) records. Lines before the first ">"
    # header (e.g. an error page saved in place of a FASTA file) are ignored.
    fasta = os.path.join(pre, "sequence.fasta")
    records = []
    with open(fasta, "r") as f:
        for line in f:
            if line.startswith(">"):
                records.append((line, []))
            elif records:
                records[-1][1].append(line.strip())

    for header, pieces in records:
        # assumes headers look like ">1ABC:A|..." so character 6 is the chain id - TODO confirm
        chain_id = header[6:7]
        if chain_id in do_not_include:
            continue
        with open(os.path.join(pre, chain_id + ".fasta"), "w") as hlp:
            hlp.write(header)
            # Each chain file holds the header plus the sequence on a single line
            hlp.write("".join(pieces) + "\n")

In [None]:
# In case you want to delete the generated fasta files from the above cell, use this
# for file in sorted(os.listdir(raw_dir)):
#     for fasta in glob(os.path.join(raw_dir, file.strip(), "?.fasta")):
#         os.remove(fasta)

In [None]:
# Let us remove some other troublesome fasta files
# trouble = ["1m1d_1/D.fasta", # The PDB file does not contain structure of this sequence at all
#             "2xbm_4/E.fasta", # RNA sequence that slipped past somehow
#             "2xbm_4/F.fasta"] # RNA sequence that slipped past somehow
# for chain in trouble:
#     file = os.path.join(raw_dir, chain)
#     if os.path.exists(file):
#         os.remove(file)

In [None]:
# The fasta files generated will have a lot of common sequences
# To speed up MSA generation, let us create a unique file that has common sequences
# Then we can generate the MSAs for only the first chain in every line
from collections import defaultdict

# sequence -> every "<entry>/<chain>*" whose chain FASTA holds exactly that sequence
sequences = defaultdict(list)
for file in sorted(os.listdir(raw_dir)):
    pre = os.path.join(raw_dir, file.strip())
    # Chain files are named "<single char>.fasta"; everything else is skipped
    chain_fastas = [name for name in sorted(os.listdir(pre)) if name[2:] == "fasta"]
    for fasta in chain_fastas:
        with open(os.path.join(pre, fasta)) as f:
            f.readline()  # header line
            sequence = f.readline().strip()
        # This choice was made so that rsync would work much better and easier
        sequences[sequence].append(file + "/" + fasta[0] + "*")

keys = list(sequences.keys())

# One line per distinct sequence: space-separated chains, first one is the representative
with open(os.path.join(data_dir, "unique"), "w") as f:
    f.writelines(" ".join(sequences[key]) + "\n" for key in keys)

In [None]:
# Run a regex search on the generated fasta files to ensure that we don't have any DNA/RNA sequences
# Will have to manually check the files to ensure that they are protein sequences
# All of them are protein sequences
import re


def match(strg, search=re.compile(r"[^ACGTURYKMSWBDHVN\-\.]").search):
    """Tell whether `strg` consists solely of nucleotide codes, "-" or ".".

    `search` looks for the first character outside that alphabet; finding none
    (which includes the empty string) means the string matches.
    """
    return search(strg) is None


with open(os.path.join(data_dir, "unique"), "r") as f:
    lines = f.readlines()

# Only the first chain on each line (the representative) needs checking
representatives = (entry.strip().split()[0] for entry in lines)
for representative in representatives:
    pdb_id_struct, starred_chain = representative.split("/")
    chain_id = starred_chain[0]  # drop the trailing "*"
    fasta_path = os.path.join(raw_dir, pdb_id_struct, chain_id + ".fasta")
    with open(fasta_path, "r") as f:
        f.readline()
        seq = f.readline().strip()
    # Print chains whose sequence could be read as DNA/RNA, for manual review
    if not match(seq):
        continue
    print(pdb_id_struct, chain_id)

In [None]:
# MSAs need to be generated for the fasta files
# Refer to https://github.com/crvineeth97/msa-generator

In [None]:
# MAKING OF .npz FILES FROM HERE
# Download NWalign.py, pdb2fasta.py and reindex_pdb.py
# https://zhanglab.ccmb.med.umich.edu/NW-align/NWalign.py (Make small changes for Python3 compatibility)
# https://zhanglab.ccmb.med.umich.edu/reindex_pdb/reindex_pdb.py (Make changes to allow for reindexing specific chains and also to replace selenocysteine, "U", with "X")
# https://zhanglab.ccmb.med.umich.edu/reindex_pdb/pdb2fasta.py (Almost the same)
# Let us preprocess ALL the available data and create .npz files which contain pdb_id, chain_id, sequence, length, labels
# %debug
# from IPython.core.debugger import set_trace
import warnings
from time import time

import numpy as np

from Bio import BiopythonWarning
from Bio.PDB import PDBParser, is_aa
from rdkit import Chem
from reindex_pdb import reindex_pdb

# Silence every BiopythonWarning for the rest of the session
# (presumably the ones raised while parsing irregular PDB files - TODO confirm)
warnings.simplefilter("ignore", BiopythonWarning)
# Single parser instance shared by initialize_protein_info
parser = PDBParser()


def initialize_protein_info(pdb_id_struct, chain_id):
    """Load the reindexed structure and the FASTA sequence of one chain.

    Reads "<raw_dir>/<pdb_id_struct>/tmp.pdb" and "<chain_id>.fasta" from the
    same folder.

    Returns a dict with:
        "structure": the parsed structure,
        "residues":  residues of `chain_id` in the first model that `is_aa`
                     accepts (non-standard allowed) and whose id has a blank
                     hetero flag,
        "sequence":  all FASTA lines after the header, joined together.
    """
    pre = os.path.join(raw_dir, pdb_id_struct)
    structure = parser.get_structure(pdb_id_struct, os.path.join(pre, "tmp.pdb"))

    residues = [
        res
        for res in structure[0][chain_id]
        if is_aa(res, standard=False) and res.get_id()[0] == " "
    ]

    with open(os.path.join(pre, chain_id + ".fasta")) as f:
        f.readline()  # skip the header
        sequence = "".join(line.strip() for line in f)

    return {"structure": structure, "residues": residues, "sequence": sequence}


def initialize_ligand_info(pdb_id_struct, chain_id):
    """Load the ligand of an entry from "<raw_dir>/<pdb_id_struct>/ligand.sdf".

    `chain_id` is accepted but not used.

    Returns a dict with:
        "supplier":   the single molecule read from the SDF file,
        "coords":     positions of its only conformer,
        "num_atoms":  its atom count,
        "atom_types": numpy array of the atoms' element symbols.

    Raises AssertionError when the file does not hold exactly one molecule,
    the molecule does not have exactly one conformer, or the atom count and
    the number of coordinates disagree.
    """
    sdf_path = os.path.join(raw_dir, pdb_id_struct, "ligand.sdf")
    molecules = Chem.SDMolSupplier(sdf_path, sanitize=False)
    assert len(molecules) == 1
    molecule = molecules[0]
    assert molecule.GetNumConformers() == 1
    coords = molecule.GetConformer().GetPositions()
    num_atoms = molecule.GetNumAtoms()
    assert num_atoms == len(coords)
    atom_types = np.array([atom.GetSymbol() for atom in molecule.GetAtoms()])
    return {
        "supplier": molecule,
        "coords": coords,
        "num_atoms": num_atoms,
        "atom_types": atom_types,
    }


def find_residues_in_contact(protein, ligand, cutoff=5.0):
    """
    Returns a numpy 1D array where a 1 represents that the amino acid is in
    contact with the ligand

    A residue is in contact when any of its atoms lies within `cutoff`
    (same unit as the coordinates; 5.0 by default) of any ligand atom.
    Ligand atoms typed "H" and protein atoms whose full name has "H" as its
    second character are skipped.

    Parameters:
        protein: dict with "sequence" (its length sets the output length) and
                 "residues" (objects exposing get_id() and get_atoms()).
        ligand:  dict with "coords", "atom_types" and "num_atoms".
        cutoff:  contact distance threshold.

    Raises:
        IndexError: if a residue number falls outside 1..len(sequence). A
                    number <= 0 would otherwise wrap around and label a
                    residue at the end of the sequence.
    """
    seq_len = len(protein["sequence"])
    labels = np.zeros(seq_len)

    # Filter out ligand hydrogens once instead of once per protein atom
    num_atoms = ligand["num_atoms"]
    atom_types = np.asarray(ligand["atom_types"], dtype=str)[:num_atoms]
    coords = np.asarray(ligand["coords"], dtype=float).reshape(-1, 3)[:num_atoms]
    heavy_coords = coords[atom_types != "H"]

    for residue in protein["residues"]:
        # Residue numbers are 1-based positions in the sequence
        res_ind = residue.get_id()[1] - 1
        if not 0 <= res_ind < seq_len:
            raise IndexError(
                "residue number %d is outside the sequence of length %d"
                % (res_ind + 1, seq_len)
            )
        for atom in residue.get_atoms():
            if atom.get_fullname()[1] == "H":
                continue
            # Distances from this atom to every heavy ligand atom at once
            distances = np.linalg.norm(heavy_coords - atom.get_coord(), axis=1)
            if np.any(distances <= cutoff):
                # One close atom is enough; move on to the next residue
                labels[res_ind] = 1
                break
    return labels


def get_distance_map_true(protein, atom_type):
    """Build the symmetric residue-residue distance map for one atom type.

    Parameters:
        protein:   dict with "sequence" (its length L sets the map size) and
                   "residues" (objects exposing get_id(), has_id() and
                   item access by atom name).
        atom_type: atom name to measure between, e.g. "CA".

    Returns:
        (L, L) numpy array. Entry [i, j] is the distance between the
        `atom_type` atoms of residues i+1 and j+1. Pairs where either residue
        is missing, or lacks that atom, keep the sentinel 1e6. The diagonal
        is 0.

    Raises:
        IndexError: if a residue number falls outside 1..L. A number <= 0
                    would otherwise wrap around to the end of the map.
    """
    length = len(protein["sequence"])
    # Don't use np.inf, use a large number
    distance_map = np.full((length, length), 1e6)  # Initialize to infinite distance

    # The number of residues might be smaller than length because of missing
    # residues, so record each usable residue's sequence position with its coords
    positions = []
    coords = []
    for res in protein["residues"]:
        if not res.has_id(atom_type):
            continue
        pos = res.get_id()[1] - 1
        if not 0 <= pos < length:
            raise IndexError(
                "residue number %d is outside the sequence of length %d"
                % (pos + 1, length)
            )
        positions.append(pos)
        coords.append(res[atom_type].get_coord())
    positions = np.array(positions, dtype=int)
    coords = np.array(coords).reshape(-1, 3)

    # One vectorised row per residue: distances to all later residues at once
    for k in range(len(positions) - 1):
        dists = np.linalg.norm(coords[k + 1 :] - coords[k], axis=1)
        distance_map[positions[k], positions[k + 1 :]] = dists
        distance_map[positions[k + 1 :], positions[k]] = dists
    np.fill_diagonal(distance_map, 0.0)
    return distance_map


os.makedirs(preprocessed_dir, exist_ok=True)

process_time = 0
write_time = 0
for pdb_id_struct in sorted(os.listdir(raw_dir)):
    pre = os.path.join(raw_dir, pdb_id_struct)
    if not os.path.exists(os.path.join(pre, "downloaded.pdb")):
        print("Downloaded PDB does not exist for %s" % pdb_id_struct)
        continue
    for file in os.listdir(pre):
        # Get only the chain fasta sequences
        if file[2:] != "fasta":
            continue
        chain_id = file[0]

        # If our preprocessed file exists, continue
        if os.path.exists(
            os.path.join(preprocessed_dir, pdb_id_struct, chain_id + ".npz")
        ):
            continue

        print(pdb_id_struct, chain_id)
        # Start the clock per chain. Starting it once per structure counted the
        # earlier chains (and their write time) again for every later chain.
        process_time_start = time()

        # Reindex the chain and write to tmp.pdb
        dest = os.path.join(pre, "tmp.pdb")
        PDBtxt_reindex = reindex_pdb(
            os.path.join(pre, chain_id + ".fasta"),
            os.path.join(pre, "downloaded.pdb"),
            True,
        )
        if PDBtxt_reindex is None:
            print(pdb_id_struct, chain_id, "reindex fail")
            continue

        with open(dest, "w") as fp:
            fp.write(PDBtxt_reindex)

        try:
            # Initialize information required for the complex
            protein = initialize_protein_info(pdb_id_struct, chain_id)
            ligand = initialize_ligand_info(pdb_id_struct, chain_id)

            # Make the dictionary for storage
            try:
                data = {}
                data["pdb_id_struct"] = pdb_id_struct
                data["chain_id"] = chain_id
                data["sequence"] = protein["sequence"]
                data["length"] = len(data["sequence"])
                data["labels"] = find_residues_in_contact(protein, ligand)
                data["ca_dist_map_true"] = get_distance_map_true(protein, "CA")
                assert len(data["sequence"]) == len(data["labels"])
            except Exception as err:
                # Skip this chain, but show why; `except Exception` (unlike a
                # bare except) still lets KeyboardInterrupt stop the run
                print(pdb_id_struct, chain_id, "dictionary fail", repr(err))
                continue
        finally:
            # Remove the tmp.pdb file, whether or not the chain succeeded
            os.remove(dest)
        process_time += time() - process_time_start

        # Write the data to a numpy .npz file
        write_time_start = time()
        folder = os.path.join(preprocessed_dir, pdb_id_struct)
        os.makedirs(folder, exist_ok=True)
        np.savez(os.path.join(folder, chain_id + ".npz"), **data)
        write_time += time() - write_time_start

print("Processing time:", process_time)
print("Write time:", write_time)

In [None]:
# Let us take all the amino acid properties available from the AAindex database
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2238890/
# The file AA_properties in the data folder contains all the values
# We will take only the features which are the least correlated as our features
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import cm as cm

%matplotlib inline

# Raw AAindex property table, read relative to the notebook's working directory
tmp_df = pd.read_csv("../data/AA_properties.csv", sep=",")
# Drop the first 7 columns and transpose, so the remaining CSV columns become rows
# (presumably one row per amino acid and one column per property - TODO confirm against the CSV)
df = tmp_df.iloc[:, 7:].T

# Function used to normalize the values between 0 and 1
def normalize(df):
    """Min-max scale each column of `df` independently.

    Every column is mapped through (x - min) / (max - min) using that column's
    own minimum and maximum. A new DataFrame is returned; `df` is not modified.
    """

    def _rescale(column):
        lowest = column.min()
        highest = column.max()
        return (column - lowest) / (highest - lowest)

    return df.apply(_rescale)


# A way to view the correlation matrix
def correlation_matrix(df):
    """Show the pairwise correlation of the columns of `df` as a heat map.

    Uses `df.corr()` with its default method and displays the figure with
    `plt.show()`; nothing is returned.
    """
    fig, ax1 = plt.subplots()
    # plt.get_cmap replaces matplotlib.cm.get_cmap, which recent Matplotlib removed
    cmap = plt.get_cmap("jet", 30)
    cax = ax1.imshow(df.corr(), interpolation="nearest", cmap=cmap)
    ax1.grid(True)
    ax1.set_title("Feature Correlation")
    # imshow already ticks the axes with the column positions of the matrix.
    # Overwriting the tick labels with 0..len(df)-1 (a row count) without fixing
    # the tick locations mislabelled them, so only the font size is changed here.
    ax1.tick_params(axis="both", labelsize=6)
    # Add colorbar, make sure to specify tick locations to match desired ticklabels
    fig.colorbar(cax, ticks=np.arange(-1.1, 1.1, 0.1))
    plt.show()


# Using spearman correlation
corr = df.corr("spearman")
threshold = 0.6
# A feature is discarded when ANY earlier feature correlates with it at
# |rho| >= threshold, whether or not that earlier feature was itself kept.
# Only the strict upper triangle (row index < column index) is inspected.
strongly_correlated = np.abs(corr.values) >= threshold
columns = ~np.triu(strongly_correlated, k=1).any(axis=0)
selected_columns = df.columns[columns]
features = df[selected_columns]

# Saving all the features and the selected features
normalize(df).to_csv("../data/all_features.csv")
normalize(features).to_csv("../data/selected_features.csv")

In [2]:
# Assuming that all the PSSMs have been copied to data/scPDB/pssm
# We can have more features included as well. For now, let us consider PSSMs
# !rsync -avP --include="*/" --include="*pssm" --exclude="*" ~/Git/msa-generator/data/scPDB/ ~/Git/protein-binding-site-prediction/data/scPDB/pssm/

# Get amino acid properties from the created files above
import csv
from collections import defaultdict

import numpy as np


def get_amino_acid_properties(csv_file):
    """Read a per-residue property table into a lookup of feature vectors.

    The first CSV row is a header. In every other row the first cell is the
    key and the remaining cells are values, with empty cells read as 0.0.

    Returns a defaultdict mapping key -> float32 numpy vector. "X" is always
    set to a zero vector, and any unknown key also yields a zero vector of
    the same length.
    """
    with open(csv_file) as f:
        rows = list(csv.reader(f))

    if rows:
        # Number of value columns (the header row minus the key column)
        length = len(rows[0]) - 1

    feats = {
        row[0]: np.array(
            [0.0 if el == "" else float(el) for el in row[1:]], dtype=np.float32
        )
        for row in rows[1:]
    }
    feats["X"] = np.zeros(length)
    return defaultdict(lambda: np.zeros(length), feats)


# List of amino acids and their integer representation
# "X" (unknown) takes id 0; the 20 one-letter amino acid codes follow in
# alphabetical order with ids 1..20. Any other letter also looks up as 0.
AA_ID_DICT = defaultdict(
    lambda: 0,
    ((letter, index) for index, letter in enumerate("XACDEFGHIKLMNPQRSTVWY")),
)

# One-hot encoding (21 entries) and positional encoding (1 entry)
feat_vec_len = 21 + 1

# We generated PSSMs for select sequences
# Mapping different sequences to the one for which we generated
# "<entry>/<chain>" -> "<entry>/<chain>" of the representative chain whose PSSM
# stands in for it (the first chain on the same line of the "unique" file)
common_pssms = defaultdict(str)
with open(os.path.join(data_dir, "unique"), "r") as f:
    for line in f:
        # Every token ends in "*"; drop it to get the bare "<entry>/<chain>"
        chains = [token[:-1] for token in line.split()]
        pssm_generated_for = chains[0]
        for pdb_id_struct_chain in chains:
            common_pssms[pdb_id_struct_chain] = pssm_generated_for


def get_pssm(pdb_id_struct, chain_id, length):
    """Load the PSSM that stands in for the given chain.

    The chain is mapped through `common_pssms` to the chain a PSSM was
    generated for, and "<pssm_dir>/<entry>/<chain>.pssm" is read as
    whitespace-separated numbers, one matrix row per line.

    Parameters:
        pdb_id_struct: entry folder name.
        chain_id:      chain identifier.
        length:        number of residues, i.e. expected values per row.

    Returns:
        (21, length) numpy array; rows missing from the file stay zero.

    Raises:
        ValueError: if the chain has no entry in `common_pssms`, or a row
                    does not contain `length` values.
    """
    key = pdb_id_struct + "/" + chain_id
    # .get() keeps the defaultdict from silently inserting an empty entry for
    # an unknown chain, which then failed with an opaque unpacking error
    source = common_pssms.get(key)
    if not source:
        raise ValueError("No PSSM mapping found for chain %s" % key)
    pssm_pdb_id_struct, pssm_chain_id = source.split("/")
    with open(
        os.path.join(pssm_dir, pssm_pdb_id_struct, pssm_chain_id + ".pssm"), "r"
    ) as f:
        # Skip blank lines (e.g. a trailing newline) instead of failing on them
        rows = [line.split() for line in f if line.strip()]
    feature = np.zeros((21, length))
    for i, row in enumerate(rows):
        feature[i] = np.array(row, dtype=np.float32)
    return feature


# PSSM length
# get_pssm returns a (21, length) array, i.e. 21 PSSM values per residue
feat_vec_len += 21

# Amino acid physico-chemical features selected by removing highly correlated features
# os.path.dirname(data_dir) is the folder that contains "scPDB", which is where
# the feature-selection cell writes selected_features.csv
AA_sel_feats = get_amino_acid_properties(
    os.path.join(os.path.dirname(data_dir), "selected_features.csv")
)
# The "X" entry always exists and holds one zero per property column,
# so its length is the number of selected properties
feat_vec_len += len(AA_sel_feats["X"])

In [None]:
# Now, let us preprocess the files again to generate the features
# directly, so that they can be imported into PyTorch easily
# For that, we define the 2 cells below, where the generate_input function
# can be used to generate various types of inputs

In [11]:
# Without using distance map
# def generate_input(sample):
#     """
#     Generate input for a single sample
#     """
#     X = np.zeros((feat_vec_len, sample["length"]))

#     # One-hot encoding
#     X[:21] = np.array([np.eye(21)[AA_ID_DICT[el]] for el in sample["sequence"]]).T

#     # Positional encoding
#     X[21] = np.arange(1, sample["length"] + 1, dtype=np.float32) / sample["length"]

#     # PSSM
#     X[22:43] = sample["pssm"]

#     # AA Properties
#     X[43:] = np.array([AA_sel_feats[aa] for aa in sample["sequence"]]).T

#     return X


# Using the distance map to pick the `close_aa` closest residues (10 as set below)
# and creating features for each amino acid independent of the total protein structure
# Let us not use one-hot and positional encoding to save space and include distance?
# Per-neighbour layout: Amino acid no. + Distance from curr. aa + PSSM + AA Properties
# NOTE: this overwrites the feat_vec_len accumulated in the previous cell
feat_vec_len = 1 + 1 + 21 + len(AA_sel_feats["X"])
# Number of nearest residues (by the CA distance map) described per amino acid
close_aa = 10
import numpy as np


def generate_input(sample):
    """Build the neighbourhood feature matrix of one chain.

    For every residue (one output column), up to `close_aa` residues are taken
    in order of increasing distance in sample["ca_dist_map_true"]. Each one
    fills a slot of `feat_vec_len` rows: amino-acid id / 20, distance, the 21
    PSSM values of that neighbour, then its selected AA properties. Slots
    after the first neighbour at distance 1e6 (no structural information)
    stay zero.

    Returns an array of shape (feat_vec_len * close_aa, sample["length"]).
    """
    n_residues = sample["length"]
    X = np.zeros((feat_vec_len * close_aa, n_residues))
    for column, distances in enumerate(sample["ca_dist_map_true"]):
        if n_residues > close_aa:
            # Partial selection of the `close_aa` smallest entries, then order
            # just those by distance
            nearest = list(np.argpartition(distances, close_aa)[:close_aa])
            nearest.sort(key=lambda k: distances[k])
        else:
            nearest = np.argsort(distances)
        for slot, neighbour in enumerate(nearest):
            if distances[neighbour] == 1e6:
                # Implies we don't have structural info about any AA after this
                break
            base = slot * feat_vec_len
            letter = sample["sequence"][neighbour]
            X[base, column] = AA_ID_DICT[letter] / 20
            X[base + 1, column] = distances[neighbour]
            X[base + 2 : base + 2 + 21, column] = sample["pssm"][:, neighbour]
            X[base + 2 + 21 : base + feat_vec_len, column] = np.array(
                AA_sel_feats[letter]
            )
    return X


# With an inverted distance map
# This does not work very well
# def generate_input(sample):
#     X = np.zeros((feat_vec_len, sample["length"]))

#     # One-hot encoding
#     X[:21] = np.array([np.eye(21)[AA_ID_DICT[el]] for el in sample["sequence"]]).T

#     # PSSM
#     X[22:43] = sample["pssm"]

#     # AA Properties
#     X[43:] = np.array([AA_sel_feats[aa] for aa in sample["sequence"]]).T

#     # Invert the distance map and matrix multiply with X so that we get a combination of all features
#     inverted_dist = 1 / sample["ca_dist_map_true"]
#     np.fill_diagonal(inverted_dist, 1.0)
#     X_T = inverted_dist.dot(X.T)
#     for i in range(sample["length"]):
#         X_T[i] = X_T[i] / np.sum(inverted_dist[i])

#     X = X_T.T

#     # Positional encoding
#     X[21] = np.arange(1, sample["length"] + 1, dtype=np.float32) / sample["length"]

#     return X

In [12]:
# USING CONCATENATION STRATEGY

# For every preprocessed entry, build the inputs of all its chains and store
# them concatenated along the residue axis, together with the matching labels.
for pdb_id_struct in sorted(os.listdir(preprocessed_dir)):
    pre = os.path.join(preprocessed_dir, pdb_id_struct)
    features_file = os.path.join(pre, "features.npy")
    labels_file = os.path.join(pre, "labels.npy")

    # labels.npy is written last, so its presence marks a finished entry.
    # In case features were generated but not labels, both are redone.
    if os.path.exists(labels_file):
        continue
    print(pdb_id_struct)

    chain_inputs = []
    chain_labels = []
    for file in sorted(os.listdir(pre)):
        # Only the per-chain archives are inputs; skip features.npy and anything else
        if not file.endswith(".npz"):
            continue
        chain_id = file[-len(".npz") - 1 : -len(".npz")]
        # The context manager closes the archive once its arrays are copied out
        with np.load(os.path.join(pre, file)) as archive:
            sample = {}
            for key in archive.files:
                value = archive[key]
                # 0-d arrays hold scalars/strings; `==` is needed here because
                # `is ()` only compares object identity
                sample[key] = value.item() if value.shape == () else value
        sample["pssm"] = get_pssm(pdb_id_struct, chain_id, sample["length"])
        chain_inputs.append(generate_input(sample))
        chain_labels.append(sample["labels"])

    if not chain_inputs:
        # Without this guard the X / y of the previous entry would be saved here
        print(pdb_id_struct, "has no chain .npz files, skipping")
        continue

    # Using concatenation strategy
    X = np.concatenate(chain_inputs, 1)
    y = np.concatenate(chain_labels, 0)

    np.save(features_file, X)
    np.save(labels_file, y)


1ae8_1
1af0_1
1af7_1
1afe_1
1afs_1
1agn_3
1agw_1
1ah0_1
1ah3_1
1ah4_1
1ahb_1
1ahg_2
1ahh_2
1ahi_2
1ahn_1
1ai0_3
1ai9_2
1aid_1
1aiq_2
1aiy_6
1aj0_1
1aj0_2
1aj2_1
1aj8_1
1ajv_1
1ajx_1
1aka_1
1akb_1
1akc_1
1ake_1
1akr_1
1aku_1
1akw_1
1aky_2
1al7_1
1al8_1
1am1_1
1am4_2
1amo_1
1amo_5
1amo_6
1amw_1
1an5_1
1ank_2
1ao0_1
1ao0_5
1ao8_1
1aob_1
1aoe_2
1aq1_1
1aq2_1
1aqb_1
1aqi_1
1aqu_2
1aqv_1
1aqx_3
1arg_2
1arh_2
1arz_1
1asb_1
1asc_1
1atl_1
1atn_1
1atr_1
1ats_1
1aux_1
1av5_1
1avd_2
1axe_2
1axg_4
1axw_2
1ay0_1
1ay5_1
1ayl_1
1ayp_5
1az1_2
1az2_1
1azl_1
1azt_2
1b11_1
1b14_2
1b15_2
1b16_2
1b2l_1
1b2l_2
1b2r_1
1b38_1
1b39_1
1b3d_1
1b3r_2
1b48_1
1b4p_1
1b4v_1
1b5d_2
1b5e_2
1b5q_2
1b5t_2
1b6k_1
1b6l_1
1b6m_1
1b6p_1
1b7a_1
1b7t_1
1b7y_1
1b87_1
1b8n_1
1b8o_1
1b8s_1
1b8u_1
1b8v_1
1b8y_1
1b9i_1
1bai_1
1bc5_1
1bcp_1
1bcu_1
1bd4_3
1bdb_1
1bdi_1
1bdl_1
1bdm_2
1bdq_1
1bdr_1
1bdu_1
1be4_1
1beu_1
1bfd_1
1bgq_1
1bh5_3
1bi9_4
1bid_1
1bif_1
1bil_1
1bim_1
1biw_1
1bjk_1
1bjq_6
1bk0_1
1bkf_1
1bkg_1
1bl4_2
1bl6_1
1bl7_1

1i7m_1
1i7p_1
1i80_2
1i80_3
1i8t_2
1i8z_1
1i90_1
1i91_1
1i9c_1
1i9g_1
1i9h_1
1i9l_2
1i9m_2
1i9n_1
1i9o_2
1i9p_1
1i9q_2
1ia1_2
1ia1_3
1ia2_2
1ia3_2
1ia4_2
1ia9_1
1iah_2
1iay_1
1ib0_1
1ib0_2
1ib1_1
1ib6_2
1ibr_2
1ibs_2
1icp_2
1icq_1
1icq_3
1ics_1
1ict_2
1icu_3
1icu_5
1icv_1
1icv_4
1id0_1
1ida_1
1idb_1
1idt_1
1ie3_4
1ie4_2
1ie8_1
1ie9_1
1iei_1
1iep_1
1if4_1
1if5_1
1if6_1
1if7_1
1if8_1
1ifu_1
1ifx_1
1ig0_1
1ig1_1
1ih7_1
1ih8_1
1ihi_2
1ihx_3
1ihy_3
1ii6_2
1iid_1
1iin_1
1iiq_1
1iiu_1
1ij8_1
1ije_1
1ijh_1
1ijj_1
1ijj_3
1ikg_1
1iki_1
1ikv_1
1ikw_1
1ikx_1
1iky_1
1il0_3
1il0_4
1im2_1
1in5_1
1iol_1
1ipe_1
1ipf_1
1iqe_1
1iqg_1
1iqh_1
1iqr_1
1ir3_1
1irj_3
1isg_1
1isi_2
1isj_1
1ism_1
1it7_1
1it8_1
1itz_2
1iv2_4
1iv4_3
1ivh_2
1ivp_1
1ivq_1
1ivr_1
1ivs_2
1iwi_1
1iwj_1
1ix1_2
1ixe_2
1iy8_5
1iyz_1
1izh_1
1izi_1
1j07_1
1j0b_9
1j0d_2
1j0x_2
1j1b_1
1j1c_2
1j1z_4
1j21_4
1j2g_4
1j36_2
1j37_1
1j39_1
1j3i_1
1j3j_1
1j3j_3
1j3j_6
1j3k_2
1j3k_4
1j49_1
1j4h_1
1j4j_1
1j5i_1
1j5p_1
1j6z_1
1j78_1
1j7k_1
1j7l_1
1j7u_2

1pd9_1
1pdh_1
1pdh_2
1peo_1
1peq_1
1pf7_1
1pf8_1
1pf9_2
1pfy_1
1pg0_1
1pg2_1
1pg3_1
1pg8_2
1pgt_2
1phd_1
1phe_1
1phg_1
1phh_1
1phk_1
1php_1
1phq_1
1pi3_1
1piv_1
1piw_1
1pj2_5
1pj3_5
1pj4_5
1pj6_1
1pj7_1
1pjc_1
1pjk_1
1pjl_16
1pk7_2
1pk8_1
1pk9_2
1pkd_2
1pke_3
1pkf_1
1pkg_2
1pkv_1
1pl1_2
1pl2_2
1pl6_3
1pl6_5
1pl8_4
1pl9_1
1pme_1
1pmn_1
1pmu_1
1pmv_1
1pn3_1
1pn4_3
1pn9_1
1pnl_1
1pno_2
1pnq_1
1pnr_1
1pnv_1
1po7_1
1pow_1
1pow_2
1pox_1
1pox_4
1pp9_1
1ppj_2
1ppj_3
1ppk_1
1ppk_2
1ppl_1
1ppm_1
1ppr_11
1pq6_1
1pq9_4
1pqc_4
1pr0_3
1pr1_2
1pr4_3
1pr5_3
1pr6_3
1pr9_2
1prc_1
1pro_1
1ps9_1
1ps9_2
1psa_1
1psd_1
1pt5_2
1pt8_1
1pt9_2
1ptj_1
1ptj_2
1pu8_2
1pu9_1
1pua_1
1pvd_2
1pvg_1
1pvo_6
1pvs_2
1pw1_1
1pw6_2
1pw7_3
1pwl_1
1pwm_1
1pwu_2
1pwy_1
1pwz_1
1px0_2
1px2_1
1pxg_1
1pxj_1
1pxk_1
1pxl_1
1pxm_1
1pxo_1
1pxp_1
1pxx_4
1py5_1
1pyd_2
1pye_1
1pyx_2
1pz0_1
1pz1_2
1pzf_3
1pzg_3
1pzh_1
1pzo_2
1pzp_1
1q0b_2
1q0q_1
1q0r_1
1q0z_1
1q12_4
1q13_1
1q19_4
1q1g_2
1q1r_2
1q1w_1
1q20_1
1q22_1
1q23_7
1q24_1
1q2r_1
1q3a

1wsb_1
1wsv_2
1wtg_1
1wua_1
1wua_2
1wuq_2
1wur_4
1wuu_2
1wuw_1
1wv4_2
1wv7_1
1wvg_2
1wvg_3
1wvx_1
1wvy_1
1wwk_2
1wwz_2
1wxj_1
1wxy_1
1wy7_2
1wyg_1
1wyg_2
1wze_1
1wzi_2
1wzn_1
1wzy_1
1x01_2
1x09_2
1x0p_3
1x14_1
1x15_1
1x1a_1
1x1b_1
1x1c_1
1x1d_1
1x1j_1
1x1r_1
1x27_1
1x27_3
1x28_2
1x2b_1
1x2e_1
1x2h_1
1x31_2
1x3n_1
1x76_1
1x78_1
1x7b_1
1x7d_2
1x7e_1
1x7g_2
1x7h_2
1x7j_2
1x7r_1
1x88_2
1x8b_1
1x8j_1
1x8l_2
1x8v_1
1x96_1
1x98_1
1x98_2
1xa4_1
1xa5_1
1xag_1
1xah_1
1xaj_2
1xal_2
1xb0_5
1xb1_2
1xbt_7
1xcb_7
1xco_5
1xd9_2
1xdd_1
1xdg_2
1xdi_2
1xdk_2
1xdp_2
1xdq_2
1xds_1
1xdu_1
1xdy_7
1xe5_1
1xe6_2
1xe7_2
1xef_3
1xel_1
1xf0_1
1xg5_3
1xgi_1
1xgj_2
1xh4_1
1xh5_1
1xh6_1
1xh7_1
1xh8_1
1xh9_1
1xha_1
1xhl_2
1xi2_2
1xi9_2
1xid_1
1xiu_1
1xj7_1
1xjd_1
1xje_1
1xje_4
1xjg_2
1xjj_1
1xjk_2
1xjk_3
1xjm_2
1xjn_4
1xjn_6
1xjq_1
1xjw_1
1xk9_2
1xkd_2
1xkk_1
1xkq_4
1xkv_1
1xl2_1
1xl5_1
1xl8_1
1xli_1
1xls_1
1xls_5
1xm1_1
1xmm_2
1xms_1
1xmv_1
1xng_2
1xnj_1
1xnx_2
1xny_3
1xny_4
1xnz_1
1xo2_1
1xoi_2
1xom_1
1xon_2
1xoq_2

2dxs_2
2dxv_1
2dxw_1
2dxx_1
2dy9_1
2dza_1
2dzb_2
2e07_1
2e08_1
2e0a_2
2e0i_4
2e0n_1
2e15_1
2e16_1
2e17_1
2e1m_1
2e1t_1
2e1z_1
2e20_1
2e2b_2
2e2p_1
2e2q_1
2e37_3
2e40_1
2e41_1
2e48_1
2e4n_1
2e5a_1
2e5m_1
2e77_3
2e7f_1
2e7r_1
2e7z_1
2e82_4
2e83_1
2e8h_1
2e8q_1
2e8r_1
2e8s_1
2e8t_2
2e8u_1
2e8w_1
2e91_2
2e93_2
2e94_2
2e95_2
2e98_1
2e99_1
2e9a_1
2e9c_4
2e9d_1
2e9n_1
2e9o_1
2e9p_1
2e9u_1
2e9v_2
2e9z_1
2ea1_1
2ea2_1
2ea4_1
2eat_1
2eat_2
2eau_1
2eb3_1
2eba_3
2ec9_1
2eck_1
2ed3_1
2ed4_3
2ed4_4
2ed5_1
2eeq_1
2eer_1
2eft_1
2efy_2
2eg2_1
2eg5_4
2eg8_1
2egb_1
2egh_1
2egl_1
2egs_1
2egv_2
2egw_2
2eh2_1
2eh4_1
2eh5_1
2ehl_1
2ehq_1
2ehu_1
2ei0_2
2ei7_1
2eii_1
2eit_1
2eix_2
2ej2_3
2ejj_1
2ejk_1
2eju_1
2ejv_2
2ejz_2
2ek2_1
2ek3_1
2ek4_1
2ek7_1
2ek9_1
2eka_1
2ekl_1
2ekp_1
2el0_1
2el3_1
2eld_1
2ele_1
2emr_1
2emu_1
2en5_1
2eni_1
2ep7_1
2eph_1
2eq6_2
2eq7_1
2eq7_4
2eq8_1
2eq9_7
2erp_2
2erz_1
2esa_1
2esd_3
2etk_1
2etm_1
2etr_2
2eu3_1
2eu8_1
2eud_1
2eud_2
2euf_1
2euh_1
2ev9_1
2eva_1
2evc_1
2evm_1
2evo_1
2ew5_1

2ohi_8
2ohj_4
2ohl_1
2ohm_1
2ohp_1
2ohq_1
2ohr_1
2oht_1
2ohu_1
2ohv_1
2ohx_1
2oi6_2
2oiq_1
2oj9_1
2ojf_1
2ojg_1
2oji_1
2ojj_1
2ojt_2
2ok7_5
2ok7_8
2ok8_2
2okc_1
2oke_2
2okl_1
2ol0_1
2ol4_2
2olc_2
2oli_1
2olo_1
2olq_1
2olr_1
2olv_1
2om9_8
2ome_1
2on6_1
2onf_1
2onm_11
2onm_2
2onp_2
2ony_1
2onz_1
2oo5_2
2oo7_1
2oor_2
2oos_2
2ooy_1
2op0_1
2op1_2
2op9_1
2opb_1
2opm_1
2opn_1
2opp_1
2opq_1
2opr_1
2ops_1
2opx_1
2ore_1
2oro_1
2ors_1
2os1_1
2os3_1
2osb_1
2osc_1
2osf_1
2osm_1
2oth_1
2oth_2
2ou2_1
2ou5_1
2oub_1
2our_1
2ouu_1
2ouy_1
2ouz_1
2ov2_2
2ovh_1
2ovm_1
2ovv_1
2ovx_2
2ovy_1
2ovz_2
2ow0_2
2ow1_2
2ow2_2
2ow3_1
2ow9_1
2owb_1
2owf_1
2owg_1
2owk_1
2owu_1
2owv_1
2oww_1
2oxd_1
2oxi_2
2oxn_1
2oxt_4
2oxw_1
2oxx_1
2oxy_2
2oy0_2
2oy2_1
2oye_1
2oyf_1
2oyl_1
2oym_2
2oyu_1
2oz0_2
2oz5_3
2oz6_2
2ozg_1
2ozo_1
2ozr_7
2ozu_1
2p0a_2
2p0m_1
2p0r_2
2p15_1
2p1d_2
2p1o_1
2p1p_1
2p1t_1
2p1u_1
2p1v_1
2p20_2
2p2b_1
2p2b_3
2p2f_2
2p2h_1
2p2i_1
2p2j_2
2p2l_2
2p2m_1
2p2q_2
2p2x_1
2p33_1
2p35_2
2p3a_1
2p3b_1
2p3c_1
2p3d_

2ws7_1
2wsa_1
2wsa_2
2wsb_4
2wsi_1
2wtc_1
2wtd_1
2wti_1
2wtj_1
2wtk_1
2wu1_1
2wu2_2
2wu4_1
2wu5_2
2wue_1
2wuf_1
2wug_1
2wuu_1
2wuz_2
2wvj_2
2wvl_1
2wvm_2
2ww4_1
2wwj_2
2wx2_1
2wxv_2
2wya_3
2wyj_1
2wyv_2
2wzb_1
2wzg_1
2wzm_1
2wzv_2
2wzw_2
2wzw_3
2wzy_4
2x06_8
2x0e_1
2x0f_1
2x0i_1
2x0q_1
2x0r_2
2x0v_1
2x0w_1
2x0y_2
2x19_1
2x1e_1
2x1h_3
2x1l_1
2x1n_2
2x1z_3
2x20_3
2x21_3
2x22_2
2x23_2
2x2k_1
2x2l_1
2x2m_1
2x2n_4
2x2r_1
2x2r_5
2x3f_2
2x3j_2
2x3n_1
2x45_1
2x4f_2
2x4z_1
2x5w_1
2x5z_1
2x60_1
2x6d_1
2x6o_1
2x6t_2
2x72_1
2x7b_1
2x7c_1
2x7d_1
2x7e_1
2x7e_3
2x7f_5
2x7g_1
2x7h_1
2x7j_4
2x7s_2
2x7t_1
2x7u_1
2x81_1
2x86_3
2x86_7
2x8e_1
2x8g_1
2x8h_1
2x8i_1
2x90_1
2x91_1
2x92_1
2x93_1
2x94_1
2x95_1
2x96_1
2x97_1
2x9d_1
2x9f_1
2x9g_1
2x9n_2
2x9v_3
2xa4_2
2xaa_3
2xab_2
2xae_3
2xae_5
2xan_2
2xau_1
2xb5_1
2xb7_1
2xb8_1
2xb9_1
2xba_1
2xbj_1
2xbk_1
2xbm_4
2xbx_1
2xby_1
2xc3_1
2xc5_1
2xch_1
2xck_1
2xcl_1
2xcm_1
2xd6_1
2xda_1
2xdo_2
2xdr_2
2xe7_1
2xe8_1
2xef_1
2xeg_1
2xei_1
2xej_1
2xey_1
2xez_1
2xf0_1
2xf3_1

3cas_2
3cav_1
3cb2_1
3cbg_1
3cbl_1
3cbp_1
3cbs_1
3ccb_3
3ccc_1
3cd2_1
3cd3_1
3cd5_4
3cde_1
3ce0_1
3ce3_1
3cea_3
3ceh_1
3cem_1
3cf0_13
3cfo_1
3cgb_1
3cgb_2
3cgc_1
3cgc_2
3cgd_1
3cgd_3
3cgd_5
3cge_1
3cge_3
3cge_5
3cgf_1
3cgo_1
3cgt_1
3cgy_1
3ch6_2
3cho_1
3chr_1
3chs_1
3chs_2
3cht_1
3chw_1
3cib_1
3cic_2
3cid_2
3cif_1
3cin_1
3cis_3
3cj3_1
3cjf_1
3cjg_1
3cjo_2
3cjq_3
3cjt_1
3ck7_2
3ckq_1
3ckt_1
3cku_1
3ckx_1
3cl1_1
3cl2_8
3clb_2
3clb_5
3clh_2
3clr_1
3cls_1
3clt_1
3clu_1
3clx_3
3cly_1
3cmc_1
3cmf_2
3cmp_1
3cmp_2
3cn8_1
3cnd_2
3cne_1
3cnj_1
3cnp_1
3cns_1
3cnt_1
3cnt_2
3co9_2
3cob_1
3cok_2
3cop_1
3cos_3
3cot_1
3cow_1
3cox_1
3coy_2
3coz_2
3cp6_1
3cp9_1
3cph_1
3cpo_1
3cpp_1
3cps_2
3cq5_1
3cqe_1
3cqu_1
3cqw_1
3cr0_1
3cr3_2
3cr7_1
3crl_1
3crz_2
3cs8_1
3cs9_1
3csc_1
3csd_1
3cse_1
3csj_1
3cso_2
3cth_1
3ctj_1
3cty_1
3cui_1
3cuk_2
3cuk_6
3cv2_1
3cv6_2
3cv7_1
3cv9_1
3cvk_2
3cvu_1
3cvy_1
3cw8_1
3cw9_2
3cwj_1
3cwk_1
3cwq_1
3cx4_1
3cx5_2
3cx6_1
3cx8_1
3cxh_1
3cxi_2
3cxw_1
3cy2_1
3cy3_1
3cyl_1
3cyw_1
3cyx_

3i60_1
3i64_1
3i68_1
3i6d_2
3i6i_1
3i6o_1
3i6q_1
3i6r_1
3i7e_1
3i7g_1
3i7i_1
3i7v_1
3i81_1
3i8a_1
3i8p_1
3i97_1
3i99_1
3i9j_1
3i9j_2
3i9k_1
3ia4_4
3iaa_2
3iae_2
3iaf_1
3iah_2
3iak_1
3ibd_1
3ibe_1
3ibq_1
3ic9_1
3icp_1
3icr_2
3icr_3
3ics_2
3ics_3
3ics_4
3ict_1
3ict_2
3ict_5
3icz_1
3icz_2
3id8_1
3idb_1
3idc_1
3ids_1
3ie3_2
3ied_1
3iei_1
3iej_1
3iel_1
3ies_1
3ieu_2
3iew_2
3iew_3
3iex_1
3if2_1
3if9_3
3ig6_1
3ig7_1
3ig8_1
3igg_1
3igo_1
3igp_1
3igv_2
3ih0_1
3ihg_1
3ihg_4
3ihk_3
3ihz_1
3ii4_1
3iid_1
3iif_3
3iiq_2
3iis_3
3iiu_1
3ijd_4
3ijo_2
3ijr_7
3ijw_2
3ijz_1
3ik1_1
3ik6_1
3ik7_4
3ik9_8
3ikt_1
3il1_1
3il4_4
3il5_4
3il6_1
3ilt_2
3ime_2
3img_3
3img_4
3imx_1
3in1_1
3in6_2
3in9_1
3ing_1
3inj_5
3inl_6
3inm_1
3inq_1
3inr_2
3int_1
3int_3
3inv_1
3inv_3
3inw_1
3inx_1
3iny_1
3iob_2
3ioc_1
3iod_2
3ioe_1
3iof_1
3iog_1
3iok_1
3iph_1
3ipk_1
3ipq_1
3ipt_1
3ipu_1
3ipy_1
3iqe_10
3iqe_5
3iqf_12
3iqg_1
3iqh_1
3iqi_1
3iqz_3
3irh_3
3irm_3
3irn_3
3iro_1
3irx_1
3is2_2
3is9_1
3ish_2
3isi_1
3isj_2
3iss_5
3itj_4
3itu

3oii_2
3oij_2
3oik_1
3oim_1
3oiu_1
3oiv_1
3oiw_1
3oix_1
3ojg_1
3oji_1
3ojl_2
3ojo_2
3ojw_1
3ojw_2
3ojx_1
3ojx_2
3ojx_3
3ok9_1
3okf_2
3okx_2
3ol5_1
3oll_2
3ols_2
3omu_2
3onw_1
3oog_1
3ooi_1
3oom_1
3op5_2
3opd_1
3opm_1
3opx_1
3oq1_1
3oq6_2
3oq6_3
3oqf_1
3oqk_2
3oqu_2
3orf_2
3orh_1
3ori_1
3ork_1
3orl_1
3orm_1
3orn_1
3oro_1
3orp_1
3ort_1
3orz_4
3os3_1
3os9_6
3osh_1
3otb_2
3otf_1
3otq_1
3otu_1
3otw_6
3otx_1
3ou2_1
3ou6_1
3ou7_2
3oum_1
3ov4_2
3ov7_2
3ova_1
3ovb_1
3ovm_1
3ovs_2
3ow3_1
3owa_2
3owh_1
3owj_1
3owk_1
3owl_1
3own_1
3own_2
3ows_2
3owu_1
3owx_2
3owy_1
3ox1_1
3ox2_1
3ox3_2
3ox4_1
3oxc_1
3oxf_2
3oxk_1
3oxv_2
3oxw_1
3oxx_2
3oy0_1
3oy4_1
3oyq_1
3oys_1
3oyz_1
3oz2_1
3ozb_2
3ozc_1
3ozd_2
3ozr_1
3ozs_1
3ozt_1
3ozu_1
3ozv_1
3ozw_1
3p0e_4
3p0f_1
3p0g_1
3p0h_2
3p0j_4
3p0p_1
3p0z_4
3p17_1
3p19_2
3p1d_1
3p23_1
3p2e_1
3p2h_1
3p2h_2
3p2k_2
3p2o_2
3p2v_1
3p3c_1
3p3e_1
3p3z_1
3p4t_1
3p5a_1
3p5k_1
3p5l_1
3p5p_1
3p5s_2
3p62_1
3p67_1
3p6m_1
3p6n_1
3p6o_1
3p6p_1
3p6t_1
3p6x_1
3p70_4
3p70_6
3p74_1
3p78_1

3tw6_6
3two_1
3twp_1
3tx1_1
3txz_1
3ty3_1
3tyb_2
3tyc_2
3tyd_1
3tye_1
3tyl_1
3tyl_2
3tym_2
3tym_3
3tyv_2
3tz3_2
3tza_1
3tzb_1
3tzl_1
3u04_1
3u0f_1
3u0z_2
3u10_1
3u11_1
3u1y_1
3u2c_1
3u2l_1
3u2m_1
3u2x_1
3u33_11
3u3o_2
3u4c_1
3u4l_1
3u4o_2
3u4r_2
3u4u_1
3u57_1
3u5s_1
3u5y_1
3u6w_2
3u72_1
3u78_1
3u7k_1
3u7l_1
3u81_1
3u88_1
3u8d_1
3u8f_1
3u8h_1
3u8j_1
3u8k_7
3u8l_10
3u8m_3
3u8n_9
3u8q_1
3u8w_1
3u9d_2
3u9e_2
3u9f_13
3u9n_1
3u9y_1
3ua1_1
3ua3_2
3ua5_2
3ua8_1
3uag_1
3uaw_1
3uax_1
3uay_1
3uaz_1
3ub5_1
3uba_1
3ubd_1
3ubm_4
3ucb_1
3uce_2
3ucl_1
3udj_1
3udk_1
3udn_1
3udq_1
3udz_2
3ue4_1
3ue6_4
3uel_1
3uf6_1
3uf7_1
3uf9_1
3ufg_1
3ufl_1
3ufn_1
3ufo_2
3ufo_3
3ufp_3
3ufp_4
3ufq_1
3ufq_2
3ufr_1
3ufr_2
3ufs_3
3ufs_4
3uft_2
3uft_3
3ufu_1
3ufu_2
3ufv_3
3ufv_4
3ufw_3
3ufw_4
3ufx_4
3ufy_1
3ug8_1
3ugc_1
3ugr_1
3uhl_1
3uhm_1
3uib_1
3uic_11
3uic_17
3uim_1
3uiv_1
3uix_1
3uj7_1
3uj8_1
3ujl_1
3uk6_7
3uka_3
3ukf_3
3ukf_8
3ukk_1
3ukk_4
3ukl_4
3uko_2
3ukr_1
3uku_1
3ule_1
3ule_3
3uli_1
3ulk_1
3um5_1
3um6_1
3um8_2
3

4bf1_1
4bf2_1
4bf6_1
4bf9_1
4bfd_1
4bfm_1
4bfp_2
4bfq_6
4bfr_1
4bfs_1
4bft_2
4bfu_1
4bfv_1
4bfw_1
4bfx_2
4bfy_2
4bfz_1
4bga_1
4bgb_2
4bge_1
4bgg_2
4bgh_1
4bgi_1
4bgi_5
4bgq_1
4bhw_1
4bhz_1
4bi0_1
4bi1_1
4bib_2
4bid_2
4bie_1
4bii_1
4bii_2
4bio_1
4bis_1
4biw_2
4bix_1
4biz_1
4bj8_13
4bj9_1
4bjb_1
4bjc_1
4bjk_4
4bjx_1
4bjz_1
4bk2_1
4bk3_1
4bkj_2
4bkq_1
4bkz_1
4bl1_1
4bl5_3
4blc_4
4blr_2
4bls_2
4blt_2
4blv_2
4blw_1
4bmo_1
4bmp_1
4bms_4
4bmv_2
4bmz_1
4bn1_1
4boe_1
4boy_2
4bpr_1
4bqg_1
4bqh_1
4bqj_1
4bqp_4
4bqp_7
4bqr_1
4bqs_2
4bqt_3
4bqu_2
4bqw_1
4bqy_1
4bqz_1
4br0_1
4br3_3
4br7_1
4brd_1
4brg_1
4bri_2
4brx_1
4bs4_1
4bti_1
4btj_1
4btj_2
4btk_1
4btm_1
4btt_1
4btu_2
4bu9_2
4bub_2
4bud_2
4bue_1
4buf_2
4bui_2
4bup_1
4bur_1
4bur_9
4bus_1
4but_2
4buu_2
4buv_2
4buw_1
4bux_1
4buy_2
4bv6_1
4bv9_2
4bva_2
4bvb_1
4bvb_2
4bvh_2
4bvh_3
4bvn_1
4bw1_1
4bw2_1
4bw3_1
4bw4_1
4bw9_1
4bwa_1
4bwx_2
4bx7_1
4bxk_1
4bxn_1
4bz7_3
4bz8_3
4bzb_1
4bzc_11
4bzn_1
4bzo_1
4bzq_2
4bzr_1
4c01_2
4c02_1
4c03_1
4c04_1
4c05_1
4c0b

4g6n_1
4g6o_1
4g73_1
4g73_3
4g74_1
4g77_1
4g7g_1
4g8b_1
4g8c_2
4g8j_3
4g8l_2
4g8z_1
4g95_1
4g9e_2
4g9k_2
4g9l_1
4ga3_1
4ga8_1
4gaa_2
4gab_1
4gae_1
4gah_1
4gap_1
4gap_2
4gav_2
4gb2_1
4gb9_1
4gbc_2
4gbd_1
4gbe_3
4gbi_2
4gbn_2
4gbr_1
4gc9_1
4gca_1
4gcm_2
4gcm_3
4gcx_1
4gd4_1
4gd9_4
4gda_1
4gdc_2
4gdc_7
4gdd_4
4gdd_6
4gde_1
4gdp_4
4gdy_2
4ge1_4
4ge4_1
4ge5_1
4ge7_1
4ge9_2
4geb_1
4gee_1
4gek_2
4gev_2
4gf5_17
4gf9_1
4gfg_1
4gfn_1
4gfo_1
4gfv_2
4gg1_1
4gg5_1
4gg7_1
4gg9_1
4ggl_1
4ggz_2
4gh1_1
4gh2_1
4gh3_1
4gh5_3
4gh6_2
4gi2_1
4gi4_1
4gi5_1
4gid_3
4gih_1
4gii_1
4giu_1
4giy_1
4gj2_1
4gj3_1
4gk2_1
4gk3_1
4gk4_1
4gkh_9
4gki_12
4gkm_1
4gkt_1
4gkv_1
4gl4_1
4glb_1
4gll_1
4glx_1
4gm0_1
4gm1_1
4gm4_1
4gm7_1
4gmg_3
4gmy_1
4gn6_1
4gn8_2
4gnc_2
4gnz_3
4go2_4
4goj_2
4gol_2
4gp6_1
4gpb_1
4gpj_1
4gqb_1
4gqe_2
4gqe_3
4gqi_2
4gqk_1
4gql_1
4gqr_1
4gqs_1
4gqt_2
4gr0_1
4gr1_1
4gr1_2
4gr3_1
4gr8_1
4gr9_1
4grb_1
4grk_1
4gs4_1
4gs8_2
4gsf_1
4gss_1
4gst_1
4gsu_2
4gsy_1
4gt3_1
4gt8_1
4gte_2
4gte_3
4gtl_3
4gtm_1
4gto

4l31_2
4l32_1
4l33_2
4l34_1
4l36_1
4l39_1
4l3j_1
4l3l_1
4l3q_1
4l42_1
4l43_1
4l44_1
4l45_1
4l49_1
4l4a_1
4l4b_1
4l4c_1
4l4d_1
4l4e_1
4l4f_1
4l4o_1
4l4s_2
4l4v_1
4l4x_1
4l52_1
4l5a_4
4l5b_2
4l5m_1
4l64_1
4l65_1
4l6g_2
4l6s_1
4l77_1
4l78_1
4l7f_1
4l7g_1
4l7h_1
4l7j_1
4l7s_2
4l7v_1
4l80_5
4l82_2
4l8g_1
4l8m_1
4l8u_1
4l8v_4
4l96_1
4l98_2
4l9i_2
4l9k_1
4l9q_1
4l9s_1
4l9w_1
4l9y_1
4l9z_6
4la0_2
4la1_1
4la3_1
4la7_1
4lay_1
4lb2_3
4lb9_1
4lbp_1
4lc7_1
4lcf_1
4lcg_1
4lch_1
4lcj_7
4lcn_1
4lco_1
4lcw_2
4ld2_1
4lde_1
4ldj_1
4ldk_1
4ldl_1
4lez_1
4lfg_4
4lfi_1
4lfv_1
4lg5_1
4lga_1
4lgg_1
4lgh_2
4lh7_1
4lht_2
4lhv_1
4lhw_1
4li6_1
4li7_2
4li8_1
4lii_1
4lis_1
4lis_2
4lj3_1
4lj3_2
4lj5_1
4lj6_1
4lj7_1
4lj8_1
4lj9_1
4lja_1
4lk3_9
4llk_1
4lm0_1
4lm4_1
4lm5_1
4lmc_1
4lmn_1
4lmu_1
4lna_1
4lnb_1
4lng_1
4lnk_6
4loc_1
4loc_2
4loh_1
4loi_1
4lol_1
4loo_1
4lop_1
4loq_1
4loy_1
4lpb_1
4lpk_2
4lr6_1
4lrg_1
4lrh_4
4lrj_2
4lrl_5
4lrl_6
4lrr_1
4lrw_2
4lrz_1
4lsa_1
4lsj_1
4lsl_1
4lsm_2
4lt6_2
4lts_2
4ltz_1
4lu3_1
4luc_2

4uux_2
4uwm_2
4ux6_1
4ux9_2
4uxj_6
4uxq_1
4uxx_1
4uy5_1
4uy6_1
4uya_1
4uyf_3
4uyg_3
4uyl_1
4uym_1
4uz6_2
4uzi_1
4uzq_1
4v01_2
4v02_1
4v06_1
4v0s_1
4v12_1
4v26_1
4v2g_1
4v2g_2
4v37_2
4v3i_1
4w5j_2
4w5k_1
4w5r_3
4w6z_2
4w9n_1
4w9n_6
4w9w_1
4wa7_1
4wa9_1
4was_1
4wb6_1
4wb9_1
4wbb_1
4wbd_1
4wbn_1
4wbo_4
4wct_2
4wcx_1
4wda_1
4wdb_1
4wdf_1
4wdg_1
4wec_2
4weq_1
4wev_1
4wf0_1
4wfr_1
4wh2_1
4wh3_1
4wji_1
4wkb_1
4wkq_1
4wlj_1
4wlo_3
4wlu_3
4wlv_2
4wm7_1
4wmz_1
4wo5_2
4wou_1
4wpu_2
4wq0_1
4wq4_1
4wq5_2
4wqm_1
4wri_1
4wrk_6
4ws0_1
4wso_2
4wsq_1
4wuc_1
4wud_1
4wuj_1
4wuo_2
4wvd_1
4ww0_2
4ww4_1
4ww9_1
4wx1_4
4wx2_2
4wxf_2
4wxg_2
4wxl_3
4wxx_1
4wzh_2
4wzy_1
4x0o_5
4x24_2
4x28_2
4x2d_1
4x2q_2
4x30_1
4x3l_2
4x3m_1
4x3q_1
4x3u_1
4x4j_1
4x4l_1
4x5d_1
4x5f_1
4x5g_2
4x5h_1
4x5i_1
4x5j_1
4x5s_2
4x6r_1
4x7u_2
4x7v_2
4x7w_2
4x7y_2
4x7z_2
4x81_1
4x86_1
4x8l_2
4x8o_1
4x9d_3
4x9m_1
4x9n_1
4xb1_1
4xb2_1
4xba_1
4xba_2
4xbf_1
4xbj_2
4xbo_1
4xc0_1
4xcj_1
4xcl_1
4xcv_1
4xcx_1
4xcz_1
4xcz_2
4xd1_2
4xd2_2
4xdm_1
4xe3_1

In [None]:
# Archive the files generated above and store them in a safe place
# DON'T USE

def archive_dir(folder, pattern, name):
    """Guard against overwriting an existing archive of ``folder``.

    Only the overwrite check is active. The shell command that would build
    the archive is commented out below this function, so calling this
    currently creates nothing.

    Args:
        folder: Path of the directory to archive. An existing archive is
            looked for in its parent directory.
        pattern: Filename pattern meant for the disabled ``find`` command;
            not used by the active code.
        name: Filename of the archive, e.g. "raw.tar.zst".

    Returns:
        None. Returns early when the archive already exists and the user
        declines to overwrite it (a reply starting with "n", or an empty
        reply).
    """
    parent_dir = os.path.dirname(folder)
    # Reduced to the directory name so the disabled shell command can refer to
    # it relative to parent_dir.
    folder = os.path.basename(folder)
    if os.path.exists(os.path.join(parent_dir, name)):
        print("Warning:", name, "already exists in", parent_dir)
        answer = input("Do you want to overwrite existing file? (y/n): ")
        answer = answer.strip().lower()
        # An empty reply (just pressing Enter) used to raise IndexError through
        # inp[0]; treat it as "no" so nothing is overwritten by accident.
        if not answer or answer.startswith("n"):
            return
    # Using only a single ! command since multiple ! spawn different bash shells

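# For some reason, the command below duplicates the contents of the tar file.
# NOTE(review): possibly because find also lists directories, which tar then archives recursively in addition to each file; restricting find to "-type f" or passing --no-recursion to tar may fix it (unverified).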
#     !cd $parent_dir; find $folder -name "$pattern" | tar --sort=name -I zstd -cf $name -T -; rsync -avP $name crvineeth97@ada:/share2/crvineeth97/compressed/scPDB; cd -
# To untar, use
# !tar -I zstd -xf $name


# archive_dir(raw_dir, "*", "raw.tar.zst")
# archive_dir(preprocessed_dir, "*.npy", "features_labels.tar.zst")
# archive_dir(preprocessed_dir, "*.npz", "preprocessed_chains.tar.zst")
# archive_dir(pssm_dir, "*", "pssm.tar.zst")

# ISSUES

<input type="checkbox"> Improve the downloading of PDB and FASTA files by using a tool that can resume interrupted downloads

<input type="checkbox"> Not sure whether checking for a space in the residue is the best way of checking. An alternative is the code_with_modified_residues dictionary from NWalign.py (https://zhanglab.ccmb.med.umich.edu/NW-align/NWalign.py). Initial note: should be OK. Update: not OK, it's missing some RNA sequences that start with an "X"

<input type="checkbox"> Make the pdb_id field in the preprocessed files into pdb_id_struct

