In [None]:
# This part of the code is used for msa_generation
# An scPDB.tar.gz file is already present in the repo and it contains sequence.fasta and downloaded.pdb files for all the scPDB data points
# Let us extract that into data/scPDB
# Most of the code is taken from Deepmsa https://zhanglab.ccmb.med.umich.edu/DeepMSA/ with some small changes
!tar xvzf scPDB.tar.gz -C ./data/
# Download uniref50.fasta and uniclust30_2017_10 into data folder
!aria2c -c -x 8 -s 8 -d "./data/" ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz
!aria2c -c -x 8 -s 8 -d "./data/" http://wwwuser.gwdg.de/~compbiol/uniclust/2017_10/uniclust30_2017_10_hhsuite.tar.gz
!tar xvzf ./data/uniref50.fasta.gz -C ./data/
!tar xvzf ./data/uniclust30_2017_10_hhsuite.tar.gz -C ./data/

In [3]:
# Path constants shared by all the cells below
# ENSURE THAT YOU RUN THIS CODE CELL FIRST
import os

# Directory holding the split files (kept as a relative path)
splits_dir = "./splits"
# Absolute path of the data folder, and of the extracted scPDB dataset inside it
data_dir = os.path.abspath(os.path.join(".", "data"))
raw_dir = os.path.join(data_dir, "scPDB")

In [None]:
# We need to generate MSAs for the protein sequences in the dataset
# For that, we need to split the sequence.fasta file into respective <chain_id>.fasta files
# Also, we need to leave out the fasta files of DNA/RNA sequences


def _non_protein_chains(pdb_path):
    """Return the set of chain IDs whose SEQRES records look like DNA/RNA.

    Only the first contiguous run of SEQRES lines is inspected, and on each
    line only the first residue name (characters 19-21, 0-indexed) is checked.
    A residue name containing a blank is treated as a nucleotide, because
    DNA/RNA residues generally have 1- or 2-letter codes.

    A file without any SEQRES record yields an empty set (iterating over the
    file stops at EOF instead of spinning on empty reads).
    """
    excluded = set()
    in_seqres = False
    with open(pdb_path, "r") as pdb:
        for line in pdb:
            if line[:6] == "SEQRES":
                in_seqres = True
                if " " in line[19:22]:
                    excluded.add(line[11])
            elif in_seqres:
                # First non-SEQRES line after the SEQRES section: nothing more to read
                break
    return excluded


def _write_chain_fasta(out_dir, header, parts, skip_chains):
    """Write one record to ``<chain_id>.fasta`` unless its chain is in skip_chains.

    The chain ID is the character at index 6 of the header line
    (assumes headers shaped like ``>1abc:A...`` - TODO confirm against the data).
    """
    chain_id = header[6:7]
    if chain_id in skip_chains:
        return
    with open(os.path.join(out_dir, chain_id + ".fasta"), "w") as out:
        out.write(header)
        out.write("".join(parts) + "\n")


def _split_fasta_by_chain(fasta_path, out_dir, skip_chains):
    """Split a multi-record FASTA file into one single-line-sequence file per chain.

    Sequence lines belonging to a record are stripped and concatenated.
    Lines that appear before the first ``>`` header are ignored, and an empty
    input file produces no output files.
    """
    header = None
    parts = []
    with open(fasta_path, "r") as fasta:
        for line in fasta:
            if line.startswith(">"):
                if header is not None:
                    _write_chain_fasta(out_dir, header, parts, skip_chains)
                header = line
                parts = []
            elif header is not None:
                parts.append(line.strip())
    if header is not None:
        _write_chain_fasta(out_dir, header, parts, skip_chains)


for entry in sorted(os.listdir(raw_dir)):
    pre = os.path.join(raw_dir, entry)
    # Skip stray files (e.g. hidden files) that are not structure directories
    if not os.path.isdir(pre):
        continue

    # Read SEQRES entries in PDB file to determine whether a chain
    # has a protein sequence or not
    do_not_include = _non_protein_chains(os.path.join(pre, "downloaded.pdb"))
    _split_fasta_by_chain(os.path.join(pre, "sequence.fasta"), pre, do_not_include)

In [None]:
# In case you want to delete the generated fasta files from the above cell, uncomment and run this
# from glob import glob
# for file in sorted(os.listdir(raw_dir)):
#     for fasta in glob(os.path.join(raw_dir, file.strip(), "?.fasta")):
#         os.remove(fasta)

In [None]:
# The fasta files generated will have a lot of common sequences
# To speed up MSA generation, let us create a "unique" file in which every line
# lists all the chains that share one sequence
# Then we can generate the MSAs for only the first chain in every line
from collections import defaultdict

# Maps sequence -> list of "<structure>/<chain_id>*" entries, in sorted listing order
sequences = defaultdict(list)
for file in sorted(os.listdir(raw_dir)):
    pre = os.path.join(raw_dir, file)
    # Skip stray files that are not structure directories
    if not os.path.isdir(pre):
        continue
    for fasta in sorted(os.listdir(pre)):
        # Keep only the per-chain "<chain_id>.fasta" files (this skips sequence.fasta)
        if fasta[2:] != "fasta":
            continue
        chain_id = fasta[0]
        with open(os.path.join(pre, fasta)) as f:
            f.readline()  # header line is not needed here
            sequence = f.readline().strip()
            # This choice was made so that rsync would work much better and easier
            sequences[sequence].append(file + "/" + chain_id + "*")

keys = list(sequences.keys())

# The output directory may not exist yet on a fresh checkout
os.makedirs(splits_dir, exist_ok=True)

with open(os.path.join(splits_dir, "unique"), "w") as f:
    for key in keys:
        f.write(" ".join(sequences[key]) + "\n")

# Let us split the unique sequences into numbered files (0, 1, 2, ...) so that
# the MSAs can all be generated in parallel
# NOTE: despite its name, num_of_splits is the number of sequences written per file,
# not the number of files. The file count is kept as unique_ln // num_of_splits + 1,
# so the last file is empty when unique_ln is an exact multiple of num_of_splits.
# NOTE(review): this loop previously wrote into an undefined name `folder`;
# splits_dir is assumed to be the intended destination - confirm against pssm.sh
unique_ln = len(keys)
num_of_splits = 100
for i in range(0, unique_ln // num_of_splits + 1):
    with open(os.path.join(splits_dir, str(i)), "w") as f:
        # Slicing clamps at the end of the list, so no explicit upper bound is needed
        for key in keys[i * num_of_splits : (i + 1) * num_of_splits]:
            f.write(sequences[key][0] + "\n")

In [None]:
# Ensure that boost regex library has been installed
# MSAs take a long time to generate. pssm.sh and calculate_pssm.py were written assuming a SLURM Workload Manager on the cluster; adapt them to your own scheduler/environment if it differs
# Run pssm.sh