## MSA

In [3]:
from Bio import AlignIO
from pathlib import Path
import subprocess
from Bio import SeqIO
import os
import re
import json
from Bio.PDB.PDBParser import PDBParser

In [5]:
# Residue-name lookup tables for the 20 standard amino acids.
# d:      three-letter code -> one-letter code
d = {
    'CYS': 'C',
    'ASP': 'D',
    'SER': 'S',
    'GLN': 'Q',
    'LYS': 'K',
    'ILE': 'I',
    'PRO': 'P',
    'THR': 'T',
    'PHE': 'F',
    'ASN': 'N',
    'GLY': 'G',
    'HIS': 'H',
    'LEU': 'L',
    'ARG': 'R',
    'TRP': 'W',
    'ALA': 'A',
    'VAL': 'V',
    'GLU': 'E',
    'TYR': 'Y',
    'MET': 'M',
}
# d_flip: one-letter code -> three-letter code (inverse of d)
d_flip = dict(zip(d.values(), d.keys()))

In [23]:
# https://github.com/choderalab/asapdiscovery/blob/seq-alignment/asapdiscovery-seqalign/align_code.ipynb
def multi_seq_alignment(input_file, output_file):
    """Align the sequences in *input_file* with MAFFT and load the result.

    Parameters
    ----------
    input_file : str or Path
        FASTA file with the unaligned sequences.
    output_file : str or Path
        Destination for the aligned FASTA written by MAFFT (overwritten).

    Returns
    -------
    tuple
        ``(align, output_file)`` where ``align`` is the alignment read back
        from *output_file* with ``AlignIO.read(..., "fasta")``.

    Raises
    ------
    subprocess.CalledProcessError
        If MAFFT exits with a non-zero status.
    """
    # Pass the arguments as a list (no shell) so paths containing spaces or
    # shell metacharacters are handled correctly, and redirect stdout to the
    # output file ourselves instead of relying on shell redirection.
    with open(output_file, "w") as out_handle:
        # check=True: fail here rather than later on an empty/partial file.
        subprocess.run(["mafft", str(input_file)], stdout=out_handle, check=True)

    # Read the alignment that MAFFT just wrote
    align = AlignIO.read(output_file, "fasta")
    return align, output_file
    
# Take the aligned sequences from MAFFT and give every residue the number of the
# reference column it is aligned to
def get_new_renumber(align):
    """Renumber every aligned sequence according to the reference sequence.

    The first record of *align* is the reference. Its alignment columns are
    numbered ``1..len(ref)`` (the first column is assumed to be residue #1).
    NOTE: the numbering follows alignment columns, so it equals the reference
    residue numbering only when the reference row contains no gaps.

    Every other record receives, for each non-gap position, the number of the
    reference column it is aligned to.

    Side effects: ``letter_annotations["resnum"]`` is set on every record
    (gap positions of non-reference records hold ``None``).

    Parameters
    ----------
    align : sequence of records
        Aligned records; each must support ``len()``, integer indexing and
        expose ``id`` and ``letter_annotations``.

    Returns
    -------
    dict
        Record id -> list of residue numbers (gaps omitted).
    """
    res_num = dict()

    # Reference is the first sequence; number its columns starting from 1
    ref = align[0]
    ref_numbers = range(1, len(ref) + 1)
    ref.letter_annotations["resnum"] = ref_numbers
    res_num[ref.id] = list(ref_numbers)

    # Map all other sequences back onto the reference numbering
    for rec in align[1:]:
        # One slot per position; gap positions stay None
        per_position = [None] * len(rec)
        for column, number in enumerate(ref_numbers):
            if rec[column] != '-':
                per_position[column] = number
        rec.letter_annotations["resnum"] = per_position
        # Keep only the numbers of real residues for the returned mapping
        res_num[rec.id] = [number for number in per_position if number is not None]

    # Dictionary with sequence id as the key and the residue numbers in a list as the value
    return res_num

# Build a lookup from (residue code, sequence number, chain) to the aligned residue number
def triple_aa_renumber(seq, dic, seq_list, chain, align_list):
    """Map each residue of *seq* to its number from the alignment.

    Parameters
    ----------
    seq : str
        Sequence whose characters are looked up in *dic*.
    dic : dict
        Table keyed by the characters of *seq* (e.g. one-letter code ->
        three-letter code). Characters missing from it are recorded as 'XXX'.
    seq_list : sequence
        Residue number of each position within the sequence itself.
    chain : hashable
        Chain identifier stored in every key.
    align_list : sequence
        Residue number assigned to each position by the MSA renumbering.

    Returns
    -------
    dict
        ``(code, seq_list[i], chain) -> align_list[i]`` for every position i.
    """
    return {
        (dic.get(letter, 'XXX'), seq_list[pos], chain): align_list[pos]
        for pos, letter in enumerate(seq)
    }

# Convert a three-letter amino-acid code to its one-letter code
def trip_to_single_aa(trip_aa):
    """Return the one-letter code stored for *trip_aa* in the module-level table ``d``.

    Raises KeyError when *trip_aa* is not a key of ``d``.
    """
    return d[trip_aa]

# Collect the residue ids of a PDB file, keyed by their running position in the file
def get_pdb_ids(pdb_file):
    """Index every residue of *pdb_file* by a running counter starting at 1.

    The counter runs on across all chains and all models of the parsed
    structure, and no residues are filtered out.

    Parameters
    ----------
    pdb_file : str or Path
        PDB file handed to ``PDBParser().get_structure``.

    Returns
    -------
    dict
        ``position -> (resname, resnum, chain_id)`` where ``resnum`` and
        ``chain_id`` are taken from the Bio.PDB full ids.
    """
    parsed = PDBParser().get_structure('prot', pdb_file)
    # Lazily walk model -> chain -> residue in file order
    residue_ids = (
        (residue.resname, residue.get_full_id()[3][1], chain.get_full_id()[2])
        for model in parsed
        for chain in model
        for residue in chain.get_residues()
    )
    return dict(enumerate(residue_ids, start=1))
    
# Collect the residue ids of a PDB file, keyed by their canonical (alignment) number
def get_canon_ids(pdb_file, canon_nums):
    """Index every residue of *pdb_file* by the matching entry of *canon_nums*.

    Residues are visited model by model and chain by chain; the n-th residue
    visited (counting from 0) is stored under ``canon_nums[n]``. No residues
    are filtered out, so *canon_nums* needs one entry per residue in the file;
    an IndexError is raised if it is shorter.

    Parameters
    ----------
    pdb_file : str or Path
        PDB file handed to ``PDBParser().get_structure``.
    canon_nums : sequence
        Canonical residue numbers, in the same order as the residues.

    Returns
    -------
    dict
        ``canonical number -> (resname, resnum, chain_id)``.
    """
    parsed = PDBParser().get_structure('prot', pdb_file)
    # Lazily walk model -> chain -> residue in file order
    residue_ids = (
        (residue.resname, residue.get_full_id()[3][1], chain.get_full_id()[2])
        for model in parsed
        for chain in model
        for residue in chain.get_residues()
    )
    return {
        canon_nums[position]: residue_id
        for position, residue_id in enumerate(residue_ids)
    }

## Pandas dataframe made with the indexes as the canonical number of the residue
## The columns are the number of residue within the sequence and the pdb residue id

In [29]:
# Run the MAFFT multiple sequence alignment on the collected sequences
local_path = Path('/home/pengs/fold_zika')
msa_input = local_path/"clear_blank_seqs_found.txt"
msa_output = local_path/"output.fasta"
align, out_file = multi_seq_alignment(msa_input, msa_output)

# Renumber every aligned sequence against the reference (first record) to get
# the canonical residue numbers
align_num = get_new_renumber(align)


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: -1 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
    1 / 8
done.

Constructing a UPGMA tree (efffree=0) ... 
    0 / 8
done.

Progressive alignment 1/2... 
STEP     7 / 7 
done.

Making a distance matrix from msa.. 
    0 / 8
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 8
done.

Progressive alignment 2/2... 
STEP     7 / 7 
done.

disttbfast (aa) Version 7.525
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous versions.
To disable this change, add the --leavegappyregion option.


In [14]:
# Directory searched for the per-protein PDB files (opened below as prot_dir/<name>.pdb)
prot_dir = Path('/home/pengs/fold_zika/only_prot')

In [24]:
# Map residue id to canonical number:
# msa_map[name][canonical number] = (resname, resnum, chain_id)
msa_map = dict()
# Iterate the renumbering result itself so this cell depends only on align_num
# (not on a dictionary that is built in a later cell).
for seq_id, canon_nums in align_num.items():
    # Sequence ids look like 'ref|<name>|'; <name> is also the PDB file stem
    match = re.search(r'ref\|(.*)\|', seq_id)
    if match is None:
        raise ValueError(f"Sequence id {seq_id!r} is not of the form 'ref|<name>|'")
    name = match.group(1)
    # Get the pdb ids of each protein, keyed by canonical residue number
    prot_file = name + '.pdb'
    msa_map[name] = get_canon_ids(prot_dir/prot_file, canon_nums)



In [28]:
# Persist the canonical-number map as JSON.
# NOTE: JSON object keys are always strings and tuples are stored as arrays,
# so the data comes back with string keys and list values when reloaded.
with open('/home/pengs/fold_zika/oddt_dock/msa_map.json', 'w') as json_out:
    json_out.write(json.dumps(msa_map))

## Generate sequences to put into Colabfold

In [12]:
# Read the aligned sequences (MAFFT output) to be put into colabfold
local_path = Path('/home/pengs/fold_zika')
align_file = local_path/"output.fasta"
# Use a context manager so the file handle is closed once parsing is done
with open(align_file) as handle:
    fasta_sequences = list(SeqIO.parse(handle, 'fasta'))

# Current residue numbers: 1..len(record) for each record as read from the
# aligned file (gap columns, if any, are counted as positions too)
current_num = dict()
# Record id -> sequence string exactly as it appears in the aligned file
current_seq = dict()

for fasta in fasta_sequences:
    current_num[fasta.id] = list(range(1, len(fasta) + 1))
    current_seq[fasta.id] = str(fasta.seq)

In [13]:
# Display the record id -> aligned sequence mapping
current_seq

{'ref|zikv_ns2b3|': 'DMYIERAGDITWEKDAEVTGNSPRLDVALDESGDFSLVEGETTDGVYRVMTRRLLGSTQVGVGVMQEGVFHTMWHVTKGAALRSGEGRLDPYWGDVKQDLVSYCGPWKLDAAWDGLSEVQLLAVPPGERAKNIQTLPGIFKTKDGDIGAVALDYPAGTSGSPILDKCGRVIGLYGNGVVIKNGSYVSAITQGKRE',
 'ref|NP_776018.1|': '--------------------------------------KGDTTTGVYRIMTRGLLGSYQAGAGVMVEGVFHTLWHTTKGAALMSGEGRLDPYWGSVKEDRLCYGGPWKLQHKWNGHDEVQMIVVEPGKNVKNVQTKPGVFKTPEGEIGAVTLDYPTGTSGSPIVDKNGDVIGLYGNGVIMPNGSYISAIVQGER-',
 'ref|YP_001527884.1|': '--------------------------------------KGDTTTGVYRIMTRGLLGSYQAGAGVMVEGVFHTLWHTTKGAALMSGEGRLDPYWGSVKEDRLCYGGPWKLQHKWNGQDEVQMIVVEPGKNVKNVQTKPGVFKTPEGEIGAVTLDFPTGTSGSPIVDKNGDVIGLYGNGVIMPNGSYISAIVQGER-',
 'ref|NP_740321.1|': '------------------------------------------SEGVYRIMQRGLFGKTQVGVGIHMEGVFHTMWHVTRGSVICHETGRLEPSWADVRNDMISYGGGWRLGDKWDKEEDVQVLAIEPGKNPKHVQTKPGLFKTLTGEIGAVTLDFKPGTSGSPIINRKGKVIGLYGNGVVTKSGDYVSAITQAER-',
 'ref|NP_722463.1|': '-------------------------------------------DGIYRILQRGLLGRSQVGVGVFQEGVFHTMWHVTRGAVLMYQGKRLEPSWASV

In [14]:
# Leading segment of the 'ref|zikv_ns2b3|' sequence, kept as a separate chain
# (presumably the NS2B part of the NS2B-NS3 construct — TODO confirm)
zikv_chainA = 'DMYIERAGDITWEKDAEVTGNSPRLDVALDESGDFSLVE'
def get_remaining_portion(front_portion, full_string):
    """Return the part of *full_string* that follows the prefix *front_portion*.

    Parameters
    ----------
    front_portion : str
        Expected prefix of *full_string*.
    full_string : str
        String to split.

    Returns
    -------
    str or None
        Everything after the prefix (possibly the empty string), or ``None``
        when *full_string* does not start with *front_portion*.
    """
    # A plain prefix test keeps the whole remainder, including any newlines;
    # a regex '(.*)' would stop at the first line break.
    if full_string.startswith(front_portion):
        return full_string[len(front_portion):]
    return None
# Rest of the zikv_ns2b3 sequence after the chain-A prefix
# (None if the sequence does not start with zikv_chainA)
zikv_chainB = get_remaining_portion(zikv_chainA, current_seq['ref|zikv_ns2b3|'])

In [None]:
# Collect (id template, sequence) pairs for the accession-style records
id_seqs = []
# Matches ids with a versioned accession only, e.g. 'ref|NP_776018.1|'
regex = r'ref\|(\w+\.\d+)\|'
for seq_id, sequence in current_seq.items():
    match = re.match(regex, seq_id)
    if match is None:
        # Ids without a versioned accession (e.g. 'ref|zikv_ns2b3|') do not
        # match the pattern; skip them instead of failing on match.group().
        # NOTE(review): assumes such records are handled separately — confirm.
        continue
    # NOTE(review): the "_{}" suffix looks like a placeholder to be filled in
    # later with str.format — confirm the intended value.
    id_seqs.append((match.group(1)+"_{}", sequence))
id_seqs

In [16]:
import csv
# Write the sequence table as CSV: a header row followed by one row for
# zikv_ns2b3 whose two chains are joined with ':'
file_path = "arborviruses.csv"

with open(file_path, mode='w', newline='') as csv_out:
    table = csv.writer(csv_out)
    table.writerows([
        ['id', 'sequence'],  # header
        ['zikv_ns2b3', f'{zikv_chainA}:{zikv_chainB}'],
    ])

### Would also like to have ns2b structures
1. Query the NS2B sequences of the same organisms.
2. Concatenate each NS2B sequence, as a separate chain, onto the NS3 chain of the same organism.
3. Run the combined sequences through MAFFT so they are aligned to a common length.
4. Write them to a CSV and run ColabFold.
5. Dock and evaluate.

## Active Residues ID 
### I don't think I can prep structures like this for docking (it seems docking would call protein-prep first, before actually docking with POSIT)

## I think I can do the prep and docking the same way as the cross-docking workflow, but prepare the protein and ligand in my own way (so I can prepare the protein without the …
### Convert the ligands from SDF to SMILES (.smi) for docking (could put all the ligands into one .smi file and dock that)
### Could provide multiple references by just passing in the original protein files? (It may be better to do this individually.)
### I really don't think trying to chuck this into cross-docking will work out well for me