In [1]:
# Package imports
import sys
import numpy as np
import pandas as pd
import os
from Bio import AlignIO

# Local imports
# Put the directory four levels above the working directory on sys.path so
# that the `src` package can be imported from this notebook.
current_dir = os.path.abspath('')  # '' resolves to the current working directory
project_root = os.path.abspath(os.path.join(current_dir, '../../../../'))
# Guard the append: re-running this cell must not stack duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)
from src.helper_functions import encoding_alignment, hmmaligning

In [2]:
# Run configuration: file stems and settings read by the later cells
fasta_name = 'seqs'                # stem of ../../../../Data/fasta/<fasta_name>.faa
hmm_model = 'GASP'                 # passed to hmmaligning; also part of the alignment filename
raw_encoding_name = 'blosum62amb'  # stem of ../../encodings/raw/<raw_encoding_name>.csv
threshold = 1                      # forwarded as hmmaligning(..., threshold=threshold)

In [3]:
# Resolve both paths once so the existence checks and the aligner call agree
fasta_path = f'../../../../Data/fasta/{fasta_name}.faa'
alignment_path = f'alignments/{fasta_name}_{hmm_model}.afa'

# Check if the fasta file exists.
# Raise instead of print + sys.exit(1): inside a notebook sys.exit only
# surfaces as a bare SystemExit, whereas FileNotFoundError carries the
# offending path in the traceback.
if not os.path.exists(fasta_path):
    raise FileNotFoundError(
        f'Error: {fasta_name}.faa does not exist (looked for {fasta_path})'
    )

# Check if alignment file exists; run the aligner only when it is missing
if not os.path.exists(alignment_path):
    print('Alignment does not exist. Running alignment...')
    hmmaligning([fasta_path], hmm_model, fasta_name, threshold=threshold)


Alignment does not exist. Running alignment...
Skipped 0 sequences due to too many gaps


In [5]:
# Load the alignment for this FASTA/HMM pair, parsed as FASTA
align = AlignIO.read(
    f'alignments/{fasta_name}_{hmm_model}.afa',
    "fasta",
)

# Load the raw encoding table; the first CSV column is used as the index
raw_encoding = pd.read_csv(
    f'../../encodings/raw/{raw_encoding_name}.csv',
    index_col=0,
)

In [6]:
# Encode the alignment with the raw encoding table (helper defaults)
full_encoding = encoding_alignment(align, raw_encoding)

# Second pass over the same inputs with remove_Cterm=True
# NOTE(review): presumably drops the C-terminal part of the alignment before
# encoding -- confirm against src.helper_functions.encoding_alignment.
nterm_encoding = encoding_alignment(
    align,
    raw_encoding,
    remove_Cterm=True,
)

In [7]:
# Both tables share one output stem; nterm_encoding gets an extra '_nterm' suffix.
# Files are written tab-separated.
outpath = f'../../encodings/{raw_encoding_name}_alignment'
full_encoding.to_csv(f'{outpath}.tsv', sep='\t')
nterm_encoding.to_csv(f'{outpath}_nterm.tsv', sep='\t')