## Clustal W2 Alignment + MAli v1.31 Refinement

NOTE: This notebook assumes that ```01b - ClustalW2.ipynb``` has already been run to completion.

#### Imports

In [1]:
import os
import shutil
import subprocess
import time
import pandas as pd

In [2]:
from wrapped_scorer import WrappedScorer

In [3]:
# label for this two-stage strategy: written to the "aligner" column of every
# record and embedded in the results CSV filename
STRATEGY_NAME = "ClustalW2-MAli-Refine"

#### Clustal W2

In [4]:
# folder the refinement stage reads its starting alignments from
# (presumably written by 01b - ClustalW2.ipynb - confirm before running)
PROGRESSIVE_OUTPUT_FOLDER = "aligners/ClustalW2/output"

#### MAli v1.31

In [5]:
# settings used to build the MAli command line in perform_refinement_and_record_time
SEED = 9032025  # passed to MAli as -seed
ITERATIONS = 200  # passed to MAli as -iterations
ITERATIVE_ALIGNER_NAME = "MAli-v1.31"  # not referenced by the other cells shown in this notebook
ITERATIVE_ALIGNER_PATH = "aligners/MAli-v1.31/MAli.exe"  # executable launched for each testcase
ITERATIVE_OUTPUT_FOLDER = "refinement/MAli-v1.31/output"  # base folder for the -output paths

In [6]:
# creating empty output folder: anything left from a previous run is deleted first,
# so the scoring step can only ever read alignments produced by this run
if os.path.exists(ITERATIVE_OUTPUT_FOLDER):
    shutil.rmtree(ITERATIVE_OUTPUT_FOLDER)
os.makedirs(ITERATIVE_OUTPUT_FOLDER)

#### BALIS-1

In [7]:
# dataset being benchmarked
DATASET_NAME = "BALIS-1"  # written to the "dataset" column and the results filename
DATASET_FOLDER = "datasets/BALIS-1"
INPUT_FOLDER = f"{DATASET_FOLDER}/in"  # listed to discover the testcase names
REFERENCES_FOLDER = f"{DATASET_FOLDER}/ref"  # reference alignments handed to the scorer

In [8]:
# checking that testcases have been found
# - sorted(): os.listdir returns entries in arbitrary order, and the order of the
#   result records should not depend on the filesystem
# - .dnd files are ClustalW2 guide trees, not testcases, so they are skipped
testcases = sorted(
    entry for entry in os.listdir(INPUT_FOLDER) if not entry.endswith(".dnd")
)
# fail here, with a clear message, rather than silently writing a header-only CSV
assert testcases, f"no testcases found in {INPUT_FOLDER}"
print(testcases[:10])

['BB11004', 'BB11005', 'BB11008', 'BB11009', 'BB11011', 'BB11014', 'BB11015', 'BB11016', 'BB11018', 'BB11019']


#### QScore

In [9]:
# location of the QScore executable that WrappedScorer is constructed with
SCORER_PATH = "scorers/QScore/qscore.exe"

In [10]:
# shared scorer instance; used below through scorer.score_testcase(test_path, reference_path)
scorer = WrappedScorer(SCORER_PATH)

#### Performing 2-Stage Alignment

In [11]:
def perform_refinement_and_record_time(filename):
    """Refine one progressive alignment with MAli and time the run.

    The alignment stored under PROGRESSIVE_OUTPUT_FOLDER for this testcase is
    passed to the MAli executable together with the -refine flag, and MAli is
    given an output path under ITERATIVE_OUTPUT_FOLDER with the same name.

    Args:
        filename: Name of the testcase, e.g. "BB11004".

    Returns:
        int: Wall-clock duration of the MAli process, rounded to whole
        milliseconds.

    Raises:
        subprocess.CalledProcessError: If MAli exits with a non-zero status,
            so that a failed run is never timed and recorded as a result.
    """
    prog_output_path = f"{PROGRESSIVE_OUTPUT_FOLDER}/{filename}"
    iter_output_path = f"{ITERATIVE_OUTPUT_FOLDER}/{filename}"

    # An argument list (instead of one command string) is passed through without
    # any manual splitting or quoting: it works on every platform and keeps
    # paths that contain spaces intact.
    iter_command = [
        ITERATIVE_ALIGNER_PATH,
        "-input", prog_output_path,
        "-output", iter_output_path,
        "-seed", str(SEED),
        "-iterations", str(ITERATIONS),
        "-refine",
    ]

    # only the aligner process itself is inside the timed region
    start_time = time.perf_counter()
    subprocess.run(iter_command, check=True)
    end_time = time.perf_counter()

    time_in_milliseconds = (end_time - start_time) * 1000

    return int(round(time_in_milliseconds))

In [12]:
def score_quality_of_produced_alignment(filename):
    """Score the refined alignment of one testcase against its reference.

    Args:
        filename: Name of the testcase; the produced alignment is looked up as
            "<filename>.faa" in ITERATIVE_OUTPUT_FOLDER and the reference as
            "<filename>" in REFERENCES_FOLDER.

    Returns:
        Whatever scorer.score_testcase returns for that pair of paths.
    """
    produced_alignment = f"{ITERATIVE_OUTPUT_FOLDER}/{filename}.faa"
    reference_alignment = f"{REFERENCES_FOLDER}/{filename}"
    return scorer.score_testcase(produced_alignment, reference_alignment)

In [13]:
def record_performance_on_testcase(filename):
    """Run refinement and scoring for one testcase and format a CSV record.

    Args:
        filename: Name of the testcase to benchmark.

    Returns:
        str: One comma-separated line in the column order
        aligner, dataset, testcase, Q_score, time_elapsed_ms.
    """
    # Both stages must operate on the `filename` argument. Reading a notebook-level
    # loop variable instead would benchmark whichever testcase that variable
    # happens to hold, or raise NameError when none is defined.
    time_taken = perform_refinement_and_record_time(filename)
    score = score_quality_of_produced_alignment(filename)

    return f"{STRATEGY_NAME},{DATASET_NAME},{filename},{score},{time_taken}"
    

In [14]:
# column order shared by every record; the header line is the first entry of RECORDS
HEADER = "aligner,dataset,testcase,Q_score,time_elapsed_ms"
RECORDS = [HEADER]

In [15]:
# benchmark every testcase in turn; each record is stored for the CSV and echoed
# straight away so progress is visible while the run is in flight
for testcase in testcases:
    record = record_performance_on_testcase(testcase)
    RECORDS.append(record)
    print(record)

ClustalW2-MAli-Refine,BALIS-1,BB11004,0.188,940
ClustalW2-MAli-Refine,BALIS-1,BB11005,0.4,1381
ClustalW2-MAli-Refine,BALIS-1,BB11008,0.643,1056
ClustalW2-MAli-Refine,BALIS-1,BB11009,0.339,884
ClustalW2-MAli-Refine,BALIS-1,BB11011,0.357,709
ClustalW2-MAli-Refine,BALIS-1,BB11014,0.818,1149
ClustalW2-MAli-Refine,BALIS-1,BB11015,0.685,888
ClustalW2-MAli-Refine,BALIS-1,BB11016,0.515,1204
ClustalW2-MAli-Refine,BALIS-1,BB11018,0.59,1838
ClustalW2-MAli-Refine,BALIS-1,BB11019,0.587,944
ClustalW2-MAli-Refine,BALIS-1,BB11021,0.35,396
ClustalW2-MAli-Refine,BALIS-1,BB11022,0.212,670
ClustalW2-MAli-Refine,BALIS-1,BB11025,0.237,342
ClustalW2-MAli-Refine,BALIS-1,BB11027,0.341,986
ClustalW2-MAli-Refine,BALIS-1,BB11029,0.503,412
ClustalW2-MAli-Refine,BALIS-1,BB11031,0.335,1154
ClustalW2-MAli-Refine,BALIS-1,BB11033,0.494,803
ClustalW2-MAli-Refine,BALIS-1,BB11038,0.554,1142
ClustalW2-MAli-Refine,BALIS-1,BB12005,0.985,587
ClustalW2-MAli-Refine,BALIS-1,BB12013,0.962,1236
ClustalW2-MAli-Refine,BALIS-1,BB1201

In [16]:
# sanity check: show the header line and the most recently added record
print(RECORDS[0], RECORDS[-1], sep="\n")

aligner,dataset,testcase,Q_score,time_elapsed_ms
ClustalW2-MAli-Refine,BALIS-1,BB50014,0.922,968


#### Cleaning Up

ClustalW2 leaves `.dnd` guide tree files next to the input sequences; the cell below deletes any that remain in the dataset's input folder.

In [17]:
# delete every .dnd file found directly inside the input folder, counting as we go
count_removed = 0
for entry in os.listdir(INPUT_FOLDER):
    if not entry.endswith(".dnd"):
        continue
    os.remove(os.path.join(INPUT_FOLDER, entry))
    count_removed += 1

In [18]:
# report how many .dnd files the cleanup loop deleted
print(f"Cleaned {count_removed} guide tree files left in dataset folder.")

Cleaned 0 guide tree files left in dataset folder.


#### Writing Records to CSV

In [19]:
# the results file is named after the strategy and the dataset and lives under results/
RESULTS_FILENAME = f"sbench_{STRATEGY_NAME}_on_{DATASET_NAME}.csv"
RESULTS_FILEPATH = f"results/{RESULTS_FILENAME}"

In [20]:
# open() does not create missing directories, so make sure the results folder
# exists before writing (a fresh checkout may not have it yet)
results_folder = os.path.dirname(RESULTS_FILEPATH)
if results_folder:
    os.makedirs(results_folder, exist_ok=True)

with open(RESULTS_FILEPATH, "w") as file:
    # one record per line, header first
    file.writelines(f"{record}\n" for record in RECORDS)

In [21]:
# confirm where the CSV was written
print(f"Results written to: {RESULTS_FILEPATH}")

Results written to: results/sbench_ClustalW2-MAli-Refine_on_BALIS-1.csv


#### Previewing Results

In [22]:
# read the CSV back from disk to confirm it parses, then display its first rows
df = pd.read_csv(RESULTS_FILEPATH)
df.head()

Unnamed: 0,aligner,dataset,testcase,Q_score,time_elapsed_ms
0,ClustalW2-MAli-Refine,BALIS-1,BB11004,0.188,940
1,ClustalW2-MAli-Refine,BALIS-1,BB11005,0.4,1381
2,ClustalW2-MAli-Refine,BALIS-1,BB11008,0.643,1056
3,ClustalW2-MAli-Refine,BALIS-1,BB11009,0.339,884
4,ClustalW2-MAli-Refine,BALIS-1,BB11011,0.357,709
