In [47]:
import pandas as pd
from Bio import AlignIO 
from augur.utils import write_json
from collections import defaultdict

In [55]:
mut_diffs = "../../../results/compare_cell_entry/mut_diffs.csv"
mut_diffs_df = pd.read_csv(mut_diffs)
mut_diffs_df.head()

Unnamed: 0,site,sequential_site,mutant,wildtype,region,293T_Mxra8,C636,293T_TIM1,293T_Mxra8 minus C636,293T_Mxra8 minus 293T_TIM1,C636 minus 293T_TIM1
0,-1(E3),1,I,M,E3,-7.541,-7.514,-7.502,0.0,0.0,0.0
1,-1(E3),1,M,M,E3,0.0,0.0,0.0,0.0,0.0,0.0
2,-1(E3),1,T,M,E3,-7.563,-7.541,-7.576,0.0,0.0,0.0
3,1(E3),2,S,S,E3,0.0,0.0,0.0,0.0,0.0,0.0
4,1(E3),2,W,S,E3,-1.375,-1.599,-1.122,0.224,-0.253,-0.477


In [56]:
alignment = "../../results/alignments/protein_no_outgroup.fa"
alignment_dict = dict()
for sequence in AlignIO.read("../../results/alignments/protein_no_outgroup.fa", "fasta"):
    alignment_dict[str(sequence.id)] = str(sequence.seq)
alignment_dict

{'S27-African-prototype_NC_004162': 'MEFIPTQTFYNRRYQPRPWTPRPTIQVIRPRPRPQRQAGQLAQLISAVNKLTMRAVPQQKPRKNRKNKKQKQKQQAPQNNTNQKKQPPKKKPAQKKKKPGRRERMCMKIENDCIFEVKHEGKVTGYACLVGDKVMKPAHVKGTIDNADLAKLAFKRSSKYDLECAQIPVHMKSDASKFTHEKPEGYYNWHHGAVQYSGGRFTIPTGAGKPGDSGRPIFDNKGRVVAIVLGGANEGARTALSVVTWNKDIVTKITPEGAEEWSLAIPVMCLLANTTFPCSQPPCIPCCYEKEPEETLRMLEDNVMRPGYYQLLQASLTCSPHRQRRSTKDNFNVYKATRPYLAHCPDCGEGHSCHSPVALERIRNEATDGTLKIQVSLQIGIGTDDSHDWTKLRYMDNHIPADAGRAGLFVRTSAPCTITGTMGHFILARCPKGETLTVGFTDSRKISHSCTHPFHHDPPVIGREKFHSRPQHGKELPCSTYVQSNAATAEEIEVHMPPDTPDRTLLSQQSGNVKITVNSQTVRYKCNCGGSNEGLITTDKVINNCKVDQCHAAVTNHKKWQYNSPLVPRNAELGDRKGKIHIPFPLANVTCMVPKARNPTVTYGKNQVIMLLYPDHPTLLSYRSMGEEPNYQEEWVTHKKEVVLTVPTEGLEVTWGNNEPYKYWPQLSANGTAHGHPHEIILYYYELYPTMTVVVVSVASFILLSMVGMAVGMCMCARRRCITPYELTPGATVPFLLSLICCIRTAKAATYQEAAVYLWNEQQPLFWLQALIPLAALIVLCNCLRLLPCCCKTLAFLAVMSIGAHTVSAYEHVTVIPNTVGVPYKTLVNRPGYSPMVLEMELLSVTLEPTLSLDYITCEYKTVIPSPYVKCCGTAECKDKNLPDYSCKVFTGVYPFMWGGAYCFCDAENTQLSEAHVEKSESCKTEFASAYRAHTASASAKLRVLYQGNNITVTAYANGDHAV

In [57]:
reference = "181-25_MW473668"
assert reference in alignment_dict, f"{reference} not in alignment_dict"
reference_seq = alignment_dict[reference]
reference_seq

'MEFIPTQTFYNRRYQPRPWTPRPTIQVIRPRPRPQRKAGQLAQLISAVNKLTMRAVPQQKPRKNRKNKKQKQKQQAPRNNMNQKKQPPKKKPAQKKKKPGRRERMCMKIENDCIFEVKHEGKVTGYACLVGDKVMKPAHVKGTIDNADLAKLAFKRSSKYDLECAQIPVHMKSDASKFTHEKPEGYYNWHHGAVQYSGGRFTIPTGAGKPGDSGRPIFDNKGRVVAIVLGGANEGARTALSVVTWNKDIVTKITPEGAEEWSLAIPVMCLLANTTFPCSQPPCTPCCYEKEPEKTLRMLEDNVMSPGYYQLLQASLTCSPRRQRRSIKDNFNVYKAIRPYLAHCPDCGEGHSCHSPVALERIRNEATDGTLKIQVSLQIGIKTDDSHDWTKLRYMDNHMPADAERARLFVRTSAPCTITGTMGHFILARCPKGETLTVGFTDGRKISHSCTHPFHHDPPVIGREKFHSRPQHGRELPCSTYAQSTAATAEEIEVHMPPDTPDRTLMSQQSGNVKITVNSQTVRYKCNCGDSNEGLTTTDKVINNCKVDQCHAAVTNHKKWQYNSPLVPRNAELGDRKGKVHIPFPLANVTCRVPKARNPTVTYGKNQVIMLLYPDHPTLLSYRNMGEEPNYQEEWVTHKKEIRLTVPTEGLEVTWGNNEPYKYWPQLSTNGTAHGHPHEIILYYYELYPTMTVVVVSVASFVLLSMVGVAVGMCMCARRRCITPYELTPGATVPFLLSLICCIRTAKAATYQEAAVYLWNEQQPLFWLQALIPLAALIVLCNCLRLLPCFCKTLTFLAVMSVGAHTVSAYEHVTVIPNTVGVPYKTLVNRPGYSPMVLEMELLSVTLEPTLSLDYITCEYKTVIPSPYVKCCGTAECKDKSLPDYSCKVFTGVYPFMWGGAYCFCDTENTQLSEAHVEKSESCKTEFASAYRAHTASASAKLRVLYQGNNVTVSAYANGDHAVTVKDAKFIVGPMSSAWTPFDNKIVVYKGDVYNMDYP

In [58]:
mutant_dict = dict()
for id, seq in alignment_dict.items():
    # Check that this is a valid alignment
    if len(seq) != len(reference_seq):
        raise ValueError(f"Length of {seq} does not match length of reference sequence {reference_seq}, alignment is corrupted")
    # Find mutations relative to the reference
    mutations = [(ref, i, alt) for i, (ref, alt) in enumerate(zip(reference_seq, seq)) if ref != alt]
    mutant_dict[id] = mutations
mutant_dict

{'S27-African-prototype_NC_004162': [('K', 36, 'Q'),
  ('R', 77, 'Q'),
  ('M', 80, 'T'),
  ('T', 283, 'I'),
  ('K', 293, 'E'),
  ('S', 304, 'R'),
  ('R', 320, 'H'),
  ('I', 326, 'T'),
  ('I', 336, 'T'),
  ('K', 381, 'G'),
  ('M', 398, 'I'),
  ('E', 403, 'G'),
  ('R', 406, 'G'),
  ('G', 442, 'S'),
  ('R', 473, 'K'),
  ('A', 481, 'V'),
  ('T', 484, 'N'),
  ('M', 505, 'L'),
  ('D', 529, 'G'),
  ('T', 535, 'I'),
  ('V', 579, 'I'),
  ('R', 591, 'M'),
  ('N', 623, 'S'),
  ('I', 641, 'V'),
  ('R', 642, 'V'),
  ('T', 668, 'A'),
  ('V', 701, 'I'),
  ('V', 708, 'M'),
  ('F', 789, 'C'),
  ('T', 794, 'A'),
  ('V', 801, 'I'),
  ('S', 880, 'N'),
  ('T', 906, 'A'),
  ('V', 950, 'I'),
  ('S', 953, 'T'),
  ('E', 1019, 'K'),
  ('S', 1033, 'A'),
  ('A', 1130, 'V'),
  ('V', 1212, 'A')],
 'SIMI-057_PV066168': [('P', 22, 'S'),
  ('V', 26, 'I'),
  ('K', 36, 'Q'),
  ('K', 62, 'R'),
  ('K', 72, 'R'),
  ('R', 77, 'Q'),
  ('M', 80, 'T'),
  ('K', 293, 'E'),
  ('S', 304, 'R'),
  ('R', 320, 'H'),
  ('I', 326, 'T'),

In [59]:
# Find the index in the reference sequence where the data starts
data_start =  1 
data_seq = "".join(mut_diffs_df[mut_diffs_df['mutant'] == mut_diffs_df['wildtype']].reset_index(drop=True).wildtype)
assert data_seq[data_start:] in reference_seq, "Reference sequence does not contain the data sequence."
reference_start = reference_seq.find(data_seq[data_start:])

# Update the mutant_dict to match the data positions
data_mutant_dict = dict()
for id, mutations in mutant_dict.items():
    data_mutants = []
    for ref, pos, alt in mutations:
        if pos >= reference_start and alt != "-":
            data_mutants.append((ref, (((pos - reference_start) + 1) + data_start), alt))
    data_mutant_dict[id] = data_mutants
data_mutant_dict

{'S27-African-prototype_NC_004162': [('T', 24, 'I'),
  ('K', 34, 'E'),
  ('S', 45, 'R'),
  ('R', 61, 'H'),
  ('I', 67, 'T'),
  ('I', 77, 'T'),
  ('K', 122, 'G'),
  ('M', 139, 'I'),
  ('E', 144, 'G'),
  ('R', 147, 'G'),
  ('G', 183, 'S'),
  ('R', 214, 'K'),
  ('A', 222, 'V'),
  ('T', 225, 'N'),
  ('M', 246, 'L'),
  ('D', 270, 'G'),
  ('T', 276, 'I'),
  ('V', 320, 'I'),
  ('R', 332, 'M'),
  ('N', 364, 'S'),
  ('I', 382, 'V'),
  ('R', 383, 'V'),
  ('T', 409, 'A'),
  ('V', 442, 'I'),
  ('V', 449, 'M'),
  ('F', 530, 'C'),
  ('T', 535, 'A'),
  ('V', 542, 'I'),
  ('S', 621, 'N'),
  ('T', 647, 'A'),
  ('V', 691, 'I'),
  ('S', 694, 'T'),
  ('E', 760, 'K'),
  ('S', 774, 'A'),
  ('A', 871, 'V'),
  ('V', 953, 'A')],
 'SIMI-057_PV066168': [('K', 34, 'E'),
  ('S', 45, 'R'),
  ('R', 61, 'H'),
  ('I', 67, 'T'),
  ('I', 77, 'T'),
  ('R', 147, 'G'),
  ('G', 183, 'S'),
  ('R', 214, 'K'),
  ('A', 222, 'V'),
  ('A', 229, 'T'),
  ('S', 259, 'G'),
  ('D', 270, 'S'),
  ('V', 320, 'I'),
  ('V', 329, 'A'),
  ('

In [60]:
column = "293T_Mxra8 minus C636"
lookup_dict = {}
for _, row in mut_diffs_df.iterrows():
    key = (row['wildtype'], row['sequential_site'], row['mutant'])
    lookup_dict[key] = row[column]
score_dict = {}
for id, mutations in data_mutant_dict.items():
    score = sum(lookup_dict.get((ref, pos, alt), 0) for ref, pos, alt in mutations)
    score_dict[id] = score
score_dict

{'S27-African-prototype_NC_004162': 0.9269999999999996,
 'SIMI-057_PV066168': 1.1130000000000007,
 'SIMI-058_PV066169': 1.1130000000000007,
 'CHIKV-NIHPAK-02-2024_PV054360': 0.33700000000000085,
 'CHIKV-NIHPAK-03-2024_PV054361': 0.19800000000000081,
 'CHIKV-NIHPAK-04-2024_PV054362': 0.19800000000000081,
 'CHIKV-NIHPAK-05-2024_PV054363': 0.19800000000000081,
 'CHIKV-NIHPAK-06-2024_PV054364': 0.19800000000000081,
 'CHIKV-NIHPAK-07-2024_PV054365': 0.19800000000000081,
 '11GLH-2019_PV022110': 1.1130000000000007,
 '12LSK-2019_PV022111': 1.1130000000000007,
 '13OY-2019_PV022112': 0.9490000000000005,
 '14JXB-2019_PV022113': 1.1130000000000007,
 '15XWH-2019_PV022114': 1.1890000000000005,
 '16WLP-2019_PV022115': 1.1130000000000007,
 '18BAH-2019_PV022116': 1.6749999999999998,
 '19WCJ-2019_PV022117': 0.9470000000000005,
 '20NXC-2019_PV022118': 0.8900000000000006,
 '21ZJJ-2019_PV022119': 1.1130000000000007,
 '22ZH-2019_PV022120': 1.1130000000000007,
 '23TWJ-2020_PV022121': 1.1130000000000007,
 '24

In [61]:
# Format the output as a dictionary
output_json = "./test.json"
output_dict = {
        "nodes": defaultdict(dict)
    }
for id, score in score_dict.items():
    output_dict["nodes"][id] = {
        column: score
    }
write_json(output_dict, output_json)

In [53]:
min(v for v in score_dict.values())

-8.298999999999996