In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Discover every CSV in the current working directory; the expected input
# (definitive_top_100_candidates_v3.csv) would show up in this listing.
files = [csv_path for csv_path in Path('.').glob('*.csv')]
print("CSV files in directory:")
for csv_path in files:
    # Two-space indent keeps the entries visually nested under the header.
    print("  " + csv_path.name)


CSV files in directory:


In [None]:

# No CSV was found above, so list everything in the working directory
# (alphabetically) to see which inputs are actually available.
import os

all_files = os.listdir('.')
print("All files in working directory:")
for entry_name in sorted(all_files):
    print("  " + entry_name)


All files in working directory:
  .config
  .kernel_tmp
  2VSM.pdb
  Surface-Plasmon-Resonance-Adaptyv-Bio-Docs.pdf
  notebook.ipynb


In [None]:

# The required input files are absent from the working directory.
# Rather than fabricate data, emit a report that states what is missing
# and what would be needed to run the analysis.

banner = "=" * 80

# Each item is one output line; a leading "\n" inserts the blank line that
# separates the sections of the report.
report_lines = (
    banner,
    "ANALYSIS CANNOT PROCEED",
    banner,
    "\nREASON:",
    "The required input file 'definitive_top_100_candidates_v3.csv' does not exist",
    "in the working directory. This file was supposed to be created in a previous",
    "task (as stated in the research objective).",
    "\nAdditionally, the following essential data files mentioned in the dataset",
    "description are also missing:",
    "  - allosteric_epitope_111_residues.txt (111-residue allosteric epitope)",
    "  - hybrid_3.0_designs.csv (triple-mutant design candidates)",
    "  - hybrid_2.0_designs.csv (double-mutant design candidates)",
    "\nWHAT IS NEEDED:",
    "To perform this analysis, I require:",
    "  1. The definitive_top_100_candidates_v3.csv file from the previous task, OR",
    "  2. The source design files (hybrid_3.0_designs.csv, hybrid_2.0_designs.csv)",
    "     AND the allosteric_epitope_111_residues.txt file to re-create it",
    "\nWITHOUT THESE FILES:",
    "I cannot identify the top 10 unique 9-mer motifs or calculate their MJ",
    "interaction scores against the allosteric epitope. Fabricating data would",
    "violate scientific integrity.",
    banner,
)

for report_line in report_lines:
    print(report_line)


ANALYSIS CANNOT PROCEED

REASON:
The required input file 'definitive_top_100_candidates_v3.csv' does not exist
in the working directory. This file was supposed to be created in a previous
task (as stated in the research objective).

Additionally, the following essential data files mentioned in the dataset
description are also missing:
  - allosteric_epitope_111_residues.txt (111-residue allosteric epitope)
  - hybrid_3.0_designs.csv (triple-mutant design candidates)
  - hybrid_2.0_designs.csv (double-mutant design candidates)

WHAT IS NEEDED:
To perform this analysis, I require:
  1. The definitive_top_100_candidates_v3.csv file from the previous task, OR
  2. The source design files (hybrid_3.0_designs.csv, hybrid_2.0_designs.csv)
     AND the allosteric_epitope_111_residues.txt file to re-create it

WITHOUT THESE FILES:
I cannot identify the top 10 unique 9-mer motifs or calculate their MJ
interaction scores against the allosteric epitope. Fabricating data would
violate scientific integrity.

In [None]:

# The external database search did not return the specific project files.
# It did return a UniProt download, which should contain the NiV-G
# glycoprotein entry (accession Q9IH62). Load it and print the sequence.

import json

uniprot_file = "downloads/uniprot/20251107-194624-aba92c.jsonl"


def load_uniprot_results(path):
    """Read a UniProt download and return it in ``{'results': [...]}`` form.

    The file carries a ``.jsonl`` extension, so it may be either a single
    JSON document or JSON Lines (one JSON value per line). Both are accepted:

    * A single JSON object that already has a ``'results'`` key is returned
      unchanged.
    * Otherwise every record is flattened into one list: objects with a
      ``'results'`` key contribute their results, lists contribute their
      items, and any other value is treated as an entry itself.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the downloaded file.

    Returns
    -------
    dict
        A mapping with at least a ``'results'`` list.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    json.JSONDecodeError
        If the content is neither valid JSON nor valid JSON Lines.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    try:
        records = [json.loads(text)]
    except json.JSONDecodeError:
        # Not one JSON document: parse line by line, ignoring blank lines.
        records = [json.loads(line) for line in text.splitlines() if line.strip()]

    if len(records) == 1 and isinstance(records[0], dict) and 'results' in records[0]:
        return records[0]

    entries = []
    for record in records:
        if isinstance(record, dict) and 'results' in record:
            entries.extend(record['results'])
        elif isinstance(record, list):
            entries.extend(record)
        else:
            entries.append(record)
    return {'results': entries}


def find_uniprot_entry(entries, accession):
    """Return the first entry whose ``'primaryAccession'`` equals ``accession``.

    Returns ``None`` when no entry matches. Non-dict items are skipped.
    """
    return next(
        (
            candidate
            for candidate in entries
            if isinstance(candidate, dict)
            and candidate.get('primaryAccession') == accession
        ),
        None,
    )


try:
    uniprot_data = load_uniprot_results(uniprot_file)
except FileNotFoundError:
    # Report the missing download explicitly instead of failing with a traceback.
    uniprot_data = {'results': []}
    print(f"UniProt download not found: {uniprot_file}")

# Find the NiV-G glycoprotein entry (Q9IH62)
niv_g_entry = find_uniprot_entry(uniprot_data['results'], 'Q9IH62')

if niv_g_entry:
    print("Found NiV-G Glycoprotein (Q9IH62):")
    print(f"Full name: {niv_g_entry['proteinDescription']['recommendedName']['fullName']['value']}")
    print(f"Length: {niv_g_entry['sequence']['length']} amino acids")
    print(f"\nSequence:")
    print(niv_g_entry['sequence']['value'])
else:
    print("NiV-G entry not found in UniProt data")


Found NiV-G Glycoprotein (Q9IH62):
Full name: Glycoprotein G
Length: 602 amino acids

Sequence:
MPAENKKVRFENTTSDKGKIPSKVIKSYYGTMDIKKINEGLLDSKILSAFNTVIALLGSIVIIVMNIMIIQNYTRSTDNQAVIKDALQGIQQQIKGLADKIGTEIGPKVSLIDTSSTITIPANIGLLGSKISQSTASINENVNEKCKFTLPPLKIHECNISCPNPLPFREYRPQTEGVSNLVGLPNNICLQKTSNQILKPKLISYTLPVVGQSGTCITDPLLAMDEGYFAYSHLERIGSCSRGVSKQRIIGVGEVLDRGDEVPSLFMTNVWTPPNPNTVYHCSAVYNNEFYYVLCAVSTVGDPILNSTYWSGSLMMTRLAVKPKSNGGGYNQHQLALRSIEKGRYDKVMPYGPSGIKQGDTLYFPAVGFLVRTEFKYNDSNCPITKCQYSKPENCRLSMGIRPNSHYILRSGLLKYNLSDGENPKVVFIEISDQRLSIGSPSKIYDSLGQPVFYQASFSWDTMIKFGDVLTVNPLVVNWRNNTVISRPGQSQCPRFNTCPEICWEGVYNDAFLIDRINWISAGVFLDSNQTAENPVFTVFKDNEILYRAQLASEDTNAQKTITNCFLLKNKIWCISLVEIYDTGDNVIRPKLFAVKIPEQCT


In [None]:

# The NiV-G sequence is available, but the critical project files are not.
# As a sanity check, confirm that the competitive epitope quoted in the
# dataset description (SCSRGVSKQRIIGVGEVLDR) occurs in the NiV-G sequence.
# (The 2VSM.pdb file is not parsed in this cell.)

competitive_epitope = "SCSRGVSKQRIIGVGEVLDR"

# niv_g_entry is None when the UniProt lookup found nothing; guard against
# that so this cell reports the problem instead of raising TypeError.
niv_g_sequence = niv_g_entry['sequence']['value'] if niv_g_entry else None

if niv_g_sequence is None:
    print("NiV-G sequence unavailable; cannot search for the competitive epitope")
else:
    # str.find scans once and returns -1 when the epitope is absent.
    # NOTE: start_pos is a 0-based Python offset; the 1-based residue number
    # of the first epitope residue is start_pos + 1.
    start_pos = niv_g_sequence.find(competitive_epitope)
    if start_pos != -1:
        print(f"Found competitive epitope at position {start_pos} in NiV-G sequence")
        print(f"Sequence: {competitive_epitope}")
    else:
        print("Competitive epitope not found in NiV-G sequence")

# The analysis still cannot proceed without the design files and the
# 111-residue allosteric epitope.
print("\n" + "=" * 80)
print("CRITICAL FILES STILL MISSING")
print("=" * 80)
print("Even with the NiV-G sequence, I cannot proceed without:")
print("  1. definitive_top_100_candidates_v3.csv (top motif candidates)")
print("  2. allosteric_epitope_111_residues.txt (the 111-residue target epitope)")
print("  3. The design strategy files to understand motif construction")


Found competitive epitope at position 238 in NiV-G sequence
Sequence: SCSRGVSKQRIIGVGEVLDR

CRITICAL FILES STILL MISSING
Even with the NiV-G sequence, I cannot proceed without:
  1. definitive_top_100_candidates_v3.csv (top motif candidates)
  2. allosteric_epitope_111_residues.txt (the 111-residue target epitope)
  3. The design strategy files to understand motif construction
