In [None]:

# Load necessary libraries and examine the available data files
import glob
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Show every non-hidden entry of the current directory, in sorted order
print("Files in working directory:")
visible_entries = (entry for entry in sorted(os.listdir('.')) if not entry.startswith('.'))
for file in visible_entries:
    print(f"  {file}")


Files in working directory:
  2VSM.pdb
  Surface-Plasmon-Resonance-Adaptyv-Bio-Docs.pdf
  cdr_library_summary.csv
  notebook.ipynb


In [None]:

# Read the CDR library summary table, preview its first 15 rows, and report the row count
cdr_library = pd.read_csv('cdr_library_summary.csv')
library_preview = cdr_library.head(15)
print("CDR Library Summary:", library_preview, sep="\n")
print(f"\nTotal CDR sequences: {len(cdr_library)}")


CDR Library Summary:
   Antibody CDR Chain_Type           Sequence  Length           PDB_IDs  \
0       1E5  H1      Heavy         GGSISDTYRW      10  8K0C, 8K0D, 8XC4   
1       1E5  H2      Heavy           IYGSATST       8  8K0C, 8K0D, 8XC4   
2       1E5  H3      Heavy  ARDYQYYYSGSYPTPHN      17  8K0C, 8K0D, 8XC4   
3       1E5  L1      Light       RASQGIIDYLSW      12  8K0C, 8K0D, 8XC4   
4       1E5  L2      Light           TASNLESG       8  8K0C, 8K0D, 8XC4   
5       1E5  L3      Light          LQGYTTPYT       9  8K0C, 8K0D, 8XC4   
6      14F8  H1      Heavy         GFSLTSYDIS      10              8JA5   
7      14F8  H2      Heavy           WTGGVTNY       8              8JA5   
8      14F8  H3      Heavy              VREGD       5              8JA5   
9      14F8  L1      Light       RSSQSIVHSNGN      12              8JA5   
10     14F8  L2      Light           QLLIYKVS       8              8JA5   
11     14F8  L3      Light          FQASHVPYT       9              8JA5   

   

In [None]:

# Inventory the CSV and FASTA files present in the working directory.
# This does not test for top_50_nivg_binder_designs_final.csv specifically; it lists
# whatever candidates exist so a differently named input can be spotted.
# glob returns matches in arbitrary, filesystem-dependent order, so each result is
# sorted to keep the printed listing stable across re-runs.
# (`glob` is imported in the notebook's top import cell.)

# CSV files that might contain the design data
csv_files = sorted(glob.glob('*.csv'))
print("Available CSV files:")
if csv_files:
    for f in csv_files:
        print(f"  {f}")
else:
    # Mirror the FASTA branch so an empty result is reported explicitly
    print("  None found")

# FASTA files may use either the .fasta or the .fa extension
fasta_files = sorted(glob.glob('*.fasta') + glob.glob('*.fa'))
print("\nAvailable FASTA files:")
if fasta_files:
    for f in fasta_files:
        print(f"  {f}")
else:
    print("  None found")


Available CSV files:
  cdr_library_summary.csv

Available FASTA files:
  None found


In [None]:

# The expected inputs were not found at the top level, so search the whole tree
# (hidden files and nested folders included) in case they were stored elsewhere
# or under a different name.

# Top-level files keep their bare name; nested ones carry their path from '.'
all_files = [
    file if root == '.' else os.path.join(root, file)
    for root, dirs, files in os.walk('.')
    for file in files
]

print("All files in working directory:")
for f in sorted(all_files):
    # Skip anything whose path mentions Jupyter checkpoint folders
    if 'ipynb_checkpoints' in f:
        continue
    print(f"  {f}")

# Report the visible (non-dot) directories directly under the working directory
print("\nSubdirectories:")
subdirs = [d for d in os.listdir('.') if not d.startswith('.') if os.path.isdir(d)]
if not subdirs:
    print("  None found")
else:
    for d in subdirs:
        print(f"  {d}")


All files in working directory:
  ./.config/matplotlib/matplotlibrc
  2VSM.pdb
  Surface-Plasmon-Resonance-Adaptyv-Bio-Docs.pdf
  cdr_library_summary.csv
  notebook.ipynb

Subdirectories:
  None found


In [None]:

# Neither required input (top_50_nivg_binder_designs_final.csv, known_niv_binders.fasta)
# was found in the working directory, so the analysis stops here with an explicit
# notice rather than continuing without its primary data.

banner = "=" * 80

# One entry per printed line; a leading "\n" yields the blank separator lines
notice_lines = (
    banner,
    "CRITICAL DATA AVAILABILITY ISSUE",
    banner,
    "\nI cannot complete this analysis because the required input files are missing:",
    "\n1. top_50_nivg_binder_designs_final.csv",
    "   - Status: Listed as a derived artifact but not present in working directory",
    "   - Required for: The 50 candidate binder designs to be filtered",
    "\n2. known_niv_binders.fasta",
    "   - Status: Referenced in task description but not present",
    "   - Required for: Database of known binder sequences for comparison",
    "\n" + banner,
    "ANALYSIS CANNOT PROCEED",
    banner,
    "\nTo complete this task, both files must be provided in the working directory.",
)
for line in notice_lines:
    print(line)


CRITICAL DATA AVAILABILITY ISSUE

I cannot complete this analysis because the required input files are missing:

1. top_50_nivg_binder_designs_final.csv
   - Status: Listed as a derived artifact but not present in working directory
   - Required for: The 50 candidate binder designs to be filtered

2. known_niv_binders.fasta
   - Status: Referenced in task description but not present
   - Required for: Database of known binder sequences for comparison

ANALYSIS CANNOT PROCEED

To complete this task, both files must be provided in the working directory.
