# Notebook to collect and process the new data (January 2025)

### Script to download the new data linked from the INPHARED GitHub page
(https://github.com/RyanCook94/inphared)

In [None]:
import os
import requests
import gzip
import shutil
import tarfile

# List of (decompressed file name, source URL) pairs to fetch
files = [
    ("1Jan2025_phages_downloaded_from_genbank.gb", "https://millardlab-inphared.s3.climb.ac.uk/1Jan2025_phages_downloaded_from_genbank.gb.gz")
]

# Directory to save the files
download_dir = "downloads"
os.makedirs(download_dir, exist_ok=True)

# (connect, read) timeouts in seconds; without a timeout requests may block forever
request_timeout = (10, 60)
# Stream in 1 MiB pieces; larger chunks cut the per-chunk Python overhead
chunk_size = 1024 * 1024

# Names of files that could not be downloaded, so the final message is accurate
failed_downloads = []

# Download and decompress all files
for filename, url in files:
    compressed_file_path = os.path.join(download_dir, filename + ".gz")
    decompressed_file_path = os.path.join(download_dir, filename)
    # Decompress into a temporary name first so an interrupted run never
    # leaves a truncated file under the final name
    partial_file_path = decompressed_file_path + ".part"

    print(f"Downloading {filename}...")
    # The context manager releases the streamed connection even if writing fails
    with requests.get(url, stream=True, timeout=request_timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download {filename}. Status code: {response.status_code}")
            failed_downloads.append(filename)
            continue

        # Save the compressed payload to disk
        with open(compressed_file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
    print(f"Saved compressed file: {compressed_file_path}")

    # Handle decompression
    print(f"Decompressing {filename}...")
    with gzip.open(compressed_file_path, "rb") as gz:
        with open(partial_file_path, "wb") as f_out:
            shutil.copyfileobj(gz, f_out)
    # Only publish the file under its final name once it is complete
    os.replace(partial_file_path, decompressed_file_path)
    print(f"Decompressed to: {decompressed_file_path}")
    os.remove(compressed_file_path)  # Optional: Remove .gz file after decompression

if failed_downloads:
    print(f"Completed with {len(failed_downloads)} failed download(s): {', '.join(failed_downloads)}")
else:
    print("All downloads and decompression completed.")


### Script to process the new data, using the annotation to extract receptor-binding proteins (RBPs) and non-RBP proteins

In [1]:
import re
import csv
from Bio import SeqIO
from Bio.Seq import UndefinedSequenceError
import os

# Output folder for the processed tables (created if it does not exist yet)
os.makedirs("2025_data", exist_ok=True)

# Input GenBank flat file and the two CSV outputs
genbank_file = "downloads/1Jan2025_phages_downloaded_from_genbank.gb"
rbp_csv = "2025_data/annotated_RBPs_2025-01.csv"
others_csv = "2025_data/annotated_nonRBPs_2025-01.csv"

# Product annotations matching this case-insensitive pattern are RBP candidates
rbp_keywords = re.compile(
    r"tail.?(?:spike|fiber|fibre)|receptor-binding protein|receptor-recognizing protein",
    flags=re.IGNORECASE,
)

# An RBP candidate is rejected when its annotation contains any of these words
rbp_exclude_keywords = {
    "adaptor",
    "wedge",
    "baseplate",
    "hinge",
    "connector",
    "structural",
    "component",
    "assembly",
    "chaperone",
    "attachment",
    "capsid",
    "proximal",
    "measure",
}

# A non-RBP protein is rejected when its annotation contains any of these words
other_exclude_keywords = {
    "probable",
    "probably",
    "uncharacterized",
    "uncharacterised",
    "putative",
    "hypothetical",
    "unknown",
    "predicted",
}

# Function to validate RBP sequences
def is_valid_rbp(seq, annotation):
    """Return True when a protein passes the RBP filters.

    A sequence is accepted when its length lies between 200 and 1500
    (inclusive), it contains no "X" character, and none of the words in
    ``rbp_exclude_keywords`` occurs in the lower-cased annotation.
    """
    if not 200 <= len(seq) <= 1500:
        return False
    if "X" in seq:
        return False
    lowered = annotation.lower()
    for excluded in rbp_exclude_keywords:
        if excluded in lowered:
            return False
    return True

# Function to validate "Others" sequences
def is_valid_other(seq, annotation):
    """Return True when a protein passes the non-RBP ("Others") filters.

    A sequence is accepted when it is longer than 30 characters, contains no
    "X" character, and none of the words in ``other_exclude_keywords`` occurs
    in the lower-cased annotation.
    """
    if len(seq) <= 30:
        return False
    if "X" in seq:
        return False
    lowered = annotation.lower()
    for excluded in other_exclude_keywords:
        if excluded in lowered:
            return False
    return True

# Process GenBank file
rbp_data = []
others_data = []

# Sets to track unique sequences
unique_rbp_seqs = set()
unique_other_seqs = set()


def _record_host(record):
    """Return the ``host`` qualifier of the record's first ``source`` feature.

    In GenBank records the host is annotated on the ``source`` feature, not on
    individual CDS features. Returns "Unknown" when there is no ``source``
    feature or it carries no ``host`` qualifier.
    """
    for feature in record.features:
        if feature.type == "source":
            return feature.qualifiers.get("host", ["Unknown"])[0]
    return "Unknown"


def _cds_dna(feature, record):
    """Return the nucleotide sequence covered by ``feature`` as a string.

    Extraction is done on ``record.seq`` rather than on the whole record, so
    no intermediate SeqRecord has to be built for every CDS. Returns
    "Undefined" when Biopython raises ``UndefinedSequenceError``.
    """
    try:
        return str(feature.location.extract(record.seq))
    except UndefinedSequenceError:
        return "Undefined"


for record in SeqIO.parse(genbank_file, "genbank"):
    phage_id = record.id
    organism = record.annotations.get("organism", "Unknown")
    record_date = record.annotations.get("date", "Unknown")
    # Drop the first dash-separated field, e.g. "01-JAN-2025" becomes "JAN-2025"
    record_date = "-".join(record_date.split("-")[1:]) if record_date != "Unknown" else record_date
    # Looked up once per record and shared by all of its CDS features
    record_host = _record_host(record)

    for feature in record.features:
        if feature.type != "CDS":
            continue

        qualifiers = feature.qualifiers
        protein_id = qualifiers.get("protein_id", ["Unknown"])[0]
        # A CDS-level host still wins if present; otherwise use the record's host
        host = qualifiers.get("host", [record_host])[0]
        protein_name = qualifiers.get("product", ["Unknown"])[0]
        protein_seq = qualifiers.get("translation", [""])[0]

        # Classify as RBP or Others; an RBP-named protein that fails the RBP
        # filters is dropped and is NOT added to the Others group
        if rbp_keywords.search(protein_name):
            if not is_valid_rbp(protein_seq, protein_name):
                continue
            rows, seen_seqs = rbp_data, unique_rbp_seqs
        else:
            if not is_valid_other(protein_seq, protein_name):
                continue
            rows, seen_seqs = others_data, unique_other_seqs

        # Keep only the first occurrence of each protein sequence per group
        if protein_seq in seen_seqs:
            continue
        seen_seqs.add(protein_seq)

        rows.append({
            "phage_id": phage_id,
            "protein_id": protein_id,
            "Organism": organism,
            "Host": host,
            "ProteinName": protein_name,
            "ProteinSeq": protein_seq,
            # Extracted only for rows that are kept, to skip work on discarded CDSs
            "DNASeq": _cds_dna(feature, record),
            "RecordDate": record_date,
        })

# Column order shared by both output tables
fieldnames = ["phage_id", "protein_id", "Organism", "Host", "ProteinName", "ProteinSeq", "DNASeq", "RecordDate"]

# Write the RBP table first, then the Others table, with identical layout
for csv_path, rows in ((rbp_csv, rbp_data), (others_csv, others_data)):
    with open(csv_path, "w", newline="") as csv_handle:
        table_writer = csv.DictWriter(csv_handle, fieldnames=fieldnames)
        table_writer.writeheader()
        table_writer.writerows(rows)

# Summary
print(f"RBP data written to {rbp_csv}. Total unique RBP records: {len(rbp_data)}")
print(f"Others data written to {others_csv}. Total unique Others records: {len(others_data)}")

RBP data written to 2025_data/annotated_RBPs_2025-01.csv. Total unique RBP records: 11435
Others data written to 2025_data/annotated_nonRBPs_2025-01.csv. Total unique Others records: 405976
