In [7]:
import tkinter as tk
from tkinter import filedialog, ttk, messagebox
import csv
import concurrent.futures
import requests
import re
import json
import time
import random
import logging
import tkinter.font as tkfont

# Notebook-wide logging: INFO and above, each record prefixed with a timestamp and its level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')

##############################################################################
# PART 1: DNA → Protein Translation (via Expasy)
##############################################################################
def random_user_agent():
    """Pick one browser User-Agent string at random from a fixed pool of four."""
    chrome_windows = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
    )
    firefox_windows = "Mozilla/5.0 (Windows NT 10.0; rv:96.0) Gecko/20100101 Firefox/96.0"
    chrome_mac = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"
    )
    chrome_linux = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    )
    # Order matches the original pool so a seeded RNG picks the same entry.
    pool = [chrome_windows, firefox_windows, chrome_mac, chrome_linux]
    return random.choice(pool)

def translate_sequence(dna_seq):
    """
    Sends a DNA sequence to Expasy's CGI tool and returns the FASTA text.

    The sequence is POSTed as the 'dna_sequence' form field with
    'output_format' set to 'fasta', using a randomly chosen User-Agent and a
    10-second timeout.

    Returns:
        The response body (str) when the server answers with HTTP 200;
        None when the request raises or the server answers with any other
        status. Both failure modes are logged so a missing translation can be
        traced back to its cause.
    """
    url = 'https://web.expasy.org/cgi-bin/translate/dna2aa.cgi'
    data = {'dna_sequence': dna_seq, 'output_format': 'fasta'}
    headers = {"User-Agent": random_user_agent()}
    try:
        response = requests.post(url, data=data, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        # Previously a non-200 answer fell through silently; record it so the
        # caller's empty result is explainable from the log.
        logging.error("Expasy returned HTTP %s; no translation available", response.status_code)
    except Exception as e:
        # Best-effort by design: any failure yields None rather than a crash.
        logging.error("Error contacting Expasy: %s", e)
    return None

def parse_fasta(fasta_text):
    """
    Split FASTA text into (header, sequence) pairs.

    The text is cut at every '>' that starts a line. Within each piece the
    first line (stripped) is the header and the remaining lines, each stripped
    and concatenated, form the sequence. Whitespace-only pieces are skipped.
    """
    parsed = []
    for chunk in re.split(r'(?m)^>', fasta_text.strip()):
        if chunk.strip() == "":
            continue
        title_line, *body_lines = chunk.splitlines()
        residues = "".join(part.strip() for part in body_lines)
        parsed.append((title_line.strip(), residues))
    return parsed

def select_best_frame(fasta_text):
    """
    Choose the reading frame with the lowest marker count and return its sequence.

    A frame's marker count is the number of '-' characters plus the number of
    "Stop" occurrences in its sequence. Ties go to the earliest frame. Returns
    an empty string when the FASTA text contains no records.
    """
    frames = parse_fasta(fasta_text)
    if len(frames) == 0:
        return ""
    penalties = [
        residues.count('-') + len(re.findall(r"Stop", residues))
        for _, residues in frames
    ]
    # list.index returns the first position of the minimum, matching min()'s tie-break.
    winner = penalties.index(min(penalties))
    return frames[winner][1]

def translate_dna_to_protein(dna_seq):
    """
    Translate a DNA sequence and return the best-frame protein sequence.

    Returns an empty string when the translation request yields no text.
    """
    fasta_text = translate_sequence(dna_seq)
    if not fasta_text:
        return ""
    return select_best_frame(fasta_text)

##############################################################################
# PART 2: Protein Annotation (via NovoProLabs)
##############################################################################
def get_cdr_annotation(sequence, numbering_scheme="kabat", definition_scheme="kabat"):
    """
    POST a protein sequence to the NovoProLabs CDR endpoint and return the decoded JSON.

    Raises:
        requests exceptions for connection problems, timeouts (10 s) or a
        non-success HTTP status (via raise_for_status).
        ValueError: when the response body cannot be decoded as JSON.
    """
    form_fields = {
        "sr": "cdr",
        "nskm": numbering_scheme,
        "dskm": definition_scheme,
        "sq": sequence,
    }
    request_headers = {
        "User-Agent": random_user_agent(),
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://www.novoprolabs.com/",
    }
    reply = requests.post(
        "https://www.novoprolabs.com/plus/ppc.php",
        data=form_fields,
        headers=request_headers,
        timeout=10,
    )
    reply.raise_for_status()
    try:
        decoded = reply.json()
    except Exception as err:
        raise ValueError("Could not parse JSON response") from err
    return decoded

def extract_annotations(json_data):
    """
    Flatten the annotation response into a {region name: value} dict.

    Reads json_data[1] as an iterable of groups, each an iterable of segments.
    For every dict segment, each key whose value is a list of at least two
    items is mapped to that list's second item. Anything else is logged and
    skipped. Any exception stops the walk; whatever was collected so far is
    returned.
    """
    collected = {}
    try:
        for group in json_data[1]:
            for piece in group:
                if not isinstance(piece, dict):
                    logging.error("Segment is not a dictionary: %s", piece)
                    continue
                for label, payload in piece.items():
                    well_formed = isinstance(payload, list) and len(payload) >= 2
                    if not well_formed:
                        logging.error("Unexpected format for key %s: %s", label, payload)
                        continue
                    collected[label] = payload[1]
    except Exception as err:
        logging.error("Error extracting annotations: %s", err)
    return collected

def extract_chain_sequences(annotations):
    """
    Build heavy- and light-chain strings by concatenating annotated regions.

    Regions are joined in the order FR1, CDR1, FR2, CDR2, FR3, CDR3, FR4 for
    each chain; a region missing from `annotations` contributes an empty string.

    Returns:
        (heavy_chain_seq, light_chain_seq)
    """
    heavy_keys = ("FR-H1", "CDR-H1", "FR-H2", "CDR-H2", "FR-H3", "CDR-H3", "FR-H4")
    light_keys = ("FR-L1", "CDR-L1", "FR-L2", "CDR-L2", "FR-L3", "CDR-L3", "FR-L4")

    def stitch(region_keys):
        return "".join([annotations.get(region, "") for region in region_keys])

    return stitch(heavy_keys), stitch(light_keys)

def process_protein_sequence_with_chains(protein_seq):
    """
    Annotate a protein sequence and add assembled heavy/light chain strings.

    Dashes are removed from the sequence before it is submitted with the
    "kabat" numbering and definition schemes. The returned dict holds the
    extracted region annotations plus "heavy_chain" and "light_chain". Any
    failure during annotation is logged and an empty dict is returned.
    """
    gap_free = protein_seq.replace('-', '')
    try:
        regions = extract_annotations(
            get_cdr_annotation(gap_free, numbering_scheme="kabat", definition_scheme="kabat")
        )
        regions["heavy_chain"], regions["light_chain"] = extract_chain_sequences(regions)
    except Exception as err:
        logging.error("Error processing sequence: %s", err)
        return {}
    return regions

##############################################################################
# PART 2.5: SnapGene Extraction Helper
##############################################################################
def extract_snapgene_region(row):
    """
    Returns a slice of row["protein translation"] using the heavy/light chain markers.

    The slice starts where the heavy chain begins and ends where the light
    chain ends. Each boundary is located by first searching for the complete
    chain string; only if that is not found (e.g. the assembled chain is not
    contiguous in the translation) does it fall back to the chain's first /
    last three residues. Matching the whole chain first avoids anchoring on an
    unrelated earlier or later occurrence of the same 3-residue motif.

    Args:
        row: dict with "protein translation" and optionally "heavy_chain" /
            "light_chain". Missing or None chain values are treated as empty.

    Returns:
        The selected substring. The start defaults to 0 and the end to the
        full length when a chain is shorter than three residues or cannot be
        located. Returns "" when there is no translation or when the located
        end precedes the located start.
    """
    protein_seq = row.get("protein translation") or ""
    if not protein_seq:
        return ""
    # `or ""` guards against None values (e.g. short CSV rows read by DictReader).
    heavy = row.get("heavy_chain") or ""
    light = row.get("light_chain") or ""

    start_aa = 0
    if len(heavy) >= 3:
        found_start = protein_seq.find(heavy)
        if found_start == -1:
            # Fallback: anchor on the first three residues only.
            found_start = protein_seq.find(heavy[:3])
        if found_start != -1:
            start_aa = found_start

    end_aa = len(protein_seq)
    if len(light) >= 3:
        found_end = protein_seq.rfind(light)
        if found_end != -1:
            end_aa = found_end + len(light)
        else:
            # Fallback: anchor on the last three residues only.
            found_end = protein_seq.rfind(light[-3:])
            if found_end != -1:
                end_aa = found_end + 3

    if end_aa < start_aa:
        return ""
    return protein_seq[start_aa:end_aa]

##############################################################################
# PART 3: Tkinter GUI Application
##############################################################################
# Columns for the data table, in display order. update_treeview() shows only
# those of these columns that appear as a key in at least one loaded row.
MAIN_COLUMNS = ["clone name", "dna sequence", "protein translation", "heavy_chain", "light_chain", "snapgene_substring"]

class FullProcessingApp(tk.Tk):
    """
    Main window of the DNA-to-protein / variable-domain annotation tool.

    Rows are held in ``self.data`` as a list of dicts (one per CSV row). The
    Treeview displays the MAIN_COLUMNS present in the data, one tree row per
    entry of ``self.data`` and in the same order, so a tree row's index is
    also its index into ``self.data``.
    """

    def __init__(self):
        """Build the window: title, action buttons, table with scrollbars, menus and bindings."""
        super().__init__()
        self.title("DNA2Protein & Variable Domain Annotator")
        self.data = []

        title_font = tkfont.Font(family="Helvetica", size=18, weight="bold")
        lbl_title = tk.Label(self, text="DNA2Protein & Variable Domain Annotator", font=title_font)
        lbl_title.pack(pady=(10, 5))

        buttons_frame = tk.Frame(self, padx=10, pady=5)
        buttons_frame.pack(fill="x", padx=10)
        tk.Button(buttons_frame, text="Load CSV File", command=self.load_csv, bg="purple", fg="white", width=20).pack(side="left", padx=5)
        tk.Button(buttons_frame, text="Translate DNA to Protein", command=self.translate_data, bg="maroon", fg="white", width=20).pack(side="left", padx=5)
        tk.Button(buttons_frame, text="Annotate Variable Domains", command=self.annotate_data, bg="darkgreen", fg="white", width=20).pack(side="left", padx=5)
        tk.Button(buttons_frame, text="Export CSV", command=self.save_csv, width=20).pack(side="left", padx=5)
        tk.Button(buttons_frame, text="Reset View", command=self.clear_table, width=20).pack(side="left", padx=5)

        # The table and its scrollbars share a dedicated frame, and the
        # scrollbars are packed BEFORE the expanding tree. pack() hands out
        # space in packing order, so packing the tree first (as before) left
        # the scrollbars in the leftover strip below the table instead of
        # along its right and bottom edges.
        table_frame = tk.Frame(self)
        table_frame.pack(fill="both", expand=True, padx=10, pady=10)
        self.tree = ttk.Treeview(table_frame)
        vsb = ttk.Scrollbar(table_frame, orient="vertical", command=self.tree.yview)
        hsb = ttk.Scrollbar(table_frame, orient="horizontal", command=self.tree.xview)
        self.tree.configure(yscrollcommand=vsb.set, xscrollcommand=hsb.set)
        vsb.pack(side="right", fill="y")
        hsb.pack(side="bottom", fill="x")
        self.tree.pack(side="left", fill="both", expand=True)

        # Created now, but only packed once annotation has run (see annotate_data).
        self.instruction_label = tk.Label(
            self,
            text="Double-click a row to view its CDR details, or right-click for more options."
        )

        self.popup_menu = tk.Menu(self, tearoff=0)
        self.popup_menu.add_command(label="Copy Selected Row(s)", command=self.copy_selection_to_clipboard)
        self.popup_menu.add_command(label="View Details", command=self.show_details_popup)
        self.tree.bind("<Button-3>", self.show_context_menu)
        self.bind("<Control-c>", self.copy_selection_to_clipboard)
        self.tree.bind("<Double-1>", lambda e: self.show_details_popup())

        menu_bar = tk.Menu(self)
        help_menu = tk.Menu(menu_bar, tearoff=0)
        help_menu.add_command(label="Usage Instructions", command=self.show_help)
        menu_bar.add_cascade(label="Help", menu=help_menu)
        self.config(menu=menu_bar)

    def show_help(self):
        """Show the usage instructions in an info dialog."""
        # Step names match the button labels ("Export CSV", "Reset View").
        help_text = (
            "Usage Instructions:\n\n"
            "1. Load CSV File:\n"
            "   - The CSV must have columns 'clone name' and 'dna sequence'.\n"
            "   - Example:\n"
            "         clone name,dna sequence\n"
            "         Clone1,ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG\n"
            "         Clone2,ATGCGTACGTAGCTAGCTAGCTAGCTAGCTAGCTAACGTA\n\n"
            "2. Translate DNA to Protein:\n"
            "   - Uses Expasy (https://web.expasy.org/translate/) to translate DNA sequences.\n\n"
            "3. Annotate Variable Domains:\n"
            "   - Uses NovoProLabs (https://www.novoprolabs.com/tools/cdr) for CDR annotations.\n\n"
            "4. Export CSV:\n"
            "   - Saves the processed data including protein translations and annotations.\n\n"
            "5. Reset View:\n"
            "   - Clears the current data displayed in the table.\n\n"
            "Additional Tips:\n"
            "   - After annotation, double-click a row to view its CDR details.\n"
            "   - Right-click a row for more context-menu options."
        )
        messagebox.showinfo("Help - Instructions", help_text)

    def load_csv(self):
        """Ask for a CSV file and replace ``self.data`` with its rows (one dict per row)."""
        file_path = filedialog.askopenfilename(filetypes=[("CSV Files", "*.csv")], title="Select CSV File")
        if not file_path:
            return
        with open(file_path, 'r', newline='') as infile:
            rows = list(csv.DictReader(infile))
        # Assign only after a complete read so a failed read keeps the previous data.
        self.data = rows
        self.update_treeview()

    def save_csv(self):
        """Ask for a destination and write every row, using the sorted union of all row keys as header."""
        if not self.data:
            messagebox.showinfo("No Data", "Nothing to save.")
            return
        file_path = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=[("CSV Files", "*.csv")], title="Save Output CSV")
        if not file_path:
            return
        all_keys = set()
        for row in self.data:
            all_keys.update(row.keys())
        fieldnames = sorted(all_keys)
        with open(file_path, 'w', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.data)
        messagebox.showinfo("Success", f"Output CSV saved to: {file_path}")

    def translate_data(self):
        """
        Translate each row's "dna sequence" and store it under "protein translation".

        Requests run on a pool of five worker threads. Rows are updated in
        place, so ``self.data`` keeps its original order and rows that have no
        "dna sequence" key are left untouched rather than discarded. A row
        whose translation raises gets the value "Error".
        """
        if not self.data:
            messagebox.showinfo("No Data", "Please load a CSV file first.")
            return
        # NOTE: this blocks the Tk event loop until every request has finished.
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            future_to_row = {
                executor.submit(translate_dna_to_protein, row["dna sequence"]): row
                for row in self.data if "dna sequence" in row
            }
            for future in concurrent.futures.as_completed(future_to_row):
                row = future_to_row[future]
                try:
                    protein = future.result()
                except Exception as exc:
                    logging.error("Error processing row %s: %s", row.get("clone name", "Unknown"), exc)
                    protein = "Error"
                # Mutating the row dict in place is enough: it is the same
                # object held by self.data. Rebuilding self.data from
                # completion order would shuffle rows and drop skipped ones.
                row["protein translation"] = protein
        self.update_treeview()

    def annotate_data(self):
        """
        Annotate every row's protein translation and add chain / SnapGene columns.

        Refuses to run unless every row has a non-empty "protein translation".
        Annotation results are merged into each row in place (row order is
        preserved), then "snapgene_substring" is computed from the row.
        """
        if not self.data:
            messagebox.showinfo("No Data", "Please load and translate a CSV first.")
            return
        missing = [row for row in self.data if "protein translation" not in row or not row["protein translation"]]
        if missing:
            messagebox.showerror("Data Error", "Some rows are missing a protein translation. Please run translation first.")
            return
        # NOTE: this blocks the Tk event loop until every request has finished.
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            future_to_row = {
                executor.submit(process_protein_sequence_with_chains, row["protein translation"]): row
                for row in self.data
            }
            for future in concurrent.futures.as_completed(future_to_row):
                row = future_to_row[future]
                try:
                    annotation_dict = future.result()
                except Exception as exc:
                    logging.error("Error annotating row %s: %s", row.get("clone name", "Unknown"), exc)
                    annotation_dict = {}
                if isinstance(annotation_dict, dict):
                    row.update(annotation_dict)
                row["snapgene_substring"] = extract_snapgene_region(row)
        self.update_treeview()

        if self.data:
            self.instruction_label.pack(pady=(0, 10))

    def update_treeview(self):
        """Rebuild the table from ``self.data``, one tree row per data row, in order."""
        self.tree.delete(*self.tree.get_children())

        if self.data:
            columns_to_show = [col for col in MAIN_COLUMNS if any(col in row for row in self.data)]
            self.tree["columns"] = columns_to_show
            self.tree["show"] = "headings"
            for col in columns_to_show:
                self.tree.heading(col, text=col)
                self.tree.column(col, width=150, stretch=True)

            for row in self.data:
                values = [row.get(col, "") for col in columns_to_show]
                self.tree.insert("", tk.END, values=values)

    def show_context_menu(self, event):
        """Right-click handler: select the row under the pointer (if not already selected) and open the popup menu."""
        row_id = self.tree.identify_row(event.y)
        if row_id:
            if row_id not in self.tree.selection():
                self.tree.selection_set(row_id)
            self.popup_menu.post(event.x_root, event.y_root)

    def show_details_popup(self):
        """Open one read-only window per selected row listing all of that row's fields, sorted by key."""
        selected_items = self.tree.selection()
        if not selected_items:
            return
        columns = self.tree["columns"]
        for item in selected_items:
            row_values = self.tree.item(item, 'values')
            displayed = dict(zip(columns, row_values))
            # Tree rows are inserted in self.data order, so the tree index
            # identifies the backing row exactly. Looking rows up by
            # "clone name" showed the first match for every duplicate name.
            position = self.tree.index(item)
            if 0 <= position < len(self.data):
                full_row = self.data[position] or displayed
            else:
                full_row = displayed

            popup = tk.Toplevel(self)
            popup.title("Row Details")
            row_text = "".join(f"{k}: {v}\n" for k, v in sorted(full_row.items()))
            text_widget = tk.Text(popup, wrap="word", width=80, height=25)
            text_widget.insert("1.0", row_text)
            text_widget.config(state="disabled")
            text_widget.pack(fill="both", expand=True, padx=10, pady=10)

    def copy_selection_to_clipboard(self, event=None):
        """Copy the selected rows to the clipboard: tab-separated cells, one row per line."""
        selected_items = self.tree.selection()
        if not selected_items:
            return
        rows_data = []
        for item in selected_items:
            row_values = self.tree.item(item, 'values')
            rows_data.append("\t".join(str(v) for v in row_values))
        clipboard_text = "\n".join(rows_data)
        self.clipboard_clear()
        self.clipboard_append(clipboard_text)
        self.update()

    def clear_table(self):
        """Discard all loaded data, empty the table and hide the instruction label."""
        self.data = []
        self.tree.delete(*self.tree.get_children())
        self.instruction_label.pack_forget()

# Build the main window and enter the Tk event loop when this code runs as __main__.
if __name__ == "__main__":
    app = FullProcessingApp()
    app.mainloop()

2025-05-11 20:26:47,621 [ERROR] Error processing sequence: Could not parse JSON response
2025-05-11 20:27:10,691 [ERROR] Error processing sequence: Could not parse JSON response
2025-05-11 20:27:17,797 [ERROR] Error processing sequence: Could not parse JSON response
2025-05-11 20:27:20,511 [ERROR] Error processing sequence: Could not parse JSON response
2025-05-11 20:27:22,197 [ERROR] Error processing sequence: Could not parse JSON response
2025-05-11 20:27:26,108 [ERROR] Error processing sequence: Could not parse JSON response
2025-05-11 20:27:26,238 [ERROR] Error processing sequence: Could not parse JSON response
2025-05-11 20:27:30,478 [ERROR] Error processing sequence: Could not parse JSON response
2025-05-11 20:27:32,840 [ERROR] Error processing sequence: Could not parse JSON response
2025-05-11 20:27:35,776 [ERROR] Error processing sequence: HTTPSConnectionPool(host='www.novoprolabs.com', port=443): Read timed out. (read timeout=10)
2025-05-11 20:27:38,057 [ERROR] Error processin

In [6]:
import pandas as pd

# Paths are kept in one place so they can be changed without touching the logic.
# NOTE(review): the input is an absolute, machine-specific path — consider a
# path relative to a configurable data directory.
POSITIVE_DATASET_PATH = r"C:\Users\franc\Desktop\MastersYear3-Thesis\PredictiveModelAntibodyAntigen\AbAgIntPre\CoV-AbDab\positive dataset.txt"
PROTEIN_CSV_PATH = "input_proteins_pos.csv"

# 1) read your original file (tab-separated, no header row, three fields per line)
df = pd.read_csv(
    POSITIVE_DATASET_PATH,
    sep="\t",
    header=None,
    names=["clone name","cdrh3_heavy","cdrl3_light"]
)

# 2) concatenate heavy+light into one protein sequence column
#    A missing heavy or light value makes the concatenation NaN, which is
#    written as an empty cell. The GUI's annotate step refuses the whole table
#    if any row lacks a protein translation, so incomplete rows are dropped
#    here (and reported) instead of being exported.
is_complete = df["cdrh3_heavy"].notna() & df["cdrl3_light"].notna()
n_incomplete = int((~is_complete).sum())
if n_incomplete:
    print(f"Dropping {n_incomplete} row(s) with a missing heavy or light sequence")
df = df.loc[is_complete].copy()
df["protein translation"] = df["cdrh3_heavy"] + df["cdrl3_light"]

# 3) keep only the two columns the GUI needs
#    (we omit 'dna sequence' so that the Translate step is skipped)
out = df[["clone name","protein translation"]]

# 4) write to CSV
out.to_csv(PROTEIN_CSV_PATH, index=False)

In [11]:
import pandas as pd
from pathlib import Path

# Adjust these to your actual CSV names / paths
# Each entry is (input CSV, output tab-separated text file).
INPUTS = [
    ("pos_cdrh3.csv", "train_pos_cdrh3.txt"),
    ("neg_cdrh3.csv", "train_neg_cdrh3.txt"),
]

for infile, out_txt in INPUTS:
    df = pd.read_csv(infile)

    # Determine column names (case‐insensitive match)
    cols = {c.lower(): c for c in df.columns}
    # The ID column falls back to the first column when "clone name" is absent.
    ag_col = cols.get("clone name", list(df.columns)[0])
    h3_col = cols.get("cdr-h3", cols.get("cdrh3"))
    l3_col = cols.get("cdr-l3", cols.get("cdrl3"))

    if h3_col is None or l3_col is None:
        raise ValueError(f"Could not find both CDR-H3 and CDR-L3 in {infile}")

    # Empty CSV cells are read as NaN, and str(NaN) is the truthy string
    # "nan". Without this dropna the non-empty check below never filtered
    # missing loops and wrote literal "nan" sequences into the output.
    pairs = df.dropna(subset=[h3_col, l3_col])

    n_written = 0
    with open(out_txt, "w") as f:
        for ag_raw, h3_raw, l3_raw in zip(pairs[ag_col], pairs[h3_col], pairs[l3_col]):
            ag_id = str(ag_raw).strip()
            h3    = str(h3_raw).strip()
            l3    = str(l3_raw).strip()
            # only write lines where both loops are non-empty
            if h3 and l3:
                f.write(f"{ag_id}\t{h3}\t{l3}\n")
                n_written += 1

    # Count while writing instead of re-opening the file (the old count left
    # the second file handle unclosed).
    print(f"Wrote {out_txt} with {n_written} lines")

Wrote train_pos_cdrh3.txt with 9309 lines
Wrote train_neg_cdrh3.txt with 1710 lines
