In [2]:
import tkinter as tk
from tkinter import filedialog, messagebox, Toplevel, ttk
import pandas as pd
import numpy as np
import os
import subprocess
import threading
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk
import webbrowser
import chardet
import logomaker
from Bio import AlignIO
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import sys

# Directory that holds the bundled command-line tools: the PyInstaller
# extraction folder when running frozen, otherwise the current working directory.
base_path = sys._MEIPASS if getattr(sys, "frozen", False) else os.path.abspath(".")

# Executables are expected in fixed sub-folders below base_path.
blastp_path = os.path.join(base_path, "blast", "blastp.exe")
makeblastdb_path = os.path.join(base_path, "blast", "makeblastdb.exe")
clustalo_exe_path = os.path.join(base_path, "clustal", "clustalo.exe")

# Echo the resolved locations so a wrong working directory is easy to spot.
print(f"Base path: {base_path}")
print(f"clustalo_exe_path: {clustalo_exe_path}")
print(f"blastp_path: {blastp_path}")

########################
# GLOBAL SETTINGS
########################
# Process-wide, mutable configuration. The settings window rewrites these
# values in place and the pipeline reads them on every run.
settings = dict(
    # --- BLAST ---
    blast_db="your_db",
    blast_task="blastp-short",
    output_format="6",
    evalue_threshold="1e-3",
    max_target_seqs="10",
    word_size="3",
    gap_open="11",
    gap_extend="1",
    scoring_matrix="BLOSUM62",

    # --- Clustering / output ---
    clustering_method="average",  # 'average', 'complete' or 'single'
    distance_metric="euclidean",
    num_threads="4",
    output_dir="./results/",
    show_labels=True,

    # manual_clusters == True  -> cut the tree into num_clusters ('maxclust')
    # manual_clusters == False -> cut the tree at distance_threshold ('distance')
    num_clusters=6,
    manual_clusters=False,
    distance_threshold="3.0",

    # --- External executables ---
    clustalo_path=clustalo_exe_path,
    makeblastdb_path=makeblastdb_path,
    blastp_path=blastp_path,

    # --- Input table columns ---
    sequence_column="CDRH3",
    clone_column="CLONE",
)

# Make sure the default results folder exists before anything writes to it.
os.makedirs(settings["output_dir"], exist_ok=True)

###############################
# LOGO & ALIGNMENT FUNCTIONS
###############################
def save_sequences_to_fasta(sequences, filename):
    """Write *sequences* to *filename* in FASTA format.

    Records are named ``seq0``, ``seq1``, ... in input order; an existing
    file at *filename* is overwritten.
    """
    records = (
        f">seq{number}\n{residues}\n"
        for number, residues in enumerate(sequences)
    )
    with open(filename, "w") as handle:
        handle.writelines(records)

def run_clustal_omega(fasta_file, output_file):
    """Align *fasta_file* with Clustal Omega and write the result to *output_file*.

    The executable is taken from ``settings["clustalo_path"]``. Any existing
    *output_file* is removed first.

    Returns:
        *output_file* on success, or ``None`` when the tool exits with a
        non-zero status or cannot be launched at all (the error is printed).
    """
    if os.path.exists(output_file):
        os.remove(output_file)
    cmd = [settings["clustalo_path"], "-i", fasta_file, "-o", output_file, "--auto", "--verbose"]

    # CREATE_NO_WINDOW only exists on Windows; 0 means "no special flags"
    # and is accepted on every platform.
    creationflags = getattr(subprocess, "CREATE_NO_WINDOW", 0)
    try:
        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            creationflags=creationflags,
        )
    except subprocess.CalledProcessError as e:
        # Tool output is not guaranteed to be valid UTF-8; never let the
        # error report itself raise.
        out = (e.stdout or b"").decode(errors="replace")
        err = (e.stderr or b"").decode(errors="replace")
        print(f"Error running Clustal Omega: {e}\n{out}\n{err}")
        return None
    except OSError as e:
        # Executable missing or not runnable.
        print(f"Error running Clustal Omega: {e}")
        return None
    return output_file

def create_frequency_matrix_from_alignment(alignment_file):
    """Build a per-column amino-acid frequency table from a FASTA alignment.

    Only the 20 standard upper-case residue letters are counted; any other
    character (gaps included) is skipped. Each row is divided by its own
    total, and rows with no counted residue end up all zeros.

    Returns:
        DataFrame indexed by alignment column (0-based) with one column per
        amino acid.
    """
    msa = AlignIO.read(alignment_file, "fasta")
    n_columns = msa.get_alignment_length()
    alphabet = list("ACDEFGHIKLMNPQRSTVWY")

    counts = pd.DataFrame(0, index=range(n_columns), columns=alphabet)
    for entry in msa:
        for position, residue in enumerate(entry.seq):
            if residue not in alphabet:
                continue
            counts.at[position, residue] += 1

    row_totals = counts.sum(axis=1)
    normalised = counts.div(row_totals, axis=0)
    return normalised.fillna(0)

###############################
# DISPLAY FUNCTIONS
###############################
def display_logos(df_with_sequences, k, prefix="sequence"):
    """Open a scrollable window with one clone list and sequence logo per cluster.

    Args:
        df_with_sequences: DataFrame with 'Cluster', 'Name' and 'Sequence' columns.
        k: number of clusters; cluster ids 1..k are looked up in 'Cluster'.
        prefix: file-name prefix for the per-cluster FASTA / alignment files
            written to ``settings["output_dir"]``.

    Clusters with a single sequence get a trivial logo built directly from
    that sequence; larger clusters are aligned with Clustal Omega first.
    Clusters without rows are skipped; clusters with no usable sequence, or
    whose alignment fails, show only the clone list.
    """
    logos_window = Toplevel(root)
    logos_window.title("Sequence Logos and Clone Lists")

    canvas = tk.Canvas(logos_window, width=900, height=600)
    scrollbar = ttk.Scrollbar(logos_window, orient="vertical", command=canvas.yview)
    scroll_frame = ttk.Frame(canvas)

    scroll_frame.bind("<Configure>", lambda e: canvas.configure(scrollregion=canvas.bbox("all")))
    canvas.create_window((0,0), window=scroll_frame, anchor="nw")
    canvas.configure(yscrollcommand=scrollbar.set)

    # pyplot keeps every figure alive until it is closed explicitly, so
    # remember them and release them together with the window.
    figures = []

    def close_window():
        for open_fig in figures:
            plt.close(open_fig)
        logos_window.destroy()
    logos_window.protocol("WM_DELETE_WINDOW", close_window)

    def save_logo(fig):
        path = filedialog.asksaveasfilename(defaultextension=".png", filetypes=[("PNG files","*.png")])
        if path:
            fig.savefig(path)
            messagebox.showinfo("Saved", f"Logo saved to: {path}")

    amino_acids = list("ACDEFGHIKLMNPQRSTVWY")

    # The settings dialog can point output_dir somewhere new after start-up.
    if settings["output_dir"]:
        os.makedirs(settings["output_dir"], exist_ok=True)

    # fcluster => clusters labeled from 1..k
    for cluster_id in range(1, k+1):
        cluster_data = df_with_sequences[df_with_sequences['Cluster'] == cluster_id]
        if cluster_data.empty:
            continue

        cframe = ttk.LabelFrame(scroll_frame, text=f"Cluster {cluster_id}")
        cframe.pack(fill="both", expand=True, padx=5, pady=5)

        ttk.Label(cframe, text="Clones (Name | Sequence):").pack(anchor="w", padx=5, pady=2)
        clones_list = tk.Listbox(cframe, height=5, width=80)
        for _, row in cluster_data.iterrows():
            info = f"{row['Name']} | {row['Sequence']}"
            clones_list.insert(tk.END, info)
        clones_list.pack(fill="x", padx=5, pady=2)

        # Drop missing values BEFORE casting: astype(str) would turn NaN
        # into the literal text "nan" and dropna() would no longer see it.
        sequences = cluster_data['Sequence'].dropna().astype(str).tolist()
        valid_seq = [s.strip() for s in sequences if s.strip()]
        if not valid_seq:
            # Nothing to draw; the clone list above is still shown.
            continue

        if len(valid_seq) == 1:
            single = valid_seq[0]
            freq = pd.DataFrame(0, index=range(len(single)), columns=amino_acids)
            for i, aa in enumerate(single):
                if aa in amino_acids:
                    freq.at[i, aa] = 1
        else:
            fasta_file = os.path.join(settings["output_dir"], f"{prefix}_cluster_{cluster_id}_seqs.fasta")
            aligned_file = os.path.join(settings["output_dir"], f"{prefix}_cluster_{cluster_id}_aligned.fasta")
            save_sequences_to_fasta(valid_seq, fasta_file)
            aligned_file = run_clustal_omega(fasta_file, aligned_file)
            if aligned_file is None:
                continue
            freq = create_frequency_matrix_from_alignment(aligned_file)
            if freq.empty:
                continue

        fig, ax = plt.subplots(figsize=(8,4))
        figures.append(fig)
        logomaker.Logo(freq, ax=ax)
        ax.set_title(f"Cluster {cluster_id} Logo", fontsize=14)

        fig_canv = FigureCanvasTkAgg(fig, master=cframe)
        fig_canv.draw()
        fig_canv.get_tk_widget().pack(fill="both", expand=True, padx=5, pady=5)

        # f=fig binds the current figure; a plain closure would see the last one.
        save_btn = tk.Button(cframe, text="Save Logo", command=lambda f=fig: save_logo(f))
        save_btn.pack(pady=5)

    canvas.pack(side="left", fill="both", expand=True)
    scrollbar.pack(side="right", fill="y")

def display_dendrogram(Z, evalue_log):
    """Show the dendrogram for linkage matrix *Z* in a new window.

    Args:
        Z: linkage matrix as returned by ``scipy.cluster.hierarchy.linkage``.
        evalue_log: DataFrame whose index supplies the leaf labels.

    Leaf labels are hidden when ``settings["show_labels"]`` is false. If the
    number of index entries does not match the number of leaves in *Z*, the
    labels are dropped (SciPy then numbers the leaves) instead of letting
    ``dendrogram`` fail.
    """
    dendro_win = Toplevel(root)
    dendro_win.title("Dendrogram Visualization")

    # A linkage matrix with n-1 rows describes n leaves.
    labels = list(evalue_log.index)
    if len(labels) != len(Z) + 1:
        labels = None

    fig, ax = plt.subplots(figsize=(12,6))
    dendrogram(
        Z,
        labels=labels,
        no_labels=not settings.get("show_labels", True),
        leaf_rotation=90,
        ax=ax,
    )
    ax.set_title("Dendrogram of Log-Transformed E-values", fontsize=16)
    ax.set_xlabel("Clones", fontsize=14)
    ax.set_ylabel("Distance", fontsize=14)
    fig.tight_layout()

    canv = FigureCanvasTkAgg(fig, master=dendro_win)
    canv.draw()

    # The toolbar packs itself; pack the canvas once, after it, so the
    # canvas takes the remaining space.
    toolbar = NavigationToolbar2Tk(canv, dendro_win)
    toolbar.update()
    canv.get_tk_widget().pack(fill=tk.BOTH, expand=True)

    def save_dendro():
        path = filedialog.asksaveasfilename(defaultextension=".png", filetypes=[("PNG files","*.png")])
        if path:
            fig.savefig(path)
            messagebox.showinfo("Saved", f"Dendrogram saved to: {path}")
    tk.Button(dendro_win, text="Save Dendrogram", command=save_dendro).pack(pady=5)

    def close_window():
        # pyplot holds a reference to the figure until it is closed.
        plt.close(fig)
        dendro_win.destroy()
    dendro_win.protocol("WM_DELETE_WINDOW", close_window)

########################
# PIPELINE
########################
def _read_sequence_table(path):
    """Load one input file: Excel by extension, otherwise CSV with a guessed encoding."""
    ext = os.path.splitext(path)[1].lower()
    if ext in (".xls", ".xlsx", ".xlsm"):
        return pd.read_excel(path)
    with open(path, 'rb') as f:
        raw_data = f.read(10000)
    enc = chardet.detect(raw_data)['encoding'] or "latin-1"
    return pd.read_csv(path, encoding=enc)


def _run_external(cmd):
    """Run *cmd* with captured text output, without a console window on Windows."""
    return subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        check=True,
        # CREATE_NO_WINDOW exists only on Windows; 0 is valid everywhere.
        creationflags=getattr(subprocess, "CREATE_NO_WINDOW", 0),
    )


def process_pipeline():
    """Run the whole analysis: read inputs, BLAST all-vs-all, cluster.

    Steps:
        1. Read every file listed (comma separated) in the file entry box and
           collect (name, sequence) pairs from the configured columns.
        2. Write them to ``sequences.fasta`` in the output directory.
        3. Build a BLAST protein database and run blastp of the sequences
           against it.
        4. Pivot the hits into a query x subject e-value matrix, take
           ``-log10`` and hierarchically cluster the rows.
        5. Cut the tree by cluster count or by distance, per the settings.

    Every failure is reported in a message box and makes the function
    return ``None``.

    Returns:
        ``(evalue_log, df_with_sequences, n_clusters, Z)`` where *evalue_log*
        holds exactly the rows that were clustered (so its index lines up
        with the leaves of the linkage matrix *Z*), or ``None`` on failure.
    """
    full_seq_files = [f.strip() for f in entry_files.get().split(",") if f.strip()]
    if not full_seq_files:
        messagebox.showerror("Error", "Please select at least one file.")
        return None

    # Settings are stored as free text; validate them before any slow work.
    try:
        evalue_fill = float(settings["evalue_threshold"])
        if settings["manual_clusters"]:
            k = int(settings["num_clusters"])
        else:
            distance_threshold = float(settings["distance_threshold"])
    except (TypeError, ValueError) as e:
        messagebox.showerror("Error", f"Invalid numeric setting:\n{e}")
        return None

    seq_col = settings["sequence_column"].upper()
    name_col = settings["clone_column"].upper()

    all_sequences = []
    for file in full_seq_files:
        try:
            df = _read_sequence_table(file)
        except Exception as e:
            messagebox.showerror("Error", f"Could not read file {file}:\n{str(e)}")
            return None

        # str(): headers are not guaranteed to be strings (e.g. numeric headers).
        df.columns = [str(c).strip().upper() for c in df.columns]

        if seq_col not in df.columns or name_col not in df.columns:
            messagebox.showerror("Error", f"Invalid file format in {file}. Must have '{name_col}' and '{seq_col}'.")
            return None

        # Empty cells are NaN; casting them to str would yield the bogus
        # sequence "NAN", so drop those rows first.
        df = df.dropna(subset=[seq_col])
        names = df[name_col].astype(str).str.strip().str.upper()
        seqs = df[seq_col].astype(str).str.strip().str.upper()
        all_sequences.extend((n, s) for n, s in zip(names, seqs) if s)

    if not all_sequences:
        messagebox.showerror("Error", "No valid sequences found.")
        return None

    # The settings dialog can change output_dir after start-up.
    if settings["output_dir"]:
        os.makedirs(settings["output_dir"], exist_ok=True)

    # Write FASTA
    fasta_path = os.path.join(settings["output_dir"], "sequences.fasta")
    with open(fasta_path, "w") as ff:
        ff.writelines(f">{n}\n{s}\n" for n, s in all_sequences)
    text_output.insert(tk.END, "FASTA file created successfully.\n")

    # makeblastdb
    try:
        res = _run_external([
            settings["makeblastdb_path"], "-in", fasta_path, "-dbtype", "prot",
            "-out", settings["blast_db"]
        ])
    except subprocess.CalledProcessError as e:
        messagebox.showerror("Error", f"makeblastdb failed:\n{e.stderr}")
        return None
    except OSError as e:
        messagebox.showerror("Error", f"makeblastdb could not be started:\n{e}")
        return None
    text_output.insert(tk.END, f"makeblastdb output:\n{res.stdout}\n")

    # blastp
    blast_out = os.path.join(settings["output_dir"], "results.txt")
    blast_cmd = [settings["blastp_path"]]
    if settings["blast_task"] == "blastp-short":
        blast_cmd.extend(["-task", "blastp-short"])
    blast_cmd.extend([
        "-query", fasta_path,
        "-db", settings["blast_db"],
        "-outfmt", settings["output_format"],
        "-out", blast_out
    ])
    try:
        _run_external(blast_cmd)
    except subprocess.CalledProcessError as e:
        messagebox.showerror("Error", f"blastp failed:\n{e.stderr}")
        return None
    except OSError as e:
        messagebox.showerror("Error", f"blastp could not be started:\n{e}")
        return None
    text_output.insert(tk.END, f"blastp ({settings['blast_task']}) completed.\n")

    # read results
    if not os.path.exists(blast_out) or os.stat(blast_out).st_size == 0:
        messagebox.showerror("Error", "No BLAST hits found. Aborting clustering.")
        return None

    df_blast = pd.read_csv(blast_out, sep='\t', header=None)
    df_blast.columns = [
        'query','subject','percent_identity','alignment_length','mismatches',
        'gap_opens','q_start','q_end','s_start','s_end','evalue','bit_score'
    ]
    df_blast['query']   = df_blast['query'].astype(str).str.strip().str.upper()
    df_blast['subject'] = df_blast['subject'].astype(str).str.strip().str.upper()

    # One query/subject pair may yield several hits with different e-values,
    # which makes DataFrame.pivot raise. Keep the best (smallest) e-value
    # per pair instead.
    sim_mat = df_blast.groupby(['query', 'subject'])['evalue'].min().unstack('subject')
    # NOTE(review): an e-value of exactly 0 (the strongest possible hit) is
    # mapped to the same fill value as a missing hit - confirm this is intended.
    sim_mat = sim_mat.replace(0, evalue_fill).fillna(evalue_fill)

    # evalue_log
    evalue_log = -np.log10(sim_mat)

    # unify (names and sequences were already stripped and upper-cased above)
    df_seq = pd.DataFrame(all_sequences, columns=['Name','Sequence'])

    filtered_evalue_log = evalue_log.loc[evalue_log.index.intersection(df_seq['Name'])]
    if filtered_evalue_log.shape[0] < 2:
        messagebox.showerror("Error", f"Not enough sequences for clustering. Found only {filtered_evalue_log.shape[0]} sample(s).")
        return None

    # Linkage
    Z = linkage(filtered_evalue_log, method=settings["clustering_method"])

    # Decide how to cluster => distance-based or maxclust
    if settings["manual_clusters"]:
        # user wants a fixed # of clusters
        text_output.insert(tk.END, f"Using user-specified k={k}\n")
        cluster_labels = fcluster(Z, t=k, criterion='maxclust')
    else:
        # user wants distance-based cut
        text_output.insert(tk.END, f"Using distance-based cut at {distance_threshold}\n")
        cluster_labels = fcluster(Z, t=distance_threshold, criterion='distance')

    # Build final df
    df_clusters = pd.DataFrame({
        'query': filtered_evalue_log.index,
        'Cluster': cluster_labels
    })

    # Report how many clusters the cut actually produced.
    unique_clusters = len(np.unique(cluster_labels))
    text_output.insert(tk.END, f"Formed {unique_clusters} cluster(s).\n")

    df_with_sequences = df_seq.merge(df_clusters, left_on='Name', right_on='query', how='right')

    # Return the filtered matrix: its rows are exactly the leaves of Z, so
    # callers can use its index as dendrogram labels.
    return filtered_evalue_log, df_with_sequences, unique_clusters, Z

########################
# Button Handlers
########################
def generate_logos():
    """Button handler: run the pipeline and show per-cluster sequence logos.

    The progress bar is always stopped, even when the pipeline or the
    display step raises; unexpected errors are shown in a message box.
    """
    progress.start()
    try:
        result = process_pipeline()
        if result is None:
            # process_pipeline already told the user what went wrong.
            return
        _evalue_log, df_with_sequences, k, _Z = result
        display_logos(df_with_sequences, k, prefix="sequence")
    except Exception as e:
        # Top-level GUI boundary: report instead of failing silently.
        messagebox.showerror("Error", f"Logo generation failed:\n{e}")
    finally:
        progress.stop()

def generate_dendrogram():
    """Button handler: run the pipeline and show the clustering dendrogram.

    The progress bar is always stopped, even when the pipeline or the
    display step raises; unexpected errors are shown in a message box.
    """
    progress.start()
    try:
        result = process_pipeline()
        if result is None:
            # process_pipeline already told the user what went wrong.
            return
        evalue_log, _df_with_sequences, _k, Z = result
        display_dendrogram(Z, evalue_log)
    except Exception as e:
        # Top-level GUI boundary: report instead of failing silently.
        messagebox.showerror("Error", f"Dendrogram generation failed:\n{e}")
    finally:
        progress.stop()

########################
# GUI Utility
########################
def browse_files():
    """Let the user pick input files and put them in the file entry box.

    The chosen paths replace the entry's content, joined with ", ".
    Cancelling the dialog leaves the entry untouched.
    """
    type_filters = [
        ("CSV files","*.csv"),
        ("Excel files","*.xls"),
        ("Excel files","*.xlsx"),
        ("Excel files","*.xlsm"),
        ("All Files","*.*"),
    ]
    chosen = filedialog.askopenfilenames(filetypes=type_filters)
    if not chosen:
        return
    entry_files.delete(0, tk.END)
    entry_files.insert(0, ", ".join(chosen))

def clear_output():
    """Erase everything currently shown in the main output text box."""
    first_char, last_char = '1.0', tk.END
    text_output.delete(first_char, last_char)

def show_data():
    """Preview the first selected input file in a table window.

    Only the first path in the file entry box is shown. The file must
    contain the configured sequence and clone columns (matched after
    stripping and upper-casing the headers). The preview window is created
    only once the file has been read and validated, so a failure never
    leaves an empty window behind.
    """
    fns = [x.strip() for x in entry_files.get().split(",") if x.strip()]
    if not fns:
        messagebox.showerror("Error", "No file selected.")
        return

    file = fns[0]
    try:
        ext = os.path.splitext(file)[1].lower()
        if ext in [".xls",".xlsx",".xlsm"]:
            df = pd.read_excel(file)
        else:
            with open(file,'rb') as f:
                raw_data = f.read(10000)
            guess = chardet.detect(raw_data)
            enc = guess['encoding'] if guess['encoding'] else "latin-1"
            df = pd.read_csv(file, encoding=enc)
    except Exception as e:
        messagebox.showerror("Error", f"Could not read file:\n{str(e)}")
        return

    # str(): headers are not guaranteed to be strings (e.g. numeric headers).
    df.columns = [str(c).strip().upper() for c in df.columns]

    seq_col = settings["sequence_column"].upper()
    name_col = settings["clone_column"].upper()
    if seq_col not in df.columns or name_col not in df.columns:
        messagebox.showerror("Error", f"File {file} missing {seq_col} or {name_col} columns.")
        return

    data_window = Toplevel(root)
    data_window.title("Data Preview")

    tree = ttk.Treeview(data_window, columns=list(df.columns), show="headings")
    for c in df.columns:
        tree.heading(c, text=c)
        tree.column(c, width=150)
    # itertuples keeps each column's own dtype; iterrows would coerce a
    # whole row to one common type (e.g. ints shown as floats).
    for record in df.itertuples(index=False, name=None):
        tree.insert("", "end", values=list(record))

    vscroll = ttk.Scrollbar(data_window, orient="vertical", command=tree.yview)
    tree.configure(yscrollcommand=vscroll.set)
    vscroll.pack(side="right", fill="y")
    tree.pack(side="left", fill="both", expand=True)

########################
# SETTINGS WINDOW
########################
def open_settings():
    """Open the scrollable "Pipeline Settings" dialog.

    Every field is pre-filled from the global ``settings`` dict. Pressing
    "Save Settings" validates the numeric fields, creates the output
    directory if needed, and only then writes all values back to
    ``settings`` and closes the dialog; on invalid input nothing is changed
    and the dialog stays open.
    """
    settings_window = Toplevel(root)
    settings_window.title("Pipeline Settings")

    canvas = tk.Canvas(settings_window, width=500, height=800)
    scrollbar = ttk.Scrollbar(settings_window, orient="vertical", command=canvas.yview)
    scrollable_frame = ttk.Frame(canvas)

    scrollable_frame.bind("<Configure>", lambda e: canvas.configure(scrollregion=canvas.bbox("all")))
    canvas.create_window((0,0), window=scrollable_frame, anchor="nw")
    canvas.configure(yscrollcommand=scrollbar.set)

    # Mouse-wheel scrolling (Windows delta convention). Bound on the dialog
    # itself rather than with bind_all, so the handler disappears with the
    # window instead of later firing against a destroyed canvas.
    def _on_mousewheel(event):
        canvas.yview_scroll(-1 * (event.delta // 120), "units")
    settings_window.bind("<MouseWheel>", _on_mousewheel)

    row = 0

    def add_hint(text):
        """Place a gray explanatory line spanning both columns."""
        nonlocal row
        tk.Label(scrollable_frame, text=text, fg="gray").grid(row=row, column=0, columnspan=2, sticky="w")
        row += 1

    def add_field(label, widget, hint=None):
        """Place a label/widget pair on the next row, optionally followed by a hint."""
        nonlocal row
        tk.Label(scrollable_frame, text=label).grid(row=row, column=0, padx=5, pady=5, sticky="w")
        widget.grid(row=row, column=1, padx=5, pady=5)
        row += 1
        if hint is not None:
            add_hint(hint)
        return widget

    def add_entry(label, value, hint=None):
        """Add a text entry pre-filled with *value*."""
        entry = tk.Entry(scrollable_frame)
        entry.insert(0, str(value))
        return add_field(label, entry, hint)

    def add_combo(label, choices, value, hint=None):
        """Add a combobox offering *choices*, pre-set to *value*."""
        combo = ttk.Combobox(scrollable_frame, values=choices)
        combo.set(value)
        return add_field(label, combo, hint)

    def add_check(text, variable):
        """Add a checkbox spanning both columns."""
        nonlocal row
        tk.Checkbutton(scrollable_frame, text=text, variable=variable).grid(row=row, column=0, columnspan=2, pady=5)
        row += 1

    db_entry = add_entry("BLAST Database Name:", settings["blast_db"],
                         "The database used for BLAST alignment.")
    outfmt_entry = add_entry("BLAST Output Format:", settings["output_format"],
                             "Format for BLAST output (6=tabular).")
    evalue_entry = add_entry("BLAST e-value:", settings["evalue_threshold"],
                             "Lower => stricter matches.")
    max_target_entry = add_entry("Max Target Sequences:", settings["max_target_seqs"],
                                 "Max results per query in BLAST.")
    word_size_entry = add_entry("BLAST Word Size:", settings["word_size"],
                                "Larger => faster, less sensitive.")

    blast_task_frame = ttk.Frame(scrollable_frame)
    blast_task_frame.grid(row=row, column=0, columnspan=2, pady=5, sticky="w")
    use_blastp_short_var = tk.BooleanVar(value=(settings["blast_task"]=="blastp-short"))
    tk.Checkbutton(blast_task_frame, text="Use blastp-short mode", variable=use_blastp_short_var).pack(side="left")
    tk.Label(blast_task_frame, text="(For queries <30aa)", fg="gray").pack(side="left", padx=5)
    row += 1

    seq_col_entry = add_entry("Sequence Column:", settings.get("sequence_column", "CDRH3"))
    clone_col_entry = add_entry("Clone Column:", settings.get("clone_column", "CLONE"))

    cluster_method_entry = add_combo("Clustering Method:", ["average","complete","single"],
                                     settings["clustering_method"],
                                     "E.g. average, complete, single, etc.")
    distance_metric_entry = add_combo("Distance Metric:", ["euclidean","manhattan","cosine","hamming"],
                                      settings["distance_metric"],
                                      "Usually not used if NxN log data is already distance-like.")

    num_threads_entry = add_entry("Number of Threads:", settings["num_threads"],
                                  "For BLAST on multi-core CPUs.")
    output_dir_entry = add_entry("Output Directory:", settings["output_dir"],
                                 "Folder where results will be saved.")

    manual_clusters_var = tk.BooleanVar(value=settings["manual_clusters"])
    add_check("Use manual cluster count (maxclust)", manual_clusters_var)

    num_clusters_entry = add_entry("Number of Clusters:", settings.get("num_clusters", 6))
    distance_threshold_entry = add_entry("Distance Threshold (if not using manual clusters):",
                                         settings.get("distance_threshold","3.0"),
                                         "Cut the dendrogram at this distance => #clusters auto")

    show_labels_var = tk.BooleanVar(value=settings["show_labels"])
    add_check("Show Labels in Dendrogram", show_labels_var)

    def open_blast_link(event):
        webbrowser.open("https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs")
    blast_link = tk.Label(scrollable_frame, text="More about BLAST", fg="blue", cursor="hand2")
    blast_link.grid(row=row, column=0, columnspan=2, pady=5)
    blast_link.bind("<Button-1>", open_blast_link)
    row += 1

    def save_settings():
        # Validate everything that is parsed as a number later BEFORE
        # touching `settings`, so a typo cannot leave it half-updated.
        try:
            num_clusters = int(num_clusters_entry.get())
            float(distance_threshold_entry.get())
            float(evalue_entry.get())
        except ValueError as e:
            messagebox.showerror("Invalid Setting", f"Please enter valid numbers:\n{e}", parent=settings_window)
            return
        if num_clusters < 1:
            messagebox.showerror("Invalid Setting", "Number of Clusters must be at least 1.", parent=settings_window)
            return

        # Results are written here on the next run; create it up front.
        output_dir = output_dir_entry.get()
        if output_dir:
            try:
                os.makedirs(output_dir, exist_ok=True)
            except OSError as e:
                messagebox.showerror("Invalid Setting", f"Cannot create output directory:\n{e}", parent=settings_window)
                return

        settings["blast_db"] = db_entry.get()
        settings["output_format"] = outfmt_entry.get()
        settings["evalue_threshold"] = evalue_entry.get()
        settings["max_target_seqs"] = max_target_entry.get()
        settings["word_size"] = word_size_entry.get()
        settings["blast_task"] = "blastp-short" if use_blastp_short_var.get() else "blastp"

        settings["sequence_column"] = seq_col_entry.get().strip().upper()
        settings["clone_column"] = clone_col_entry.get().strip().upper()
        settings["clustering_method"] = cluster_method_entry.get()
        settings["distance_metric"] = distance_metric_entry.get()
        settings["num_threads"] = num_threads_entry.get()
        settings["output_dir"] = output_dir

        settings["manual_clusters"] = manual_clusters_var.get()
        settings["num_clusters"] = num_clusters
        settings["distance_threshold"] = distance_threshold_entry.get()
        settings["show_labels"] = show_labels_var.get()

        settings_window.destroy()
        messagebox.showinfo("Settings Saved", "New settings have been applied.")

    tk.Button(scrollable_frame, text="Save Settings", command=save_settings, bg="blue", fg="white").grid(row=row, column=0, columnspan=2, pady=10)

    canvas.pack(side="left", fill="both", expand=True)
    scrollbar.pack(side="right", fill="y")

    settings_window.geometry("500x850")

########################
# MAIN GUI
########################
# Main window.
root = tk.Tk()
root.title("Automated Sequence Analysis Tool")

# Menu bar: File > Settings, Edit > Clear Output.
menu_bar = tk.Menu(root)
file_menu = tk.Menu(menu_bar, tearoff=0)
edit_menu = tk.Menu(menu_bar, tearoff=0)
file_menu.add_command(label="Settings", command=open_settings)
edit_menu.add_command(label="Clear Output", command=clear_output)
menu_bar.add_cascade(label="File", menu=file_menu)
menu_bar.add_cascade(label="Edit", menu=edit_menu)
root.config(menu=menu_bar)

# Heading and usage instructions.
tk.Label(root, text="Sequence Analysis Pipeline", font=("Arial",14,"bold")).pack(pady=5)
_instructions = "\n".join((
    "Instructions:",
    "1. Select one or more CSV/Excel files containing your sequence data.",
    "   (Ensure columns for clone name and sequence exist, see 'Settings').",
    "2. Click 'Browse' to select file(s).",
    "3. Click 'Show Data' to preview.",
    "4. Click 'Generate Logos' => runs pipeline + shows logos.",
    "5. Click 'Generate Dendrogram' => shows hierarchical merges.",
    "6. In 'Settings', pick either a manual cluster count (maxclust) or a distance cut.",
))
tk.Label(root, text=_instructions, font=("Arial",10), justify=tk.LEFT).pack(pady=5)

# Comma-separated list of the selected input files.
entry_files = tk.Entry(root, width=80)
entry_files.pack(padx=5)

# Action buttons, stacked top to bottom in this order.
for _caption, _handler, _colors in (
    ("Browse", browse_files, {}),
    ("Show Data", show_data, {"bg": "blue", "fg": "white"}),
    ("Generate Logos", generate_logos, {"bg": "green", "fg": "white"}),
    ("Generate Dendrogram", generate_dendrogram, {"bg": "orange", "fg": "white"}),
):
    tk.Button(root, text=_caption, command=_handler, width=20, **_colors).pack(pady=5)

# Activity indicator started and stopped by the two "Generate" handlers.
progress = ttk.Progressbar(root, mode="indeterminate")
progress.pack(pady=5, fill=tk.X)

# Log area for pipeline status messages.
text_output = tk.Text(root, height=15, width=100)
text_output.pack(padx=10, pady=10)

root.mainloop()


Base path: C:\Users\franc\Desktop\MastersYear3-Thesis\UserInterface
clustalo_exe_path: C:\Users\franc\Desktop\MastersYear3-Thesis\UserInterface\clustal\clustalo.exe
blastp_path: C:\Users\franc\Desktop\MastersYear3-Thesis\UserInterface\blast\blastp.exe
