In [1]:
!pip install -q torch torch-geometric pandas duckdb pyarrow networkx gradio -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m436.2 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [12]:
import gradio as gr
import pandas as pd
import random
import time
import os
import tempfile  # Import tempfile for creating temporary files
import sys  # Import sys to modify Python path
import abc  # Import abc for abstract base classes
from urllib.parse import urlparse  # Added for URLProcessor's concrete implementation
import logging  # Added for URLProcessor's concrete implementation

# --- Google Colab Drive Mounting ---
# NOTE: this cell only works inside Google Colab; `google.colab` is not
# importable elsewhere.
from google.colab import drive

# Mount Google Drive at /content/drive (remount is forced on every run).
drive.mount("/content/drive", force_remount=True)

# Add the project root to the Python path so the `src.*` packages imported
# further down can be found there (assumes the project lives at this Drive
# path -- adjust if the folder is located elsewhere).
project_root = "/content/drive/My Drive/WebKnoGraph"
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# --- Import Real Classes from WebKnoGraph Project ---
# This block will now strictly import your actual classes.
# If any of these imports fail, the script will stop.
import torch
import json
from src.backend.config.link_prediction_config import LinkPredictionConfig
from src.backend.models.graph_models import GraphSAGEModel
from src.shared.interfaces import (
    ILogger as OriginalILogger,
)  # Alias to avoid name conflict


# Concrete, stdout-backed implementation of the OriginalILogger interface
class ConsoleLogger(OriginalILogger):
    """Logger that prints each message to stdout behind a severity prefix."""

    @staticmethod
    def _write_line(level: str, message: str):
        """Print ``message`` prefixed with ``level`` and a colon."""
        print(f"{level}: {message}")

    def info(self, message: str):
        self._write_line("INFO", message)

    def error(self, message: str):
        self._write_line("ERROR", message)

    def debug(self, message: str):
        self._write_line("DEBUG", message)

    def warning(self, message: str):
        self._write_line("WARNING", message)

    def exception(self, message: str):
        # Only the message is printed; no traceback is attached.
        self._write_line("EXCEPTION", message)


# The rest of the notebook refers to the logger type as ILogger
ILogger = ConsoleLogger

# Import the real URLProcessor
from src.backend.utils.url_processing import URLProcessor


# --- Real RecommendationEngine Class (as provided by user) ---
class RecommendationEngine:
    """Loads trained artifacts and provides link recommendations using a Top-K strategy.

    The artifacts (node mapping JSON, node embeddings, edge index and model
    weights) are read lazily from the paths exposed by ``config`` the first
    time recommendations are requested and are then kept in memory.
    """

    def __init__(
        self,
        config: "LinkPredictionConfig",
        logger: "ILogger",
        url_processor: "URLProcessor",
    ):
        """Store the collaborators; nothing is loaded until ``load_artifacts``.

        Args:
            config: Object exposing ``node_mapping_path``,
                ``node_embeddings_path``, ``edge_index_path`` and
                ``model_state_path``.
            logger: Object exposing ``info`` and ``error``.
            url_processor: Object exposing ``get_folder_depth(url)``.
        """
        self.config = config
        self.logger = logger
        self.url_processor = url_processor
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None
        self.node_embeddings = None
        self.url_to_idx = None
        self.idx_to_url = None
        self.existing_edges = None

    def load_artifacts(self):
        """Loads the trained model, embeddings, and mappings into memory.

        Everything is loaded into local variables first and only published
        on ``self`` once every step has succeeded. A failure half-way through
        (for example a corrupt state dict) therefore leaves the engine
        unloaded instead of leaving ``self.model`` set to an uninitialised
        network that later calls would treat as "already loaded".

        Returns:
            True when the artifacts are available, False when loading failed.
        """
        if self.model is not None:
            self.logger.info("Artifacts already loaded.")
            return True

        self.logger.info("Loading trained artifacts for recommendations...")
        try:
            # Ensure the directory exists before attempting to open files.
            # A bare filename has an empty dirname, which means "current
            # directory" and must not be reported as missing.
            model_dir = os.path.dirname(self.config.node_mapping_path)
            if model_dir and not os.path.exists(model_dir):
                raise FileNotFoundError(f"Model directory not found: {model_dir}")

            with open(self.config.node_mapping_path, "r") as f:
                model_metadata = json.load(f)

            url_to_idx = model_metadata["url_to_idx"]
            in_channels = model_metadata["in_channels"]
            hidden_channels = model_metadata["hidden_channels"]
            out_channels = model_metadata["out_channels"]
            idx_to_url = {v: k for k, v in url_to_idx.items()}

            node_embeddings = torch.load(
                self.config.node_embeddings_path, map_location=self.device
            ).to(self.device)
            edge_index = torch.load(
                self.config.edge_index_path, map_location=self.device
            )
            existing_edges = set(zip(edge_index[0].tolist(), edge_index[1].tolist()))

            model = GraphSAGEModel(in_channels, hidden_channels, out_channels)
            model.load_state_dict(
                torch.load(self.config.model_state_path, map_location=self.device)
            )
            model.to(self.device)
            model.eval()
        except FileNotFoundError as fnf_e:
            self.logger.error(
                f"Could not find trained model artifacts. Please run the training pipeline first. Error: {fnf_e}"
            )
            print(
                f"DEBUG: FileNotFoundError during artifact loading: {fnf_e}"
            )  # Added for debugging
            return False
        except Exception as e:
            # Broad on purpose: this is the boundary towards the UI, which
            # expects a boolean rather than an exception.
            self.logger.error(f"An error occurred while loading artifacts: {e}")
            print(
                f"DEBUG: General Exception during artifact loading: {e}"
            )  # Added for debugging
            return False

        # Publish only after every step succeeded. ``self.model`` goes last
        # because it is the "already loaded" marker checked above.
        self.url_to_idx = url_to_idx
        self.idx_to_url = idx_to_url
        self.node_embeddings = node_embeddings
        self.existing_edges = existing_edges
        self.model = model

        self.logger.info("Artifacts loaded successfully.")
        return True

    def get_recommendations(
        self,
        source_url: str,
        top_n: int = 20,
        min_folder_depth: int = 0,
        max_folder_depth: int = 10,
        folder_path_filter: str = None,
    ):
        """Return the ``top_n`` highest scoring new link targets for ``source_url``.

        Every node of the graph is scored as a destination. Self-links, links
        already present in the edge index, URLs outside the folder depth range
        and (optionally) URLs not starting with ``folder_path_filter`` are
        discarded before ranking.

        Args:
            source_url: URL to recommend links from; must be a key of the
                loaded ``url_to_idx`` mapping.
            top_n: Maximum number of rows to return.
            min_folder_depth: Inclusive lower bound on the folder depth.
            max_folder_depth: Inclusive upper bound on the folder depth.
            folder_path_filter: Optional prefix the recommended URL must have.

        Returns:
            A ``(dataframe, message)`` tuple:
            ``(None, error)`` when the artifacts or the source URL are missing,
            ``(empty DataFrame, message)`` when nothing survives the filters,
            otherwise ``(DataFrame[RECOMMENDED_URL, SCORE, FOLDER_DEPTH], None)``
            sorted by descending score.
        """
        # The load_artifacts call is crucial here. If it returns False, we return None.
        if not self.load_artifacts():
            return (
                None,
                "Error: Trained model artifacts not found. Please run the training pipeline first.",
            )
        if source_url not in self.url_to_idx:
            return (
                None,
                f"Error: Source URL '{source_url}' not found in the graph's training data.",
            )

        source_idx = self.url_to_idx[source_url]
        num_nodes = len(self.url_to_idx)

        # 1. Score every possible link (source -> node i) in a single batch
        dest_indices = torch.arange(num_nodes, device=self.device)
        source_indices = torch.full_like(dest_indices, fill_value=source_idx)
        candidate_edge_index = torch.stack([source_indices, dest_indices])

        with torch.no_grad():
            scores = self.model.predict_link(self.node_embeddings, candidate_edge_index)

        # 2. One row per candidate destination
        candidates = pd.DataFrame(
            {
                "DEST_IDX": dest_indices.cpu().numpy(),
                "SCORE": torch.sigmoid(scores).cpu().numpy(),
            }
        )

        # 3. Attach URL and folder depth; indices without a URL are dropped
        candidates["RECOMMENDED_URL"] = candidates["DEST_IDX"].apply(
            self.idx_to_url.get
        )
        candidates = candidates.dropna(subset=["RECOMMENDED_URL"])
        candidates["FOLDER_DEPTH"] = candidates["RECOMMENDED_URL"].apply(
            self.url_processor.get_folder_depth
        )

        # 4. Build one boolean mask for all criteria. Using masks instead of
        #    helper columns on a filtered slice avoids SettingWithCopyWarning.
        dest_column = candidates["DEST_IDX"]
        is_new_link = pd.Series(
            [(source_idx, int(dest)) not in self.existing_edges for dest in dest_column],
            index=candidates.index,
            dtype=bool,
        )
        keep = (
            (dest_column != source_idx)
            & is_new_link
            & (candidates["FOLDER_DEPTH"] >= min_folder_depth)
            & (candidates["FOLDER_DEPTH"] <= max_folder_depth)
        )

        if folder_path_filter:
            self.logger.info(f"Applying folder path filter: {folder_path_filter}")
            keep = keep & candidates["RECOMMENDED_URL"].str.startswith(
                folder_path_filter
            )

        # 5. Rank the survivors and keep the best top_n
        final_recommendations_df = (
            candidates[keep].sort_values(by="SCORE", ascending=False).head(top_n)
        )

        # 6. Expose only the public columns
        final_recommendations_df = final_recommendations_df[
            ["RECOMMENDED_URL", "SCORE", "FOLDER_DEPTH"]
        ]

        if final_recommendations_df.empty:
            return (
                pd.DataFrame(),  # Return empty DataFrame for consistency
                "No recommendations found matching the criteria (filters, existing links, etc.). Try adjusting filters or source URL.",
            )

        return final_recommendations_df, None


# --- Gradio Application Logic ---

# Instantiate real classes
# These module-level objects are created once when the cell runs and are
# reused by process_csv_for_recommendations for every request.
logger = ILogger()  # Now instantiates the concrete ConsoleLogger
url_processor = URLProcessor()
config = LinkPredictionConfig()  # This will now use the updated paths
recommendation_engine = RecommendationEngine(config, logger, url_processor)


# Column order shared by every table the CSV handler returns, so placeholder
# and result tables always line up with the headers of the output component.
_RESULT_COLUMNS = [
    "NEW_FROM",
    "NEW_FROM_DEPTH",
    "NEW_TO",
    "NEW_TO_DEPTH",
    "Candidate Score",
    "Status",
]


def _status_table(message):
    """Build a one-row placeholder table whose Status cell carries ``message``."""
    return pd.DataFrame([["", "", "", "", "", message]], columns=_RESULT_COLUMNS)


def _error_status(message):
    """Prefix ``message`` with "Error: " unless it already starts with it."""
    text = str(message)
    return text if text.startswith("Error:") else f"Error: {text}"


def process_csv_for_recommendations(csv_file, min_depth: int, max_depth: int):
    """
    Gradio function to process the uploaded CSV and generate recommendations.
    Returns the DataFrame for display and the path to the saved CSV for download.

    For every input row the URL in ``NEW_TO`` is sent to the module-level
    ``recommendation_engine`` (top 50, restricted to the given folder depth
    range) and one candidate is drawn at random, weighted by its score. The
    candidate and its folder depth replace ``NEW_FROM`` / ``NEW_FROM_DEPTH``
    in the output; ``NEW_TO`` / ``NEW_TO_DEPTH`` are passed through.

    Args:
        csv_file: Path of the uploaded CSV, either a plain ``str`` or an
            object exposing the path as ``.name``; ``None`` if nothing was
            uploaded.
        min_depth: Inclusive minimum folder depth; ``None`` falls back to 0.
        max_depth: Inclusive maximum folder depth; ``None`` falls back to 10.

    Returns:
        ``(dataframe, csv_path)``. On any failure the dataframe is a single
        placeholder row describing the problem and ``csv_path`` is ``None``.
    """
    if csv_file is None:
        return _status_table("Please upload a CSV file."), None

    # A cleared number field arrives as None; use the UI defaults instead of
    # failing every row on a comparison against None.
    if min_depth is None:
        min_depth = 0
    if max_depth is None:
        max_depth = 10

    # Accept both a plain path string and a file-like wrapper with `.name`.
    csv_path = getattr(csv_file, "name", csv_file)

    try:
        df_input = pd.read_csv(csv_path)

        # Validate required columns
        required_cols = ["NEW_FROM", "NEW_FROM_DEPTH", "NEW_TO", "NEW_TO_DEPTH"]
        missing_cols = [col for col in required_cols if col not in df_input.columns]
        if missing_cols:
            return (
                _status_table(f"Error: Missing columns: {', '.join(missing_cols)}."),
                None,
            )

        results = []
        # Only NEW_TO (the URL to recommend for) and its depth are consumed;
        # the incoming NEW_FROM values are replaced by the model's candidate.
        for original_new_to, new_to_depth_value in zip(
            df_input["NEW_TO"], df_input["NEW_TO_DEPTH"]
        ):
            candidate_url = pd.NA
            candidate_score = pd.NA
            new_from_candidate_depth = pd.NA

            recommendations_df, error_msg = recommendation_engine.get_recommendations(
                source_url=original_new_to,
                top_n=50,
                min_folder_depth=min_depth,  # Pass min depth from UI
                max_folder_depth=max_depth,  # Pass max depth from UI
            )

            if error_msg:
                # Engine messages may already start with "Error:"; do not
                # stack a second prefix on top.
                status = _error_status(error_msg)
            elif recommendations_df is None or recommendations_df.empty:
                status = "No recommendations found by model"
            else:
                # Weighted random selection from model recommendations.
                # Scores are floored at 0.001 so every weight is strictly
                # positive; `assign` works on a copy, leaving the engine's
                # frame untouched.
                weighted = recommendations_df.assign(
                    SCORE=recommendations_df["SCORE"].clip(lower=0.001)
                )
                selected_row = weighted.sample(n=1, weights="SCORE")
                candidate_url = selected_row["RECOMMENDED_URL"].iloc[0]
                candidate_score = selected_row["SCORE"].iloc[0]

                # Calculate depth for the newly recommended URL
                new_from_candidate_depth = url_processor.get_folder_depth(
                    candidate_url
                )
                status = "Success (weighted random)"

            results.append(
                {
                    "NEW_FROM": candidate_url,  # This will be the new candidate URL
                    "NEW_FROM_DEPTH": new_from_candidate_depth,
                    "NEW_TO": original_new_to,
                    "NEW_TO_DEPTH": new_to_depth_value,  # Keep the original depth value here
                    "Candidate Score": candidate_score,
                    "Status": status,
                }
            )

        # Passing the columns keeps the headers even for a header-only CSV.
        df_output = pd.DataFrame(results, columns=_RESULT_COLUMNS)

        # --- Save DataFrame to a temporary CSV file ---
        # delete=False: the file must outlive this call so it can be downloaded.
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".csv", encoding="utf-8", newline=""
        ) as temp_file:
            df_output.to_csv(temp_file, index=False)
            temp_file_path = temp_file.name

        # Return both the DataFrame for display and the path to the saved file
        return df_output, temp_file_path

    except Exception as e:
        # UI boundary: report the problem in the table instead of raising
        return _status_table(f"An unexpected error occurred: {e}"), None


# --- Gradio Interface Definition ---
# Layout: upload + submit button, the two folder-depth bounds, the result
# table, and a download widget that stays hidden until a result file exists.
with gr.Blocks(title="WebKnoGraph Link Recommender") as demo:
    gr.Markdown(
        """
        # WebKnoGraph Link Recommender
        Upload a CSV file with columns: `NEW_FROM`, `NEW_FROM_DEPTH`, `NEW_TO`, `NEW_TO_DEPTH`.
        The system will suggest a new candidate URL for each `NEW_TO` URL,
        storing the result in the `NEW_FROM` column.
        The selection uses a weighted random approach from the model's top recommendations.
        """
    )

    with gr.Row():
        csv_input = gr.File(
            label="Upload CSV File", type="filepath", file_types=[".csv"]
        )
        submit_button = gr.Button("Generate Recommendations")

    with gr.Row():
        # precision=0 restricts both bounds to whole numbers
        min_depth_input = gr.Number(label="Minimum Folder Depth", value=0, precision=0)
        max_depth_input = gr.Number(label="Maximum Folder Depth", value=10, precision=0)

    output_dataframe = gr.DataFrame(
        headers=[
            "NEW_FROM",
            "NEW_FROM_DEPTH",
            "NEW_TO",
            "NEW_TO_DEPTH",
            "Candidate Score",
            "Status",
        ],
        row_count=0,  # Gradio will dynamically adjust row count
        col_count=6,  # Explicitly set column count
        wrap=True,
        interactive=False,
        label="Recommendation Results",
    )

    # Gradio File component for output download
    download_csv_file = gr.File(
        label="Download Results CSV",
        type="filepath",
        file_types=[".csv"],
        visible=False,
    )

    submit_button.click(
        fn=process_csv_for_recommendations,
        inputs=[csv_input, min_depth_input, max_depth_input],  # Pass depth inputs
        outputs=[
            output_dataframe,
            download_csv_file,
        ],  # Outputs both DataFrame and file
        api_name="process_csv",
    ).then(
        # Reveal the download widget only when the previous step produced a
        # file path. On failure the handler returns None, and the widget is
        # hidden again instead of being shown empty.
        lambda x: gr.File(visible=x is not None, value=x),
        inputs=download_csv_file,  # Use the output from the previous step as input here
        outputs=download_csv_file,
    )

if __name__ == "__main__":
    demo.launch()

INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
INFO: Artifacts already loaded.
Mounted at /content/drive
INFO: Loading trained artifacts for recommendations...
INFO: Artifacts loaded successfully.
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

INFO: Artifacts already loaded.
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
INFO: Artifacts alr

INFO: Artifacts already loaded.
