# FINDING WORST PERFORMERS

In [None]:
from google.colab import drive
import pandas as pd
import os

# Mount Google Drive at /content/drive so files under "My Drive" can be read
# through the regular filesystem API.
drive.mount("/content/drive")

# Build the path of the URL analysis CSV; `file_path` is what the analysis
# call further down in this cell passes to get_worst_candidates().
base_path = "/content/drive/My Drive/WebKnoGraph/data"
file_path = os.path.join(base_path, "url_analysis_results.csv")


def get_worst_candidates(file_path, folder_depth_level, n_worst):
    """
    Retrieves the N lowest-PageRank URLs for a specific folder depth level.

    Args:
        file_path (str): The path to the CSV file containing URL analysis results.
        folder_depth_level (int): The 'Folder_Depth' value to filter by.
        n_worst (int): The number of lowest-PageRank URLs to retrieve.

    Returns:
        pandas.DataFrame: A DataFrame containing only the 'URL' column, ordered
                          from the lowest PageRank upwards. It has no rows when
                          nothing matches the requested depth.

    Raises:
        ValueError: If 'URL', 'Folder_Depth' or 'PageRank' is missing from the CSV.
    """
    table = pd.read_csv(file_path)

    # Fail early when the CSV does not have the columns used below.
    missing = {"URL", "Folder_Depth", "PageRank"} - set(table.columns)
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    at_depth = table.loc[table["Folder_Depth"] == folder_depth_level]

    # Ascending sort puts the weakest pages first; keep the first n_worst of them.
    return at_depth.sort_values(by="PageRank", ascending=True).head(n_worst)[["URL"]]


# --- User Inputs ---
# Both prompts are parsed inside the try so that a non-integer answer produces
# a readable message instead of an uncaught ValueError traceback. The analysis
# lives in the `else` branch, so it only runs when both values parsed.
try:
    folder_depth_input = int(input("Enter folder depth level (integer): "))
    n_worst_input = int(input("Enter number of worst candidates to retrieve: "))
except ValueError:
    print("Invalid input. Please enter integers for depth and count.")
else:
    # --- Run Analysis ---
    worst_pages = get_worst_candidates(file_path, folder_depth_input, n_worst_input)

    if worst_pages.empty:
        # Without this message an unmatched depth would print nothing at all.
        print("No worst candidates found based on the provided criteria.")
    else:
        for url in worst_pages["URL"]:
            print(url)

# FINDING BEST PERFORMERS

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so files under "My Drive" can be read
# through the regular filesystem API.
drive.mount("/content/drive")

# Build the path of the URL analysis CSV; `file_path` is what the analysis
# call further down in this cell passes to get_top_performers().
base_path = "/content/drive/My Drive/WebKnoGraph/data"
file_path = os.path.join(base_path, "url_analysis_results.csv")


def get_top_performers(file_path, folder_depth_level, n_top):
    """
    Retrieves the top N performing URLs (based on PageRank) for a specific folder depth level.

    Args:
        file_path (str): The path to the CSV file containing URL analysis results.
        folder_depth_level (int): The desired folder depth level to filter by.
        n_top (int): The number of top performing URLs to retrieve. Must be >= 0.

    Returns:
        pandas.DataFrame: A DataFrame containing only the 'URL' column of the top
                          performers, highest PageRank first. An empty DataFrame
                          (still carrying the 'URL' column) is returned when the
                          file is not found, when n_top is negative, or when no
                          row matches the requested depth.

    Raises:
        ValueError: If 'URL', 'Folder_Depth' or 'PageRank' is missing from the CSV.
    """
    # Every "nothing to report" path returns a frame with the same schema as the
    # success path, so callers can index ["URL"] without checking .empty first.
    no_results = pd.DataFrame(columns=["URL"])

    # DataFrame.head() with a negative n returns all rows *except* the last |n|,
    # which would silently hand back almost the whole depth level.
    if n_top < 0:
        print(f"Error: n_top must be non-negative, got {n_top}.")
        return no_results

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return no_results

    required_cols = {"URL", "Folder_Depth", "PageRank"}
    if not required_cols.issubset(df.columns):
        raise ValueError(
            f"Missing one or more required columns: {required_cols - set(df.columns)}"
        )

    # Filter by folder depth
    filtered_df = df[df["Folder_Depth"] == folder_depth_level]

    if filtered_df.empty:
        print(f"No data found for folder depth level {folder_depth_level}.")
        return no_results

    # Sort by 'PageRank' in descending order (highest PageRank = top performer)
    sorted_df = filtered_df.sort_values(by="PageRank", ascending=False)

    # Keep the first n_top rows and expose only the URL column
    return sorted_df.head(n_top)[["URL"]]


# --- User Inputs ---
# The analysis lives in the `else` branch so it only runs when both prompts
# parsed as integers. This replaces the earlier `exit()` call: `exit` is the
# interactive-shell helper installed by `site`, is not meant for use in
# programs, and in a notebook kernel it can shut the kernel down rather than
# just ending this cell.
try:
    folder_depth_input = int(input("Enter folder depth level (integer): "))
    n_top_input = int(input("Enter number of top performers to retrieve: "))
except ValueError:
    print("Invalid input. Please enter integers for depth and count.")
else:
    # --- Run Analysis ---
    print(
        f"\nRetrieving top {n_top_input} performers for folder depth {folder_depth_input}:"
    )
    top_pages = get_top_performers(file_path, folder_depth_input, n_top_input)

    if not top_pages.empty:
        for url in top_pages["URL"]:
            print(url)
    else:
        print("No top performers found based on the provided criteria.")

# FINDING RANDOM URLS

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so files under "My Drive" can be read
# through the regular filesystem API.
drive.mount("/content/drive")

# Build the path of the URL analysis CSV; `file_path` is what the analysis
# call further down in this cell passes to get_random_urls().
base_path = "/content/drive/My Drive/WebKnoGraph/data"
file_path = os.path.join(base_path, "url_analysis_results.csv")


def get_random_urls(file_path, n_random):
    """
    Draws N URLs at random from the whole dataset, ignoring folder depth.

    Args:
        file_path (str): The path to the CSV file containing URL analysis results.
        n_random (int): How many URLs to draw. Requests larger than the dataset
                        are reduced to the dataset size (with a warning).

    Returns:
        pandas.DataFrame: A DataFrame containing only the 'URL' column of the
                          sampled rows. An empty DataFrame is returned when the
                          file is not found or the CSV has no rows.

    Raises:
        ValueError: If the 'URL' column is missing from the CSV.
    """
    try:
        table = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return pd.DataFrame()

    # 'URL' is the only column this function touches.
    missing = {"URL"} - set(table.columns)
    if missing:
        raise ValueError(f"Missing one or more required columns: {missing}")

    available = len(table)
    if available == 0:
        print("The CSV file is empty.")
        return pd.DataFrame()

    # sample() cannot draw more rows than exist (without replacement), so cap
    # the request and tell the user about it.
    if n_random > available:
        print(
            f"Warning: Requested {n_random} random URLs, but only {available} are available. Returning all available URLs."
        )
        n_random = available

    # No random_state is given, so each call draws a fresh, unseeded sample.
    picked = table.sample(n=n_random)
    return picked[["URL"]]


# --- User Inputs ---
# The analysis lives in the `else` branch so it only runs when the prompt
# parsed as an integer. This replaces the earlier `exit()` call: `exit` is the
# interactive-shell helper installed by `site`, is not meant for use in
# programs, and in a notebook kernel it can shut the kernel down rather than
# just ending this cell.
try:
    n_random_input = int(input("Enter number of random URLs to retrieve: "))
except ValueError:
    print("Invalid input. Please enter an integer for the count.")
else:
    # --- Run Analysis ---
    print(f"\nRetrieving {n_random_input} random URLs:")
    random_pages = get_random_urls(file_path, n_random_input)

    if not random_pages.empty:
        for url in random_pages["URL"]:
            print(url)
    else:
        print("No random URLs found based on the provided criteria.")

# FILTERING URLS BY FOLDER/SUBPATH

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so files under "My Drive" can be read
# through the regular filesystem API.
drive.mount("/content/drive")

# Build the path of the URL analysis CSV; `file_path` is what the analysis
# call further down in this cell passes to get_best_pagerank_urls_in_folder().
base_path = "/content/drive/My Drive/WebKnoGraph/data"
file_path = os.path.join(base_path, "url_analysis_results.csv")


def get_best_pagerank_urls_in_folder(file_path, n_best, url_subpath_filter=None):
    """
    Retrieves the top N best performing URLs (based on PageRank) from a specific folder/URL subpath.

    Args:
        file_path (str): The path to the CSV file containing URL analysis results.
        n_best (int): The number of best PageRank URLs to retrieve. Must be >= 0.
        url_subpath_filter (str, optional): A URL subpath to filter by (e.g., '/learning-spaces/').
                                            Only URLs containing this exact text are considered;
                                            it is matched literally, not as a regular expression.
                                            Defaults to None (no filtering).

    Returns:
        pandas.DataFrame: A DataFrame containing only the 'URL' column of the best PageRank URLs,
                          highest PageRank first. An empty DataFrame (still carrying the 'URL'
                          column) is returned when the file is not found, the CSV has no rows,
                          n_best is negative, or no URL matches the filter.

    Raises:
        ValueError: If 'URL' or 'PageRank' is missing from the CSV.
    """
    # Every "nothing to report" path returns a frame with the same schema as the
    # success path, so callers can index ["URL"] without checking .empty first.
    no_results = pd.DataFrame(columns=["URL"])

    # DataFrame.head() with a negative n returns all rows *except* the last |n|,
    # which is never what "best N" means.
    if n_best < 0:
        print(f"Error: n_best must be non-negative, got {n_best}.")
        return no_results

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return no_results

    required_cols = {"URL", "PageRank"}  # PageRank is still required for sorting
    if not required_cols.issubset(df.columns):
        raise ValueError(
            f"Missing one or more required columns: {required_cols - set(df.columns)}"
        )

    if df.empty:
        print("The CSV file is empty.")
        return no_results

    # Apply URL subpath filter if provided
    filtered_df = df
    if url_subpath_filter:
        # regex=False: str.contains() treats the pattern as a regular expression
        # by default, so '.' or '?' in a path would over-match and text such as
        # '/c++/' would raise re.error. A subpath is plain text.
        matches = (
            df["URL"]
            .astype(str)
            .str.contains(url_subpath_filter, regex=False, na=False)
        )
        filtered_df = df[matches]
        if filtered_df.empty:
            print(f"No URLs found matching the subpath filter: '{url_subpath_filter}'")
            return no_results

    # Sort by 'PageRank' in descending order (highest PageRank = best performer)
    sorted_df = filtered_df.sort_values(by="PageRank", ascending=False)

    # Ensure n_best does not exceed the number of available URLs after filtering
    if n_best > len(sorted_df):
        print(
            f"Warning: Requested {n_best} best URLs, but only {len(sorted_df)} are available after filtering. Returning all available URLs."
        )
        n_best = len(sorted_df)

    # Get the top N best PageRank URLs, only selecting the 'URL' column
    return sorted_df.head(n_best)[["URL"]]


# --- User Inputs ---
# Only the integer conversion can raise ValueError, so it is the only statement
# inside the try. Everything else runs in the `else` branch, i.e. only when the
# count parsed. This replaces the earlier `exit()` call: `exit` is the
# interactive-shell helper installed by `site`, is not meant for use in
# programs, and in a notebook kernel it can shut the kernel down rather than
# just ending this cell.
try:
    n_best_input = int(input("Enter number of best PageRank URLs to retrieve: "))
except ValueError:
    print("Invalid input. Please enter an integer for the count.")
else:
    # Prompt for the subpath filter. Surrounding whitespace is dropped so a
    # pasted value with a stray space still matches; a blank answer becomes
    # None, which means "no filter".
    url_subpath_input = (
        input(
            "Enter URL subpath to filter by (e.g., /learning-spaces/, leave empty for no filter): "
        ).strip()
        or None
    )

    # --- Run Analysis ---
    if url_subpath_input:
        print(
            f"\nRetrieving {n_best_input} best PageRank URLs from subpath '{url_subpath_input}':"
        )
    else:
        print(f"\nRetrieving {n_best_input} best PageRank URLs (no subpath filter):")

    best_pages = get_best_pagerank_urls_in_folder(
        file_path, n_best_input, url_subpath_input
    )

    if not best_pages.empty:
        for url in best_pages["URL"]:
            print(url)
    else:
        print("No best PageRank URLs found based on the provided criteria.")

# FINDING BEST PERFORMERS ON FOLDER DEPTH INTERVAL

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so files under "My Drive" can be read
# through the regular filesystem API.
drive.mount("/content/drive")

# Build the path of the URL analysis CSV; `file_path` is what the analysis
# call further down in this cell passes to get_overall_top_performers_in_range().
base_path = "/content/drive/My Drive/WebKnoGraph/data"
file_path = os.path.join(base_path, "url_analysis_results.csv")


def get_overall_top_performers_in_range(file_path, min_depth, max_depth, n_total_top):
    """
    Retrieves the overall top N performing URLs (based on PageRank) within a defined range of folder depth levels.

    Args:
        file_path (str): The path to the CSV file containing URL analysis results.
        min_depth (int): The minimum folder depth level in the range (inclusive).
        max_depth (int): The maximum folder depth level in the range (inclusive).
        n_total_top (int): The total number of top performing URLs to retrieve across
                           the entire range. Must be >= 0.

    Returns:
        pandas.DataFrame: A DataFrame containing only the 'URL' column of the overall top
                          performers, highest PageRank first. An empty DataFrame (still
                          carrying the 'URL' column) is returned when the file is not found,
                          n_total_top is negative, or no row falls inside the depth range
                          (which includes the case min_depth > max_depth).

    Raises:
        ValueError: If 'URL', 'Folder_Depth' or 'PageRank' is missing from the CSV.
    """
    # Every "nothing to report" path returns a frame with the same schema as the
    # success path, so callers can index ["URL"] without checking .empty first.
    no_results = pd.DataFrame(columns=["URL"])

    # DataFrame.head() with a negative n returns all rows *except* the last |n|,
    # which would silently hand back almost the whole range.
    if n_total_top < 0:
        print(f"Error: n_total_top must be non-negative, got {n_total_top}.")
        return no_results

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return no_results

    required_cols = {"URL", "Folder_Depth", "PageRank"}
    if not required_cols.issubset(df.columns):
        raise ValueError(
            f"Missing one or more required columns: {required_cols - set(df.columns)}"
        )

    # Filter by the defined folder depth range (both bounds inclusive)
    filtered_df = df[df["Folder_Depth"].between(min_depth, max_depth)]

    if filtered_df.empty:
        print(f"No data found for folder depths between {min_depth} and {max_depth}.")
        return no_results

    # Rank all rows of the range together, so the result is the best of the
    # whole range rather than the best of each individual depth.
    sorted_df = filtered_df.sort_values(by="PageRank", ascending=False)

    # Keep the first n_total_top rows and expose only the URL column
    return sorted_df.head(n_total_top)[["URL"]]


# --- User Inputs ---
# The analysis only runs when all three prompts parsed as integers and the
# range is well-formed. Plain branching replaces the earlier `exit()` calls:
# `exit` is the interactive-shell helper installed by `site`, is not meant for
# use in programs, and in a notebook kernel it can shut the kernel down rather
# than just ending this cell.
try:
    min_depth_input = int(input("Enter minimum folder depth level (integer): "))
    max_depth_input = int(input("Enter maximum folder depth level (integer): "))
    n_total_top_input = int(input("Enter total number of top performers to retrieve: "))
except ValueError:
    print("Invalid input. Please enter integers for depth range and count.")
else:
    if min_depth_input > max_depth_input:
        print("Error: Minimum depth cannot be greater than maximum depth.")
    else:
        # --- Run Analysis ---
        print(
            f"\nRetrieving the overall top {n_total_top_input} performers from folder depths {min_depth_input} to {max_depth_input}:"
        )
        overall_top_pages = get_overall_top_performers_in_range(
            file_path, min_depth_input, max_depth_input, n_total_top_input
        )

        if not overall_top_pages.empty:
            for url in overall_top_pages["URL"]:
                print(url)
        else:
            print("No overall top performers found based on the provided criteria.")