# FINDING WORST PERFORMERS

In [None]:
from google.colab import drive
import pandas as pd
import os

# Mount Google Drive at /content/drive; the CSV path below is built under this mount point.
drive.mount("/content/drive")

# Define path to CSV file: <base_path>/url_analysis_results.csv
# (file_path is the module-level name passed to the analysis function further down).
base_path = "/content/drive/My Drive/WebKnoGraph/data"
file_path = os.path.join(base_path, "url_analysis_results.csv")


def get_worst_candidates(file_path, folder_depth_level, n_worst):
    """
    Retrieves the N worst performing URLs (lowest PageRank) for a specific folder depth level.

    Args:
        file_path (str): The path to the CSV file containing URL analysis results.
        folder_depth_level (int): The folder depth level to filter by.
        n_worst (int): The number of worst performing URLs to retrieve (must be >= 0).

    Returns:
        pandas.DataFrame: A DataFrame containing only the 'URL' column, ordered from
                          lowest to highest PageRank. URLs with equal PageRank keep
                          their order from the CSV file. The frame is empty when no
                          row matches the requested depth.

    Raises:
        ValueError: If n_worst is negative, or if a required column is missing.
        FileNotFoundError: If file_path does not exist (propagated from pandas.read_csv).
    """
    # head() treats a negative count as "all rows except the last |n|", which would
    # silently return almost the whole depth level instead of signalling bad input.
    if n_worst < 0:
        raise ValueError(f"n_worst must be non-negative, got {n_worst}")

    df = pd.read_csv(file_path)
    required_cols = {"URL", "Folder_Depth", "PageRank"}
    if not required_cols.issubset(df.columns):
        raise ValueError(f"Missing columns: {required_cols - set(df.columns)}")

    filtered_df = df[df["Folder_Depth"] == folder_depth_level]

    # A stable sort keeps tied PageRank values in file order, so repeated runs over
    # the same CSV always list the same URLs (the default algorithm is not stable).
    sorted_df = filtered_df.sort_values(by="PageRank", ascending=True, kind="mergesort")

    return sorted_df.head(n_worst)[["URL"]]


# --- User Inputs ---
# int() raises ValueError on non-numeric text; report it the same way the other
# cells do instead of surfacing a raw traceback.
try:
    folder_depth_input = int(input("Enter folder depth level (integer): "))
    n_worst_input = int(input("Enter number of worst candidates to retrieve: "))
except ValueError:
    print("Invalid input. Please enter integers for depth and count.")
    raise SystemExit  # Nothing sensible to run without valid integers

# --- Run Analysis ---
worst_pages = get_worst_candidates(file_path, folder_depth_input, n_worst_input)

# Tell the user explicitly when the depth filter matched nothing, rather than
# printing no output at all.
if worst_pages.empty:
    print("No worst candidates found based on the provided criteria.")
else:
    for url in worst_pages["URL"]:
        print(url)

# FINDING BEST PERFORMERS

In [None]:
from google.colab import drive
import pandas as pd
import os

# Mount Google Drive at /content/drive; the CSV path below is built under this mount point.
drive.mount("/content/drive")

# Define path to CSV file: <base_path>/url_analysis_results.csv
# (file_path is the module-level name passed to get_top_performers further down).
base_path = "/content/drive/My Drive/WebKnoGraph/data"
file_path = os.path.join(base_path, "url_analysis_results.csv")


def get_top_performers(file_path, folder_depth_level, n_top):
    """
    Retrieves the top N performing URLs (based on PageRank) for a specific folder depth level.

    Args:
        file_path (str): The path to the CSV file containing URL analysis results.
        folder_depth_level (int): The desired folder depth level to filter by.
        n_top (int): The number of top performing URLs to retrieve (must be >= 0).

    Returns:
        pandas.DataFrame: A DataFrame containing only the 'URL' column, ordered from
                          highest to lowest PageRank. URLs with equal PageRank keep
                          their order from the CSV file. An empty DataFrame (still
                          carrying the 'URL' column) is returned when the file is
                          not found or no row matches the requested depth.

    Raises:
        ValueError: If n_top is negative, or if a required column is missing.
    """
    # head() treats a negative count as "all rows except the last |n|", which would
    # silently return almost the whole depth level instead of signalling bad input.
    if n_top < 0:
        raise ValueError(f"n_top must be non-negative, got {n_top}")

    # Every "nothing to report" path returns this shape so callers can always
    # index the result by "URL", exactly as with a non-empty result.
    no_results = pd.DataFrame(columns=["URL"])

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return no_results

    required_cols = {"URL", "Folder_Depth", "PageRank"}
    if not required_cols.issubset(df.columns):
        raise ValueError(
            f"Missing one or more required columns: {required_cols - set(df.columns)}"
        )

    # Filter by folder depth
    filtered_df = df[df["Folder_Depth"] == folder_depth_level]

    if filtered_df.empty:
        print(f"No data found for folder depth level {folder_depth_level}.")
        return no_results

    # Sort by 'PageRank' in descending order (highest PageRank = top performer).
    # A stable sort keeps tied values in file order, so results are reproducible.
    sorted_df = filtered_df.sort_values(
        by="PageRank", ascending=False, kind="mergesort"
    )

    # Get the top N performers
    return sorted_df.head(n_top)[["URL"]]


# --- User Inputs ---
try:
    folder_depth_input = int(input("Enter folder depth level (integer): "))
    n_top_input = int(input("Enter number of top performers to retrieve: "))
except ValueError:
    print("Invalid input. Please enter integers for depth and count.")
    # Raise SystemExit instead of calling exit(): exit() is a helper that the
    # site module installs for interactive shells and is not intended for use in
    # programs; in notebook front-ends it may shut down the whole kernel/runtime.
    raise SystemExit

# --- Run Analysis ---
print(
    f"\nRetrieving top {n_top_input} performers for folder depth {folder_depth_input}:"
)
top_pages = get_top_performers(file_path, folder_depth_input, n_top_input)

# Check .empty before indexing: the function signals "file missing" and
# "no rows at this depth" by returning an empty DataFrame.
if not top_pages.empty:
    for url in top_pages["URL"]:
        print(url)
else:
    print("No top performers found based on the provided criteria.")

# FINDING RANDOM URLS

In [None]:
from google.colab import drive
import pandas as pd
import os
import random

# Mount Google Drive at /content/drive; the CSV path below is built under this mount point.
drive.mount("/content/drive")

# Define path to CSV file: <base_path>/url_analysis_results.csv
# (file_path is the module-level name passed to get_random_urls further down).
base_path = "/content/drive/My Drive/WebKnoGraph/data"
file_path = os.path.join(base_path, "url_analysis_results.csv")


def get_random_urls(file_path, n_random):
    """
    Retrieves a random sample of N URLs from the entire dataset, regardless of folder depth.

    Args:
        file_path (str): The path to the CSV file containing URL analysis results.
        n_random (int): The number of random URLs to retrieve (must be >= 0). If it
                        exceeds the number of rows, all rows are returned.

    Returns:
        pandas.DataFrame: A DataFrame containing only the 'URL' column of the sampled
                          rows. An empty DataFrame (still carrying the 'URL' column)
                          is returned when the file is not found or has no data rows.

    Raises:
        ValueError: If n_random is negative, or if the 'URL' column is missing.
    """
    # Reject a negative count up front so the outcome does not depend on which
    # later branch the call happens to reach.
    if n_random < 0:
        raise ValueError(f"n_random must be non-negative, got {n_random}")

    # Every "nothing to report" path returns this shape so callers can always
    # index the result by "URL", exactly as with a non-empty result.
    no_results = pd.DataFrame(columns=["URL"])

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return no_results

    required_cols = {"URL"}  # Only 'URL' is strictly required for this function
    if not required_cols.issubset(df.columns):
        raise ValueError(
            f"Missing one or more required columns: {required_cols - set(df.columns)}"
        )

    if df.empty:
        print("The CSV file is empty.")
        return no_results

    # Ensure n_random does not exceed the number of available URLs
    # (sampling without replacement cannot draw more rows than exist).
    if n_random > len(df):
        print(
            f"Warning: Requested {n_random} random URLs, but only {len(df)} are available. Returning all available URLs."
        )
        n_random = len(df)

    # random_state=None means no fixed seed: the selection is pseudo-random and
    # is not reproducible from one run to the next.
    random_urls = df.sample(n=n_random, random_state=None)

    return random_urls[["URL"]]


# --- User Inputs ---
try:
    n_random_input = int(input("Enter number of random URLs to retrieve: "))
except ValueError:
    print("Invalid input. Please enter an integer for the count.")
    # Raise SystemExit instead of calling exit(): exit() is a helper that the
    # site module installs for interactive shells and is not intended for use in
    # programs; in notebook front-ends it may shut down the whole kernel/runtime.
    raise SystemExit

# --- Run Analysis ---
print(f"\nRetrieving {n_random_input} random URLs:")
random_pages = get_random_urls(file_path, n_random_input)

# Check .empty before indexing: the function signals "file missing" and
# "no data rows" by returning an empty DataFrame.
if not random_pages.empty:
    for url in random_pages["URL"]:
        print(url)
else:
    print("No random URLs found based on the provided criteria.")