# Query 3
Find the number of levels/connections/relationships necessary to travel from one product A with a given group G1 to another product that has a different group G2. Show the chain of products and groups that were traversed until you reached a different group

In [0]:
from pyspark.sql.functions import col, rand

In [0]:
# Extracting nodes
product_df = spark.sql('SELECT * FROM products')
product_df = product_df.withColumnRenamed('product_id', 'id')

# Extracting vertices
graph_df = spark.sql('SELECT from_id, to_id FROM graphs')
graph_df = graph_df.withColumnRenamed('from_id', 'src')
graph_df = graph_df.withColumnRenamed('to_id', 'dst')

In [0]:
def select_random_product(product_df):
    """
    Select a random row from the product dataframe
    and return the product id and group id
    """
    product = product_df.orderBy(rand()).limit(1)
    product_id = product.collect()[0]['id']
    group_id = product.collect()[0]['group_id']
    return product_id, group_id

def graph_traversal(product, group, level, traversal_results):
    """
    Traverses a graph starting from a given product and group, and collects traversal results.

    Args:
        product (str): The starting product.
        group (int): The group identifier of the starting product.
        level (int): The current level of traversal.
        traversal_results (list): A list to store traversal results.

    Returns:
        None
    """

    # Append the current product, group, and level to the traversal results
    traversal_results.append((product, group, level))
    
    # If the group is not the starting group, return and stop further traversal
    if group != start_group:
        return

    # Get the next products to traverse based on the current product
    next_products = (
        graph_df.filter(f"src = '{product}'")  # Filter edges where the source is the current product
        .join(product_df, col("dst") == col("id"))  # Join with product_df on destination ID
        .filter(col("group_id") != group)  # Filter out products with the same group
        .select("id", "group_id")  # Select only the ID and group_id columns
        .collect()  # Collect the results as a list of rows
    )

    # Traverse the next products recursively
    for row in next_products:
        next_product = row["id"]
        next_group = row["group_id"]
        graph_traversal(next_product, next_group, level + 1, traversal_results)

def print_tree(results):
    max_depth = 0
    for result in results:
        p_id = result[0]
        g_id = result[1]
        depth = result[2]
        if depth > max_depth:
            max_depth = depth
        indent = "   " * depth  # Adjust the number of spaces based on your preference
        print(f"{indent}|- {p_id} ({g_id})")
    print(f"Levels traveled to reach new group: {max_depth}")


def get_depth_to_new_group_tree(prod_id, group_id):
    traversal_results = []
    graph_traversal(prod_id, group_id, 0, traversal_results)
    print_tree(traversal_results)

In [0]:
prod_id, group_id = select_random_product(product_df)
start_group = group_id
get_depth_to_new_group_tree(prod_id, group_id)

|- pid_439721 (gid_5174)
Levels traveled to reach new group: 0


In [0]:
prod_id = 'pid_63779'
group_id = 'gid_5174'
start_group = group_id
get_depth_to_new_group_tree(prod_id, group_id)

|- pid_63779 (gid_5174)
   |- pid_73189 (gid_283155)
   |- pid_56748 (gid_283155)
   |- pid_55461 (gid_283155)
   |- pid_63780 (gid_283155)
   |- pid_10969 (gid_283155)
   |- pid_32735 (gid_283155)
   |- pid_903 (gid_283155)
   |- pid_55763 (gid_283155)
   |- pid_12057 (gid_283155)
   |- pid_4206 (gid_283155)
   |- pid_902 (gid_283155)
   |- pid_26022 (gid_283155)
   |- pid_63775 (gid_283155)
   |- pid_63777 (gid_283155)
   |- pid_54198 (gid_283155)
   |- pid_106404 (gid_283155)
   |- pid_54195 (gid_283155)
   |- pid_63773 (gid_283155)
   |- pid_63778 (gid_283155)
   |- pid_63772 (gid_283155)
   |- pid_59907 (gid_283155)
   |- pid_48295 (gid_283155)
   |- pid_194609 (gid_283155)
   |- pid_63781 (gid_283155)
   |- pid_59155 (gid_283155)
   |- pid_194610 (gid_283155)
   |- pid_182293 (gid_283155)
   |- pid_182292 (gid_283155)
   |- pid_47206 (gid_283155)
Levels traveled to reach new group: 1
