In [0]:
from graphframes import GraphFrame

In [0]:
# Load the `graphs` table and preview it. Per the rendered output, each row is
# one directed link (from_id -> to_id) between two product ids, together with an
# index and a collection_date.
graphs = spark.table('graphs')
display(graphs)

index,from_id,to_id,collection_date
4,pid_0,pid_1,2003-03-02
5,pid_0,pid_2,2003-03-02
6,pid_0,pid_3,2003-03-02
7,pid_0,pid_4,2003-03-02
8,pid_0,pid_5,2003-03-02
9,pid_1,pid_0,2003-03-02
10,pid_1,pid_2,2003-03-02
11,pid_1,pid_4,2003-03-02
12,pid_1,pid_5,2003-03-02
13,pid_1,pid_15,2003-03-02


In [0]:
# Load the `products` table and preview it. Per the rendered output, the columns
# are product_id, asin, title, salesrank, categories, discontinued and group_id;
# some rows (e.g. the discontinued pid_0) have no title or salesrank.
products = spark.table('products')
display(products)

product_id,asin,title,salesrank,categories,discontinued,group_id
pid_0,asin_0771044445,,,,1.0,
pid_1,asin_0827229534,Patterns of Preaching: A Sermon Sampler,396585.0,2.0,,gid_283155
pid_2,asin_0738700797,Candlemas: Feast of Flames,168596.0,2.0,,gid_283155
pid_3,asin_0486287785,World War II Allied Fighter Planes Trading Cards,1270652.0,1.0,,gid_283155
pid_4,asin_0842328327,Life Application Bible Commentary: 1 and 2 Timothy and Titus,631289.0,5.0,,gid_283155
pid_5,asin_1577943082,Prayers That Avail Much for Business: Executive,455160.0,2.0,,gid_283155
pid_6,asin_0486220125,How the Other Half Lives: Studies Among the Tenements of New York,188784.0,5.0,,gid_283155
pid_7,asin_B00000AU3R,Batik,5392.0,3.0,,gid_5174
pid_8,asin_0231118597,Losing Matt Shepard,277409.0,4.0,,gid_283155
pid_9,asin_1859677800,Making Bread: The Taste of Traditional Home-Baking,949166.0,1.0,,gid_283155


# Query 4

Consider a user who is visiting amazon.com and looking at books. They look at each of the 20 best-selling and 20 worst-selling books, and from then on use only the links under the “Customers who bought this item also bought” list to view other items; after a certain number of clicks they are back at the page for the original book. If we call this a cyclic path, how many such paths of length 5 exist for each of the 20 best-selling and 20 worst-selling books?

In [0]:
# Edge list for the graph. The two inner joins against `products` keep only the
# links whose source AND target ids both exist in the products table, and the
# endpoints are exposed as `src` / `dst` (the edge column names that appear in
# the GraphFrame schema printed further down).
filter_query = '''
    SELECT gd.from_id AS src, gd.to_id AS dst
    FROM graphs gd
    JOIN products p1 ON gd.from_id = p1.product_id
    JOIN products p2 ON gd.to_id = p2.product_id
'''

# Run the query and print the first rows of the resulting edge DataFrame.
filtered_graph_data = spark.sql(filter_query)
filtered_graph_data.show()

+-----+------+
|  src|   dst|
+-----+------+
|pid_0| pid_1|
|pid_0| pid_2|
|pid_0| pid_3|
|pid_0| pid_4|
|pid_0| pid_5|
|pid_1| pid_0|
|pid_1| pid_2|
|pid_1| pid_4|
|pid_1| pid_5|
|pid_1|pid_15|
|pid_2| pid_0|
|pid_2|pid_11|
|pid_2|pid_12|
|pid_2|pid_13|
|pid_2|pid_14|
|pid_3|pid_63|
|pid_3|pid_64|
|pid_3|pid_65|
|pid_3|pid_66|
|pid_3|pid_67|
+-----+------+
only showing top 20 rows



In [0]:
# Vertex table for the graph: product_id is exposed under the name `id`
# (the vertex key column shown in the GraphFrame schema) alongside the title.
product = products.select(
    products["product_id"].alias("id"),
    products["title"],
)

In [0]:
# Assemble the product graph: vertices are the (id, title) rows in `product`,
# edges are the (src, dst) pairs in `filtered_graph_data`.
graph = GraphFrame(v=product, e=filtered_graph_data)

display(graph)



GraphFrame(v:[id: string, title: string], e:[src: string, dst: string])

In [0]:
from pyspark.sql.functions import expr, col

### Top 20 books (best and worst)

In [0]:
from pyspark.sql.functions import asc, desc

# Number of books taken from each end of the sales ranking.
TOP_N = 20

# All products whose group is 'Book'. The query already exposes product_id
# under the name `id`, so no further renaming is required afterwards.
book_query = '''
SELECT products.product_id AS id, products.title, products.salesrank
FROM products 
JOIN groups
ON products.group_id = groups.group_id
WHERE groups.group_name = 'Book'
'''
books = spark.sql(book_query)

# Best sellers: smallest salesrank first. The `salesrank >= 1` filter drops rows
# without a usable rank (missing values and anything below 1).
# `id` is a secondary sort key so that ties in salesrank at the TOP_N cut-off
# resolve the same way on every run.
best_rated = (
    books
    .filter('salesrank >= 1')
    .orderBy(asc("salesrank"), asc("id"))
    .head(TOP_N)
)

# Worst sellers: largest salesrank first, with the same deterministic tie-break.
worst_rated = (
    books
    .orderBy(desc("salesrank"), asc("id"))
    .head(TOP_N)
)

display(best_rated)
display(worst_rated)

id,title,salesrank
pid_296,The Da Vinci Code,19
pid_390452,Sisterhood of the Traveling Pants (Sisterhood of Traveling Pants),21
pid_89000,The Tipping Point: How Little Things Can Make a Big Difference,23
pid_337971,The Secret Life of Bees,26
pid_154855,Good to Great: Why Some Companies Make the Leap... and Others Don't,29
pid_376858,Angels & Demons,31
pid_312527,The Purpose-Driven Life: What on Earth Am I Here For?,32
pid_162283,"Rich Dad, Poor Dad: What the Rich Teach Their Kids About Money--That the Poor and Middle Class Do Not!",37
pid_11638,"The South Beach Diet: The Delicious, Doctor-Designed, Foolproof Plan for Fast and Healthy Weight Loss",38
pid_62424,Life of Pi,42


id,title,salesrank
pid_392331,Planes (A Unicorn Paperback),3798351
pid_293111,The Efl/Esl Job Search Handbook: Included Is a Step-By-Step Approach to the Efl/Esl Job Search,3796990
pid_408579,Algebraic Groups and Modular Lie Algebras,3795269
pid_4568,Favourite Nights and Caught on a Train (Methuen New Theatrescript),3793303
pid_242292,How to Measure Angles from Foot Radiographs,3788732
pid_45743,Organizations As Systems,3781483
pid_396871,Recent Developments in Gauge Theories (Nato Science Series: B:),3779983
pid_450097,Fish and Fisheries of India,3779920
pid_446568,Arabic: Phonology and Script,3773956
pid_402839,Interims,3769185


In [0]:
# Count, for each best-selling book, the length-5 cyclic paths that start and
# end at that book (a -> b -> c -> d -> e -> a).
#
# The motif search does not depend on the individual book, so it is run once
# for all selected books and grouped by the start vertex, instead of launching
# one full motif search per book.
#
# NOTE(review): GraphFrames motif names are not required to bind to distinct
# vertices, so these counts include walks that revisit a vertex (including the
# start book) before the fifth click. Confirm that this matches the intended
# definition of a "cyclic path".
cycle_motif = "(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(d); (d)-[e4]->(e);(e)-[e5]->(a)"
best_ids = [product_row["id"] for product_row in best_rated]

best_counts_by_id = {
    row["id"]: row["count"]
    for row in (
        graph.find(cycle_motif)
        .filter(col("a.id").isin(best_ids))
        .groupBy(col("a.id").alias("id"))
        .count()
        .collect()
    )
}

# Title-keyed lookups are kept for any later cell that reads them. Reporting
# below goes row by row, so two books that share a title are both printed.
cyclic_paths_best = {}
sales_rank = {}
for product_row in best_rated:
    product_title = product_row["title"]
    # A book that starts no matching walk is absent from the grouped result.
    cyclic_paths_count = best_counts_by_id.get(product_row["id"], 0)

    sales_rank[product_title] = product_row["salesrank"]
    cyclic_paths_best[product_title] = cyclic_paths_count

    print("Book Name:", product_title)
    print("Sales Rank:", product_row["salesrank"])
    print("Number of cyclic paths of length 5:", cyclic_paths_count)
    print()



Book Name: The Da Vinci Code
Sales Rank: 19
Number of cyclic paths of length 5: 14273

Book Name: Sisterhood of the Traveling Pants (Sisterhood of Traveling Pants)
Sales Rank: 21
Number of cyclic paths of length 5: 934

Book Name: The Tipping Point: How Little Things Can Make a Big Difference
Sales Rank: 23
Number of cyclic paths of length 5: 33750

Book Name: The Secret Life of Bees
Sales Rank: 26
Number of cyclic paths of length 5: 1437

Book Name: Good to Great: Why Some Companies Make the Leap... and Others Don't
Sales Rank: 29
Number of cyclic paths of length 5: 9933

Book Name: Angels & Demons
Sales Rank: 31
Number of cyclic paths of length 5: 7286

Book Name: The Purpose-Driven Life: What on Earth Am I Here For?
Sales Rank: 32
Number of cyclic paths of length 5: 1378

Book Name: Rich Dad, Poor Dad: What the Rich Teach Their Kids About Money--That the Poor and Middle Class Do Not!
Sales Rank: 37
Number of cyclic paths of length 5: 27033

Book Name: The South Beach Diet: The Delic

In [0]:
# Count, for each worst-selling book, the length-5 cyclic paths that start and
# end at that book (a -> b -> c -> d -> e -> a).
#
# The motif search does not depend on the individual book, so it is run once
# for all selected books and grouped by the start vertex, instead of launching
# one full motif search per book.
#
# NOTE(review): GraphFrames motif names are not required to bind to distinct
# vertices, so these counts include walks that revisit a vertex (including the
# start book) before the fifth click. Confirm that this matches the intended
# definition of a "cyclic path".
cycle_motif = "(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(d); (d)-[e4]->(e);(e)-[e5]->(a)"
worst_ids = [product_row["id"] for product_row in worst_rated]

worst_counts_by_id = {
    row["id"]: row["count"]
    for row in (
        graph.find(cycle_motif)
        .filter(col("a.id").isin(worst_ids))
        .groupBy(col("a.id").alias("id"))
        .count()
        .collect()
    )
}

# Title-keyed lookups are kept for any later cell that reads them. Reporting
# below goes row by row, so two books that share a title are both printed.
cyclic_paths_worst = {}
sales_rank = {}
for product_row in worst_rated:
    product_title = product_row["title"]
    # A book that starts no matching walk is absent from the grouped result.
    cyclic_paths_count = worst_counts_by_id.get(product_row["id"], 0)

    sales_rank[product_title] = product_row["salesrank"]
    cyclic_paths_worst[product_title] = cyclic_paths_count

    # Print the number of cyclic paths for each product
    print("Book Name:", product_title)
    print("Sales Rank:", product_row["salesrank"])
    print("Number of cyclic paths of length 5:", cyclic_paths_count)
    print()

Book Name: Planes (A Unicorn Paperback)
Sales Rank: 3798351
Number of cyclic paths of length 5: 2354

Book Name: The Efl/Esl Job Search Handbook: Included Is a Step-By-Step Approach to the Efl/Esl Job Search
Sales Rank: 3796990
Number of cyclic paths of length 5: 8609

Book Name: Algebraic Groups and Modular Lie Algebras
Sales Rank: 3795269
Number of cyclic paths of length 5: 0

Book Name: Favourite Nights and Caught on a Train (Methuen New Theatrescript)
Sales Rank: 3793303
Number of cyclic paths of length 5: 13313

Book Name: How to Measure Angles from Foot Radiographs
Sales Rank: 3788732
Number of cyclic paths of length 5: 2212

Book Name: Organizations As Systems
Sales Rank: 3781483
Number of cyclic paths of length 5: 36025

Book Name: Recent Developments in Gauge Theories (Nato Science Series: B:)
Sales Rank: 3779983
Number of cyclic paths of length 5: 14796

Book Name: Fish and Fisheries of India
Sales Rank: 3779920
Number of cyclic paths of length 5: 0

Book Name: Arabic: Phonol