In [1]:
import sys
import os

In [2]:
!pip install pyspark
!pip install findspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=f05be46c4e91486bf798387dd69a53d3dee0a164a6bd2db5019f52aafcc97279
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [10]:
!rm -rf spark-3.5.0-bin-hadoop3.tgz
!wget -q https://downloads.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

In [11]:
!ls -l spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz

-rw-r--r-- 1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz


In [12]:
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

In [13]:
import findspark
findspark.init("/content/spark-3.5.0-bin-hadoop3")# SPARK_HOME

In [14]:
os.environ['KAGGLE_USERNAME'] = "doinav"
os.environ['KAGGLE_KEY'] = "0296dbba988a8d4ffdcd246d547d216a"
!kaggle datasets download -d cynthiarempel/amazon-us-customer-reviews-dataset

Downloading amazon-us-customer-reviews-dataset.zip to /content
100% 20.9G/21.0G [03:32<00:00, 163MB/s]
100% 21.0G/21.0G [03:33<00:00, 105MB/s]


In [15]:
!unzip -j /content/amazon-us-customer-reviews-dataset.zip As _v1_02.tsv

Archive:  /content/amazon-us-customer-reviews-dataset.zip
  inflating: amazon_reviews_us_Books_v1_02.tsv  


In [16]:
import pyspark
from pyspark.sql import SparkSession

import pyspark.sql.functions as f
from pyspark.sql import Window

spark = SparkSession.builder \
                    .appName("Link Analysis") \
                    .getOrCreate()
spark

PAGE RANK

In [17]:
# Import the dataset as RDD and remove the header
books = spark.sparkContext.textFile('amazon_reviews_us_Books_v1_02.tsv', minPartitions=8) # import as rdd dataset

header = books.first()
books = books.filter(lambda line: line != header)

In [18]:
# Pre-process data by splitting the columns, retrieve column 1 (customer id) and column 2 (product id), sample 5% of it
data = books.map(lambda x: (x.split('\t')[1], x.split('\t')[3])).sample(False, 0.05, 99)

In [19]:
# Group the data by costumer id -> map product ids into lists, and filter out customer ids having reviewed only one book
df = data.groupByKey().mapValues(list)
filtered = df.filter(lambda x: len(x[1]) > 1)

In [20]:
# Compute the total number of nodes (distinct book id)
tot = data.map(lambda x: x[1]).distinct().count()

In [21]:
# Compute the edges between books
def calculate_linkages(data):
    key, values = data
    combine = [(v1, v2) for i, v1 in enumerate(values) for v2 in values[i + 1:]]
    add = [(v2, v1) for (v1, v2) in combine]
    return (combine + add)

In [22]:
# Compute the list of edges
links = filtered.map(lambda x: calculate_linkages(x)).flatMap(lambda value: value)

# Calculate out-degree for each node
id2degree = links.countByKey()

# Sort the items in the defaultdict by key in descending order
sorted_items = sorted(id2degree.items(), key=lambda x: x[1], reverse=True)

In [23]:
# Create the transportation matrix and its transposed
P = links.map(lambda x: (x[0], x[1], 1 / id2degree[x[0]])) #(i, j, Mij)
PT = P.map(lambda x: (x[1], x[0], x[2])) #(j, i , Mij)

In [24]:
# Calculate the initial probability
def calculate_probability(degrees, total):
    prob = 1 / total
    p_i = {item: prob for item in degrees.keys()}
    return p_i

p_i = calculate_probability(id2degree, tot)

In [30]:
# Exploit the power method for PageRank by iteratively updating the probabilities
for i in range(50):
    new_p = PT.map(lambda x:(x[0], (x[2]*p_i[x[1]])))\
              .reduceByKey(lambda x,y: x+y)\
              .collect()\

    for idx,prb in new_p:
        p_i[idx] = prb

    print(f"iteration {i}")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49


In [26]:
# Save the results into a dictionary
d = dict(new_p)

# Sort it by product in ascending order
sorted_d = dict(sorted(d.items(), key=lambda item: item[0], reverse=False))

In [27]:
# Sort and print the most recurrent products
sorted_p_i = sorted(p_i.items(), key=lambda x: x[1], reverse=True)[:20]
print('Most quoted products:')
for item, prob in sorted_p_i:
    print(f'With prob: {prob}, you take product with code: {item}')

Most quoted products:
With prob: 0.00012314727975369468, you take product with code: 0385504209
With prob: 8.926467652436924e-05, you take product with code: 0316769487
With prob: 8.498383247184573e-05, you take product with code: 043935806X
With prob: 7.58342347673823e-05, you take product with code: 0316666343
With prob: 7.1874565024191e-05, you take product with code: 0446676098
With prob: 6.92401985330735e-05, you take product with code: 0439139597
With prob: 6.881467050514896e-05, you take product with code: 0375726403
With prob: 6.620958344884291e-05, you take product with code: 0439136350
With prob: 6.52873916655704e-05, you take product with code: 0446532231
With prob: 6.34226928276526e-05, you take product with code: 1931561648
With prob: 6.118308095938049e-05, you take product with code: 0671027360
With prob: 6.0449845549312354e-05, you take product with code: 0525947647
With prob: 6.005965346074745e-05, you take product with code: 0393317552
With prob: 5.877529775868174e-05,

In [28]:
# Define the list of values you want to filter on in column 3
values_to_filter = []

for index, (item, prob) in enumerate(sorted_p_i):
    if index > 10:
        break
    values_to_filter.append(item)

In [29]:
# Show the first 10 products by page rank and associated title
names = books.filter(lambda x: x.split('\t')[3] in values_to_filter).map(lambda x: (x.split('\t')[3], x.split('\t')[5]))

for i in values_to_filter:
  name = names.filter(lambda x: x[0] == i)
  print(name.take(1))

[('0385504209', 'The Da Vinci Code')]
[('0316769487', 'The Catcher in the Rye')]
[('043935806X', 'Harry Potter and the Order of the Phoenix (Book 5)')]
[('0316666343', 'The Lovely Bones')]
[('0446676098', 'The Notebook')]
[('0439139597', 'Harry Potter And The Goblet Of Fire (Book 4)')]
[('0375726403', 'Empire Falls')]
[('0439136350', 'Harry Potter And The Prisoner Of Azkaban')]
[('0446532231', "Dude, Where's My Country?")]
[('1931561648', "The Time Traveler's Wife")]
[('0671027360', 'Angels & Demons')]


TOPIC SENSITIVE PAGE RANK using book rankings

In [31]:
# The same dataset as before, now also containing the ratings for each reviewed product
datats = books.map(lambda x: (x.split('\t')[1], x.split('\t')[3], x.split('\t')[7])).sample(False, 0.05, 99)

In [32]:
# Group the data by costumer id and map values into a list of tuples (product_id, rating)
dfts = datats.groupBy(lambda x: x[0]).mapValues(lambda values: [(value[1], value[2]) for value in values])

In [33]:
# Filter out dead ends
filteredts = dfts.filter(lambda x: len(x[1]) > 1)

In [34]:
# From the filtered df retrieve the list of tuples (product_id, rating)
S = filteredts.flatMap(lambda x: x[1])

In [35]:
# Compute the average rating per book
# - Convert rating to int and create a count
# - Sum ratings and count for each product
# - Divide sum of ratings by count to get average
average_ratings = S \
                  .map(lambda x: (x[0], (int(x[1]), 1))) \
                  .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) \
                  .mapValues(lambda x: round(float(x[0] / x[1]), 3))

In [36]:
# Filter out nodes having a rating greater or equal to 4
S_above3 = average_ratings.filter(lambda x: float(x[1]) >= 4)

# Cardinality of the set of topic-sensitive products
S_count_above3 = S_above3.count()

In [37]:
# Compute the topic-sensitive vector: each book is assigned value 1 if the book is topic-sensitive, 0 otherwise
e_S_above3 = average_ratings\
              .map(lambda x: int(float(x[1])>= 4))\
              .collect()

In [38]:
# Compute the Topic Sensitive PR for each item
def TopicSensitivePR(d, e, beta, n):
    TSPR = {}
    for (key_pr, pagerank), element in zip(d.items(), e):
      new_value = 0
      new_value += beta * pagerank + (1 - beta) * element / n
      TSPR[key_pr] = new_value
    return TSPR

In [39]:
beta = 0.8
TSPR_above3 = TopicSensitivePR(sorted_d, e_S_above3, beta, S_count_above3)

In [40]:
#Sort and print the most recurrent products
sorted_TSPR_above3= sorted(TSPR_above3.items(), key=lambda x:x[1], reverse = True)[:10]
print('Most quoted products:')
for item, prob in sorted_TSPR_above3:
    print(f'With prob: {prob}, you take product with code: {item}')

Most quoted products:
With prob: 0.00010541961136593456, you take product with code: 0385504209
With prob: 7.831352878247421e-05, you take product with code: 0316769487
With prob: 6.798706597747659e-05, you take product with code: 043935806X
With prob: 6.756917537688465e-05, you take product with code: 0316666343
With prob: 6.440143958233162e-05, you take product with code: 0446676098
With prob: 6.195352396709798e-05, you take product with code: 0375726403
With prob: 5.986945432205314e-05, you take product with code: 0439136350
With prob: 5.9131700895435136e-05, you take product with code: 0446532231
With prob: 5.7639941825100895e-05, you take product with code: 1931561648
With prob: 5.539215882645881e-05, you take product with code: 0439139597


In [41]:
# Define the list of values you want to filter on in column 3
values_to_filter = []
for index, (item, prob) in enumerate(sorted_TSPR_above3):
    if index > 10:
        break
    values_to_filter.append(item)

In [42]:
# Join the books titles and ratings on book code
names_TSPR_above3 = books.filter(lambda x: x.split('\t')[3] in values_to_filter).map(lambda x: (x.split('\t')[3], x.split('\t')[5], x.split('\t')[7])).distinct()
join_above3 = names_TSPR_above3.join(average_ratings)

In [43]:
# Show the resulting RDD, containing book code, title and average rating
for i in values_to_filter:
  name = join_above3.filter(lambda x: x[0] == i)
  print(name.take(1))

[('0385504209', ('The Da Vinci Code', 3.562))]
[('0316769487', ('The Catcher in the Rye', 4.32))]
[('043935806X', ('Harry Potter and the Order of the Phoenix (Book 5)', 4.324))]
[('0316666343', ('The Lovely Bones', 3.692))]
[('0446676098', ('The Notebook', 3.944))]
[('0375726403', ('Empire Falls', 4.364))]
[('0439136350', ('Harry Potter And The Prisoner Of Azkaban', 4.875))]
[('0446532231', ("Dude, Where's My Country?", 4.118))]
[('1931561648', ("The Time Traveler's Wife", 4.444))]
[('0439139597', ('Harry Potter And The Goblet Of Fire (Book 4)', 4.6))]


In [44]:
# Filter out nodes having a rating larger than 4
S_less3 = average_ratings.filter(lambda x: int(x[1]) < 4)

# Cardinality of the set of topic-sensitive products
S_count_less3 = S_less3.count()

In [45]:
# Compute the topic-sensitive vector: each book is assigned value 1 if the book is topic-sensitive, 0 otherwise
e_S_less3 = average_ratings\
              .map(lambda x: int(int(x[1])< 4))\
              .collect()

In [46]:
TSPR_less3 = TopicSensitivePR(d, e_S_less3, beta, S_count_less3)

In [47]:
#Sort and print the most recurrent products
sorted_TSPR_less3= sorted(TSPR_less3.items(), key=lambda x:x[1], reverse = True)[:10]
print('Most quoted products:')
for item, prob in sorted_TSPR_less3:
    print(f'With prob: {prob}, you take product with code: {item}')

Most quoted products:
With prob: 0.00012370356867136024, you take product with code: 0385504209
With prob: 8.585313268231032e-05, you take product with code: 0316666343
With prob: 8.268539688775729e-05, you take product with code: 0446676098
With prob: 7.413220963590887e-05, you take product with code: 0671027360
With prob: 7.141174121949539e-05, you take product with code: 0316769487
With prob: 6.886451556900007e-05, you take product with code: 0446677450
With prob: 6.798706597747659e-05, you take product with code: 043935806X
With prob: 6.698272085881588e-05, you take product with code: 0399144463
With prob: 6.690861564040776e-05, you take product with code: 0060392452
With prob: 6.63062405431688e-05, you take product with code: 1592400876


In [48]:
# Define the list of values you want to filter on in column 3
values_to_filter = []
for index, (item, prob) in enumerate(sorted_TSPR_less3):
    if index > 10:
        break
    values_to_filter.append(item)

In [49]:
# Join the code and book name with the average rating
names_TSPR_less3 = books.filter(lambda x: x.split('\t')[3] in values_to_filter).map(lambda x: (x.split('\t')[3], x.split('\t')[5])).distinct()
join_less3 = names_TSPR_less3.join(average_ratings)

In [50]:
# Show the resulting RDD, containing book code, title and average rating
for i in values_to_filter:
  name = join_less3.filter(lambda x: x[0] == i)
  print(name.take(1))

[('0385504209', ('The Da Vinci Code', 3.562))]
[('0316666343', ('The Lovely Bones', 3.692))]
[('0446676098', ('The Notebook', 3.944))]
[('0671027360', ('Angels & Demons', 3.643))]
[('0316769487', ('The Catcher in the Rye', 4.32))]
[('0446677450', ('Rich Dad, Poor Dad: What the Rich Teach Their Kids About Money--That the Poor and Middle Class Do Not!', 3.857))]
[('043935806X', ('Harry Potter and the Order of the Phoenix (Book 5)', 4.324))]
[('0399144463', ('Who Moved My Cheese?: An Amazing Way to Deal with Change in Your Work and in Your Life', 3.143))]
[('0060392452', ('Stupid White Men ...And Other Sorry Excuses for the State of the Nation!', 3.364))]
[('1592400876', ('Eats, Shoots & Leaves: The Zero Tolerance Approach to Punctuation', 3.778))]
