In [86]:
from operator import add
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, lit
from pyspark.sql.types import StructType, StructField, StringType

NUMBER_ITERATION = 1

In [84]:
def compute_contrib(children, rank):
    number_children = len(children)
    for c in children:
        yield c, rank / number_children

In [85]:
def csv_reader(s):
    elements = s.split(';')
    return (elements[0].strip('"'), elements[1].strip('"').split(' '))

In [112]:
def PageRank_RDD(nombre_iteration:int, data_path:str="../data/small_sample.csv"):
    # Creation de l'application
    spark = SparkSession.builder.appName("pagerank_rdd").getOrCreate()
    sc = spark.sparkContext
    
    data = sc.textFile(data_path).map(csv_reader)
    data = data.partitionBy(sc.defaultParallelism).persist()
    print('Parallelism:', sc.defaultParallelism)

    # Initial ranks: 1.0 pour chaque node présent dans adj keys ou values
    link_src = data.keys()
    link_dst = data.flatMap(lambda row: row[1])
    ranks = link_src.union(link_dst).distinct().map(lambda url:(url,1.0))

    # Partitionnement
    ranks = ranks.partitionBy(sc.defaultParallelism).persist()
    ranks.count()

    # Calcul du page rank
    for iteration in range(nombre_iteration):
        ranks = data.join(ranks)\
            .flatMap(lambda row: compute_contrib(row[1][0], row[1][1]))\
            .reduceByKey(add)\
            .mapValues(lambda rank : rank * 0.85 + 0.15)

    return ranks.sortBy(lambda elm: elm[1], ascending=False)

In [113]:
PageRank_RDD(4, '../data/wikilinks.csv').collect()

Parallelism: 2


                                                                                

[('', 2.5660187499999996),
 ('<http://dbpedia.org/resource/Kinsey_Millhone>', 0.3579343211059741),
 ('<http://dbpedia.org/resource/Santa_Teresa_(fictional_city)>',
  0.3554557491204803),
 ('<http://dbpedia.org/resource/!!>', 0.353642252),
 ('<http://dbpedia.org/resource/Category:Kinsey_Millhone_novels>',
  0.3494588733552951),
 ('<http://dbpedia.org/resource/Sue_Grafton>', 0.3494588733552951),
 ('<http://dbpedia.org/resource/Category:Novels_by_Sue_Grafton>',
  0.3494588733552951),
 ('<http://dbpedia.org/resource/Mystery_fiction>', 0.3494588733552951),
 ('<http://dbpedia.org/resource/Category:Novels_set_in_California>',
  0.3313701970179725),
 ('<http://dbpedia.org/resource/Chess_annotation_symbols>', 0.3275388064),
 ('<http://dbpedia.org/resource/Donnie_Vie>', 0.2889416545),
 ('<http://dbpedia.org/resource/%22Weird_Al%22_Yankovic_Live!:_The_Alpocalypse_Tour>',
  0.2789607496410423),
 ('<http://dbpedia.org/resource/Category:Henry_Holt_and_Company_books>',
  0.2786619039514565),
 ('<http