## MLlib - Machine learning library

In [None]:
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import SQLTransformer
#from pyspark.ml.feature import Tokenizer
#from pyspark.ml.feature import StopWordsRemover
#from pyspark.ml.feature import HashingTF
#from pyspark.ml.feature import IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.sql.types import *

In [None]:
# input and output folders
trainingData = "trainingData.csv"
unlabeledData = "unlabeledData.csv"
outputPath = "./"

In [None]:
# *************************
# Training step
# *************************

# Load trainingData.csv into a DataFrame (training data in raw format).
# The first line of the file is used as header and the column types are inferred.
# NOTE: this statement rebinds `trainingData` from the input path (a string)
# to the loaded DataFrame; re-running this cell alone would pass the
# DataFrame as the path, so re-run the cell that defines the paths first.
trainingData = spark.read.load(trainingData,\
                     format="csv",\
                     header=True,\
                     inferSchema=True)

In [None]:
trainingData.printSchema()
trainingData.show()

In [None]:
root
 |-- label: integer (nullable = true)
 |-- text: string (nullable = true)

+-----+--------------------+
|label|                text|
+-----+--------------------+
|    1|The Spark system ...|
|    1|Spark is a new di...|
|    0|Turin is a beauti...|
|    0|Turin is in the n...|
+-----+--------------------+

In [None]:
# Define a Python function that returns the number of words occurring in the input string
def countWords(text):
    """Return the number of whitespace-separated words in ``text``.

    Args:
        text: The input string, or None (e.g., a null value read from the CSV).

    Returns:
        The number of words as an int. Empty strings and None yield 0.
    """
    # A null column value reaches the UDF as None: treat it as "no words"
    # instead of failing with AttributeError.
    if text is None:
        return 0
    # split() without arguments collapses runs of whitespace and returns []
    # for an empty string, so empty tokens are never counted as words.
    return len(text.split())

In [None]:
# Register countWords as a Spark SQL UDF named "countWords", so that it can
# be invoked by name inside SQL expressions (e.g., in selectExpr).
# The data type of the returned value is explicitly declared (IntegerType).
spark.udf.register("countWords", countWords, IntegerType())

In [None]:
# Define a Python function that checks if the input string contains the word "Spark"
def containsSpark(text):
    """Return True if ``text`` contains the (case-sensitive) substring "Spark".

    Args:
        text: The input string, or None (e.g., a null value read from the CSV).

    Returns:
        True if "Spark" occurs in ``text``, False otherwise (also for None).
    """
    # A null column value reaches the UDF as None: it cannot contain the word.
    if text is None:
        return False
    return "Spark" in text

In [None]:
# Register containsSpark as a Spark SQL UDF named "containsSpark", so that it
# can be invoked by name inside SQL expressions (e.g., in selectExpr).
# The data type of the returned value is explicitly declared (BooleanType).
spark.udf.register("containsSpark", containsSpark, BooleanType())

In [None]:
# Select the attributes label and text and create two new columns:
# numLines (value returned by countWords) and SparkWord (value returned by
# containsSpark).
# The new columns are named with explicit SQL aliases instead of renaming the
# auto-generated "countWords(text)"/"containsSpark(text)" column names:
# withColumnRenamed is a silent no-op when the given name is not in the schema.
newFeaturesDF = trainingData.selectExpr(
    "label",
    "text",
    "countWords(text) AS numLines",
    "containsSpark(text) AS SparkWord",
)

# OR
# Create an SQLTransformer to add two columns to the input dataframe:
# numLines and SparkWord
sqlTrans = SQLTransformer(statement="""SELECT *,
countWords(text) AS numLines,
containsSpark(text) AS SparkWord
FROM __THIS__""")

In [None]:
newFeaturesDF.printSchema()
newFeaturesDF.show()

In [None]:
root
 |-- label: integer (nullable = true)
 |-- text: string (nullable = true)
 |-- numLines: integer (nullable = true)
 |-- SparkWord: boolean (nullable = true)

+-----+--------------------+--------+---------+
|label|                text|numLines|SparkWord|
+-----+--------------------+--------+---------+
|    1|The Spark system ...|       7|     true|
|    1|Spark is a new di...|       6|     true|
|    0|Turin is a beauti...|       5|    false|
|    0|Turin is in the n...|       8|    false|
+-----+--------------------+--------+---------+

In [None]:
# Combine the "numLines" and "SparkWord" columns into a single
# Vector column named "features"
assembler = VectorAssembler(
    inputCols=["numLines", "SparkWord"],
    outputCol="features",
)

In [None]:
# Create a classification model based on the logistic regression algorithm.
# The parameters of the algorithm (maximum number of iterations and
# regularization parameter) are set through the constructor arguments.
lr = LogisticRegression(maxIter=10, regParam=0.01)

In [None]:
# Define the pipeline used to create the logistic regression model on the
# training data: first assemble the feature vector, then fit the classifier.
pipeline = Pipeline(stages=[assembler, lr])

In [None]:
# Execute the pipeline on the training data to build the 
# classification model
classificationModel = pipeline.fit(newFeaturesDF)

# Now, the classification model can be used to predict the class label
# of new unlabeled data

In [None]:
# *************************
# Prediction  step
# *************************
# Read unlabeled data
# Load unlabeledData.csv into a DataFrame (unlabeled data in raw format).
# The first line of the file is used as header and the column types are inferred.
# NOTE: this statement rebinds `unlabeledData` from the input path (a string)
# to the loaded DataFrame; re-running this cell alone would pass the
# DataFrame as the path, so re-run the cell that defines the paths first.
unlabeledData = spark.read.load(unlabeledData,\
                     format="csv", header=True, inferSchema=True)

In [None]:
#unlabeledData.printSchema()
#unlabeledData.show()

In [None]:
# Compute on the unlabeled data the same two columns computed on the
# training data: numLines and SparkWord.
# The new columns are named with explicit SQL aliases instead of renaming the
# auto-generated "countWords(text)"/"containsSpark(text)" column names:
# withColumnRenamed is a silent no-op when the given name is not in the schema.
newFeaturesDFunlabeled = unlabeledData.selectExpr(
    "label",
    "text",
    "countWords(text) AS numLines",
    "containsSpark(text) AS SparkWord",
)

In [None]:
newFeaturesDFunlabeled.printSchema()
newFeaturesDFunlabeled.show()

In [None]:
root
 |-- label: string (nullable = true)
 |-- text: string (nullable = true)
 |-- numLines: integer (nullable = true)
 |-- SparkWord: boolean (nullable = true)

+-----+--------------------+--------+---------+
|label|                text|numLines|SparkWord|
+-----+--------------------+--------+---------+
| null|Spark performs be...|       5|     true|
| null|Comparison betwee...|       5|     true|
| null|Turin is in Piedmont|       4|    false|
+-----+--------------------+--------+---------+

In [None]:
# Make predictions on unlabeled documents by using the
# Transformer.transform() method of the fitted pipeline.
# The pipeline stages are the assembler (numLines, SparkWord -> features)
# followed by the logistic regression model.
# NOTE(review): the model presumably reads only the 'features' column
# (the default features column of LogisticRegression) - confirm.
predictionsDF = classificationModel.transform(newFeaturesDFunlabeled)

In [None]:
predictionsDF.printSchema()
predictionsDF.show()

In [None]:
root
 |-- label: string (nullable = true)
 |-- text: string (nullable = true)
 |-- numLines: integer (nullable = true)
 |-- SparkWord: boolean (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)

+-----+--------------------+--------+---------+---------+--------------------+--------------------+----------+
|label|                text|numLines|SparkWord| features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------+---------+---------+--------------------+--------------------+----------+
| null|Spark performs be...|       5|     true|[5.0,1.0]|[-3.1272480248757...|[0.04199718899423...|       1.0|
| null|Comparison betwee...|       5|     true|[5.0,1.0]|[-3.1272480248757...|[0.04199718899423...|       1.0|
| null|Turin is in Piedmont|       4|    false|[4.0,0.0]|[3.19966999960023...|[0.96082185681571...|       0.0|
+-----+--------------------+--------+---------+---------+--------------------+--------------------+----------+

In [None]:
# The returned DataFrame has the following schema (attributes)
# |-- label: string (nullable = true)
# |-- text: string (nullable = true)
# |-- numLines: integer (nullable = true)
# |-- SparkWord: boolean (nullable = true)
# |-- features: vector (nullable = true)
# |-- rawPrediction: vector (nullable = true)
# |-- probability: vector (nullable = true)
# |-- prediction: double (nullable = false)

# Select only the original text attribute and
# the predicted class for each record
predictions = predictionsDF.select("text", "prediction")

In [None]:
predictions.printSchema()
predictions.show(truncate=False)

In [None]:
root
 |-- text: string (nullable = true)
 |-- prediction: double (nullable = false)

+-----------------------------------+----------+
|text                               |prediction|
+-----------------------------------+----------+
|Spark performs better than Hadoop  |1.0       |
|Comparison between Spark and Hadoop|1.0       |
|Turin is in Piedmont               |0.0       |
+-----------------------------------+----------+

In [None]:
# Save the result (with a header line) in the output folder.
# NOTE(review): with the default save mode the write fails if outputPath
# already exists - confirm that "./" is the intended target.
predictions.write.csv(outputPath, header=True)

## GraphFrame

### Compute the number of followers

In [None]:
from graphframes import GraphFrame

In [None]:
inputPathVertexes = "vertexes.csv"
inputPathEdges = "edges.csv"
outputPath = "./"

In [None]:
# Load vertexes.csv into a DataFrame
# (first line used as header, column types inferred)
vDF = spark.read.load(
    inputPathVertexes,
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
vDF.printSchema()
vDF.show()

In [None]:
root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+---+-----+---+
| id| name|age|
+---+-----+---+
| u1|Alice| 34|
| u2|  Bob| 36|
| u3| John| 30|
| u4|David| 29|
| u5| Paul| 32|
| u6| Adel| 36|
| u7| Eddy| 60|
+---+-----+---+

In [None]:
# Load edges.csv into a DataFrame
# (first line used as header, column types inferred)
eDF = spark.read.load(
    inputPathEdges,
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
eDF.printSchema()
eDF.show()

In [None]:
root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- linktype: string (nullable = true)

+---+---+--------+
|src|dst|linktype|
+---+---+--------+
| u1| u2|  friend|
| u1| u4|  friend|
| u1| u5|  friend|
| u2| u1|  friend|
| u2| u3|  follow|
| u3| u2|  follow|
| u4| u1|  friend|
| u4| u5|  friend|
| u5| u1|  friend|
| u5| u4|  friend|
| u5| u6|  follow|
| u6| u3|  follow|
| u7| u6|  follow|
+---+---+--------+

In [None]:
# Only the "follow" edges are useful
# Filter the input edge dataframe before creating the graph
filteredEdges = eDF.filter("linktype='follow' ")

In [None]:
# Create the input graph
g = GraphFrame(vDF, filteredEdges)

In [None]:
# Count the number of followers for each user (i.e., the number of in-links).
# The graph contains only "follow" edges, so inDegree = number of followers.
# NOTE: users without followers (no incoming edge) do not appear in the result.
userNumFollowersDF = g.inDegrees.withColumnRenamed("inDegree","numFollowers")

In [None]:
userNumFollowersDF.printSchema()
userNumFollowersDF.show()

In [None]:
root
 |-- id: string (nullable = true)
 |-- numFollowers: integer (nullable = false)

+---+------------+
| id|numFollowers|
+---+------------+
| u3|           2|
| u6|           2|
| u2|           1|
+---+------------+

In [None]:
# Save the result in the output folder
userNumFollowersDF.write.csv(outputPath, header=True)

### Extract users with the maximum number of followers

In [None]:
inputPathVertexes = "vertexes.csv"
inputPathEdges = "edges.csv"
outputPath = "./"

In [None]:
# Load vertexes.csv into a DataFrame
# (first line used as header, column types inferred)
vDF = spark.read.load(
    inputPathVertexes,
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Load edges.csv into a DataFrame
# (first line used as header, column types inferred)
eDF = spark.read.load(
    inputPathEdges,
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Only the "follow" edges are useful
# Filter the input edge dataframe before creating the graph
filteredEdfes = eDF.filter("linktype='follow' ")

In [None]:
# Create the input graph
g = GraphFrame(vDF, filteredEdfes)

In [None]:
# Count the number of followers for each user (i.e., the number of in-links).
# The graph contains only "follow" edges, so inDegree = number of followers.
# NOTE: users without followers (no incoming edge) do not appear in the result.
userNumFollowersDF = g.inDegrees.withColumnRenamed("inDegree","numFollowers")

In [None]:
# Select the user(s) with the maximum number of followers

In [None]:
# Compute the maximum value of "number of followers"
maxFollowersDF = userNumFollowersDF.agg({"numFollowers":"max"})\
.withColumnRenamed("max(numFollowers)","maxFollowers")

In [None]:
# Select the single record/Row of the DataFrame maxFollowersDF
rowMaxNumFollowers = maxFollowersDF.first()

In [None]:
# Retrieve the maximum number of followers from rowMaxNumFollowers
maxNumFollowers=rowMaxNumFollowers.maxFollowers

In [None]:
# Select the user(s) with the maximum number of followers
# (more than one user is returned in case of ties)
selectedUsersDF=userNumFollowersDF.filter(userNumFollowersDF.numFollowers==maxNumFollowers)
# Or
#selectedUsersDF=userNumFollowersDF.filter("numFollowers="+str(maxNumFollowers))

In [None]:
selectedUsersDF.printSchema()
selectedUsersDF.show()

In [None]:
root
 |-- id: string (nullable = true)
 |-- numFollowers: integer (nullable = false)

+---+------------+
| id|numFollowers|
+---+------------+
| u3|           2|
| u6|           2|
+---+------------+

In [None]:
# Save the result in the output folder
selectedUsersDF.write.csv(outputPath, header=True)

## Motif finding

In [None]:
inputPathVertexes = "vertexes.csv"
inputPathEdges = "edges.csv"
outputPath = "./"

In [None]:
# Load vertexes.csv into a DataFrame
# (first line used as header, column types inferred)
vDF = spark.read.load(
    inputPathVertexes,
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Load edges.csv into a DataFrame
# (first line used as header, column types inferred)
eDF = spark.read.load(
    inputPathEdges,
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Select only links of type friend
filteredEdges = eDF.filter("linktype='friend' ")

In [None]:
# Create the input graph
g = GraphFrame(vDF, filteredEdges)

In [None]:
# Select the paths/motifs (userx)-[]->(usery);!(usery)-[]->(userx),
# i.e., the pairs of users such that there is an edge from userx to usery
# but there is no edge from usery to userx.
# The graph contains only "friend" edges.
selectedPaths = g.find("(userx)-[]->(usery);!(usery)-[]->(userx)")

In [None]:
selectedPaths.printSchema()
selectedPaths.show()

In [None]:
root
 |-- userx: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: integer (nullable = true)
 |-- usery: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: integer (nullable = true)

+---------------+---------------+
|          userx|          usery|
+---------------+---------------+
|[u4, David, 29]|[u1, Alice, 34]|
|[u1, Alice, 34]|  [u2, Bob, 36]|
+---------------+---------------+

In [None]:
# Select only the ids of the users (rename the selected columns)
selectedPairsDF=selectedPaths.selectExpr("userx.id as IdFriend","usery.id as IdNotFriend")

In [None]:
selectedPairsDF.printSchema()
selectedPairsDF.show()

In [None]:
root
 |-- IdFriend: string (nullable = true)
 |-- IdNotFriend: string (nullable = true)

+--------+-----------+
|IdFriend|IdNotFriend|
+--------+-----------+
|      u4|         u1|
|      u1|         u2|
+--------+-----------+

In [None]:
# Save the result in the output folder
selectedPairsDF.write.csv(outputPath, header=True)

### Select users and topics of interest

In [None]:
inputPathVertexes = "vertexes.csv"
inputPathEdges = "edges.csv"
outputPath = "./"

In [None]:
# Load vertexes.csv into a DataFrame
# (first line used as header, column types inferred)
vDF = spark.read.load(
    inputPathVertexes,
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
vDF.printSchema()
vDF.show()

In [None]:
root
 |-- id: string (nullable = true)
 |-- entityName: string (nullable = true)
 |-- name: string (nullable = true)

+---+----------+--------+
| id|entityName|    name|
+---+----------+--------+
| V1|      user|   Paolo|
| V2|     topic|     SQL|
| V3|      user|   David|
| V4|     topic|Big Data|
| V5|      user|    John|
+---+----------+--------+

In [None]:
# Load edges.csv into a DataFrame
# (first line used as header, column types inferred)
eDF = spark.read.load(
    inputPathEdges,
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
eDF.printSchema()
eDF.show()

In [None]:
root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- linktype: string (nullable = true)

+---+---+----------+
|src|dst|  linktype|
+---+---+----------+
| V1| V2|      like|
| V1| V3|    follow|
| V1| V4|    follow|
| V3| V2|      like|
| V3| V4|      like|
| V5| V2|  expertOf|
| V2| V4|correlated|
| V4| V2|correlated|
+---+---+----------+

In [None]:
# Only the "follow" edges are useful
# Filter the input edge dataframe before creating the graph
filteredEdfes = eDF.filter("linktype='follow' ")

In [None]:
# Create the input graph
g = GraphFrame(vDF, filteredEdfes)

In [None]:
pathsDF = g.find("(v1)-[]->(v2)")

In [None]:
pathsDF.printSchema()
pathsDF.show()

In [None]:
root
 |-- v1: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- v2: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)

+-----------------+--------------------+
|               v1|                  v2|
+-----------------+--------------------+
|[V1, user, Paolo]|   [V3, user, David]|
|[V1, user, Paolo]|[V4, topic, Big D...|
|[V3, user, David]|    [V2, topic, SQL]|
|[V3, user, David]|[V4, topic, Big D...|
+-----------------+--------------------+

In [None]:
# Select the pair (user,topic)
selectedPathsDF = pathsDF.filter("v1.entityName='user' AND v2.entityName='topic' ")

In [None]:
selectedPathsDF.printSchema()
selectedPathsDF.show()

In [None]:
root
 |-- v1: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- v2: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)

+-----------------+--------------------+
|               v1|                  v2|
+-----------------+--------------------+
|[V1, user, Paolo]|[V4, topic, Big D...|
|[V3, user, David]|    [V2, topic, SQL]|
|[V3, user, David]|[V4, topic, Big D...|
+-----------------+--------------------+

In [None]:
# Select name of the user and "name" of the topic
userTopicDF = selectedPathsDF.selectExpr("v1.name as username", "v2.name as topic")

In [None]:
userTopicDF.show()

In [None]:
+--------+--------+
|username|   topic|
+--------+--------+
|   Paolo|Big Data|
|   David|     SQL|
|   David|Big Data|
+--------+--------+

In [None]:
# Save the result in the output folder
userTopicDF.write.csv(outputPath, header=True)

### Filter users by edge type and topic

In [None]:
inputPathVertexes = "vertexes.csv"
inputPathEdges = "edges.csv"
outputPath = "./"

In [None]:
# Load vertexes.csv into a DataFrame
# (first line used as header, column types inferred)
vDF = spark.read.load(
    inputPathVertexes,
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
vDF.printSchema()
vDF.show()

In [None]:
root
 |-- id: string (nullable = true)
 |-- entityName: string (nullable = true)
 |-- name: string (nullable = true)

+---+----------+--------+
| id|entityName|    name|
+---+----------+--------+
| V1|      user|   Paolo|
| V2|     topic|     SQL|
| V3|      user|   David|
| V4|     topic|Big Data|
| V5|      user|    John|
+---+----------+--------+

In [None]:
# Load edges.csv into a DataFrame
# (first line used as header, column types inferred)
eDF = spark.read.load(
    inputPathEdges,
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
eDF.printSchema()
eDF.show()

In [None]:
root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- linktype: string (nullable = true)

+---+---+----------+
|src|dst|  linktype|
+---+---+----------+
| V1| V2|      like|
| V1| V3|    follow|
| V1| V4|    follow|
| V3| V2|    follow|
| V3| V4|    follow|
| V5| V2|  expertOf|
| V2| V4|correlated|
| V4| V2|correlated|
+---+---+----------+

In [None]:
# Only the "follow"  and "correlated" edges are useful
# Filter the input edge dataframe before creating the graph
filteredEdfes = eDF.filter("linktype='follow' OR linktype='correlated' ")

In [None]:
# Create the input graph
g = GraphFrame(vDF, filteredEdfes)

In [None]:
pathsDF = g.find("(v1)-[e1]->(v2);(v2)-[e2]->(v3)")

In [None]:
pathsDF.printSchema()
pathsDF.show()

In [None]:
root
 |-- v1: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- e1: struct (nullable = false)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |    |-- linktype: string (nullable = true)
 |-- v2: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- e2: struct (nullable = false)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |    |-- linktype: string (nullable = true)
 |-- v3: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  v1|                  e1|                  v2|                  e2|                  v3|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|   [V1, user, Paolo]|    [V1, V3, follow]|   [V3, user, David]|    [V3, V4, follow]|[V4, topic, Big D...|
|   [V1, user, Paolo]|    [V1, V3, follow]|   [V3, user, David]|    [V3, V2, follow]|    [V2, topic, SQL]|
|   [V1, user, Paolo]|    [V1, V4, follow]|[V4, topic, Big D...|[V4, V2, correlated]|    [V2, topic, SQL]|
|   [V3, user, David]|    [V3, V2, follow]|    [V2, topic, SQL]|[V2, V4, correlated]|[V4, topic, Big D...|
|   [V3, user, David]|    [V3, V4, follow]|[V4, topic, Big D...|[V4, V2, correlated]|    [V2, topic, SQL]|
|    [V2, topic, SQL]|[V2, V4, correlated]|[V4, topic, Big D...|[V4, V2, correlated]|    [V2, topic, SQL]|
|[V4, topic, Big D...|[V4, V2, correlated]|    [V2, topic, SQL]|[V2, V4, correlated]|[V4, topic, Big D...|
+--------------------+--------------------+--------------------+--------------------+--------------------+

In [None]:
# Select the triples user -> follow -> topic -> correlated -> topic="Big Data",
# i.e., the users who follow a topic that is correlated with the topic
# named "Big Data"
selectedPathsDF = pathsDF.filter("""v1.entityName='user' 
AND e1.linktype='follow'
AND v2.entityName='topic'
AND e2.linktype='correlated'
AND v3.entityName='topic' AND v3.name='Big Data' """)

In [None]:
selectedPathsDF.printSchema()
selectedPathsDF.show()

In [None]:
root
 |-- v1: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- e1: struct (nullable = false)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |    |-- linktype: string (nullable = true)
 |-- v2: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- e2: struct (nullable = false)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |    |-- linktype: string (nullable = true)
 |-- v3: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- entityName: string (nullable = true)
 |    |-- name: string (nullable = true)

+-----------------+----------------+----------------+--------------------+--------------------+
|               v1|              e1|              v2|                  e2|                  v3|
+-----------------+----------------+----------------+--------------------+--------------------+
|[V3, user, David]|[V3, V2, follow]|[V2, topic, SQL]|[V2, V4, correlated]|[V4, topic, Big D...|
+-----------------+----------------+----------------+--------------------+--------------------+

In [None]:
# Select name of the selected users
usersDF = selectedPathsDF.selectExpr("v1.name as username")

In [None]:
usersDF.show()

In [None]:
+--------+
|username|
+--------+
|   David|
+--------+

In [None]:
# Save the result in the output folder
usersDF.write.csv(outputPath, header=True)

### Select closest nodes to user1

In [None]:
inputPathVertexes = "vertexes.csv"
inputPathEdges = "edges.csv"
outputPath = "./"

In [None]:
# Load vertexes.csv into a DataFrame
# (first line used as header, column types inferred)
vDF = spark.read.load(
    inputPathVertexes,
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
vDF.printSchema()
vDF.show()

In [None]:
root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+---+-----+---+
| id| name|age|
+---+-----+---+
| u1|Alice| 34|
| u2|  Bob| 36|
| u3| John| 30|
| u4|David| 29|
| u5| Paul| 32|
| u6| Adel| 36|
| u7| Eddy| 60|
+---+-----+---+

In [None]:
# Load edges.csv into a DataFrame
# (first line used as header, column types inferred)
eDF = spark.read.load(
    inputPathEdges,
    format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
eDF.printSchema()
eDF.show()

In [None]:
root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- linktype: string (nullable = true)

+---+---+--------+
|src|dst|linktype|
+---+---+--------+
| u1| u2|  friend|
| u1| u4|  friend|
| u1| u5|  friend|
| u2| u1|  friend|
| u2| u3|  follow|
| u3| u2|  follow|
| u4| u1|  friend|
| u4| u5|  friend|
| u5| u1|  friend|
| u5| u4|  friend|
| u5| u6|  follow|
| u6| u3|  follow|
+---+---+--------+

In [None]:
# Create the input graph
g = GraphFrame(vDF, eDF)

In [None]:
# Compute for each vertex the length of the shortest path to u1
shortPathsLengDF = g.shortestPaths(['u1'])

In [None]:
shortPathsLengDF.printSchema()
shortPathsLengDF.show()

In [None]:
root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- distances: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = false)

+---+-----+---+---------+
| id| name|age|distances|
+---+-----+---+---------+
| u6| Adel| 36|[u1 -> 3]|
| u3| John| 30|[u1 -> 2]|
| u2|  Bob| 36|[u1 -> 1]|
| u4|David| 29|[u1 -> 1]|
| u5| Paul| 32|[u1 -> 1]|
| u1|Alice| 34|[u1 -> 0]|
| u7| Eddy| 60|       []|
+---+-----+---+---------+

In [None]:
# Select only the users who can reach u1 in less than 3 "hops"
selectedUsersDF=shortPathsLengDF.filter("distances['u1']<3 AND id<>'u1' ")

In [None]:
selectedUsersDF.printSchema()
selectedUsersDF.show()

In [None]:
root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- distances: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = false)

+---+-----+---+---------+
| id| name|age|distances|
+---+-----+---+---------+
| u3| John| 30|[u1 -> 2]|
| u2|  Bob| 36|[u1 -> 1]|
| u4|David| 29|[u1 -> 1]|
| u5| Paul| 32|[u1 -> 1]|
+---+-----+---+---------+

In [None]:
# Create a DataFrame with Columns name and numHops
usersNameNumHopsDF=selectedUsersDF.selectExpr("name", "distances['u1'] AS numHops")

In [None]:
usersNameNumHopsDF.printSchema()
usersNameNumHopsDF.show()

In [None]:
root
 |-- name: string (nullable = true)
 |-- numHops: integer (nullable = true)

+-----+-------+
| name|numHops|
+-----+-------+
| John|      2|
|  Bob|      1|
|David|      1|
| Paul|      1|
+-----+-------+

In [None]:
# Save the result in the output folder
usersNameNumHopsDF.write.csv(outputPath, header=True)