## Exercise 52 - inDegrees

In [9]:
from graphframes import GraphFrame

# Folder where the exercise result is written.
outputPath = "resOut_ex52/"

# Input CSV files describing the graph: its edges and its vertices.
inputPathEdges = "/data/students/bigdata-01QYD/ex_data/Ex52/data/edges.csv"
inputPathVertexes = "/data/students/bigdata-01QYD/ex_data/Ex52/data/vertexes.csv"

In [7]:
# Load the vertex and edge tables from CSV. The first line of each file holds
# the column names, and the column types are inferred from the data.
verticesDF = spark.read.csv(inputPathVertexes, header=True, inferSchema=True)
edgesDF = spark.read.csv(inputPathEdges, header=True, inferSchema=True)

In [4]:
# Inspect the vertex table: print its column names and types, then display its rows.
verticesDF.printSchema()
verticesDF.show()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+---+-----+---+
| id| name|age|
+---+-----+---+
| u1|Alice| 34|
| u2|  Bob| 36|
| u3| John| 30|
| u4|David| 29|
| u5| Paul| 32|
| u6| Adel| 36|
| u7| Eddy| 60|
+---+-----+---+



In [5]:
# Inspect the edge table: print its column names and types, then display its rows.
edgesDF.printSchema()
edgesDF.show()

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- linktype: string (nullable = true)

+---+---+--------+
|src|dst|linktype|
+---+---+--------+
| u1| u2|  friend|
| u1| u4|  friend|
| u1| u5|  friend|
| u2| u1|  friend|
| u2| u3|  follow|
| u3| u2|  follow|
| u4| u1|  friend|
| u4| u5|  friend|
| u5| u1|  friend|
| u5| u4|  friend|
| u5| u6|  follow|
| u6| u3|  follow|
| u7| u6|  follow|
+---+---+--------+



For each user with at least one follower, store the user's number of followers in the output folder <br />
▪ One user per line <br />
▪ Format: user id, number of followers <br />

In [29]:
# Keep only the edges that represent a "follow" relationship.
filteredEdges = edgesDF.where("linktype = 'follow'")

# Build the graph from all the vertices and the "follow" edges only.
g = GraphFrame(verticesDF, filteredEdges)

# The in-degree of a vertex in this graph is its number of followers.
# Vertices left without any incoming "follow" edge need no explicit removal:
# they simply do not appear in inDegrees.
nFollowersDF = g.inDegrees

In [30]:
# Give the degree column the name used in the exercise, then display the result.
nFollowersDF = nFollowersDF.withColumnRenamed(existing="inDegree", new="nFollowers")
nFollowersDF.show()

+---+----------+
| id|nFollowers|
+---+----------+
| u3|         2|
| u6|         2|
| u2|         1|
+---+----------+



In [None]:
# Store the result as CSV: one "user id,number of followers" line per user.
# DataFrameWriter.csv() already implies the CSV format and has no `format`
# keyword (passing one raises TypeError), so only the path is given. The header
# is left off so that every output line is a data line.
nFollowersDF.write.csv(outputPath, header=False)

## V2: Less optimized way (filter the edges after building the graph)

In [32]:
# Build the graph on the complete edge set first, and only afterwards
# restrict it to the "follow" edges.
g2 = GraphFrame(verticesDF, edgesDF).filterEdges("linktype = 'follow'")

# In-degree on the filtered graph = number of followers; rename the column accordingly.
nFollowersDF2 = g2.inDegrees.withColumnRenamed("inDegree", "nFollowers")
nFollowersDF2.show()

+---+----------+
| id|nFollowers|
+---+----------+
| u3|         2|
| u6|         2|
| u2|         1|
+---+----------+

