# GraphFrames example - 01



In [1]:
from graphframes import GraphFrame

In [2]:
v = spark.createDataFrame([
	("a", "Alice", 34), 
	("b", "Bob", 36), 
	("c", "Charlie", 30), 
	("d", "David", 29), 
	("e", "Esther", 32), 
	("f", "Fanny", 36), 
	("g", "Gabby", 60) 
], ["id", "name", "age"])

In [3]:
e = spark.createDataFrame([ 
	("a", "b", "friend"), 
	("b", "c", "follow"), 
	("c", "b", "follow"), 
	("f", "c", "follow"), 
	("e", "f", "follow"), 
	("e", "d", "friend"), 
	("d", "a", "friend"), 
	("a", "e", "friend") 
], ["src", "dst", "relationship"]) 

In [4]:
g1 = GraphFrame(v, e)

In [5]:
g1.vertices.show()

                                                                                

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
|  d|  David| 29|
|  e| Esther| 32|
|  f|  Fanny| 36|
|  g|  Gabby| 60|
+---+-------+---+



In [6]:
g1.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
|  f|  c|      follow|
|  e|  f|      follow|
|  e|  d|      friend|
|  d|  a|      friend|
|  a|  e|      friend|
+---+---+------------+



## Find n. of edges and n. of vertices

In [7]:
print(f"Number of edges: {g1.edges.count()}")
print(f"Number of vertices: {g1.vertices.count()}")

Number of edges: 8
Number of vertices: 7


## Find lowest age among the vertices

In [8]:
g1.vertices.sort("age").show(1)

+---+-----+---+
| id| name|age|
+---+-----+---+
|  d|David| 29|
+---+-----+---+
only showing top 1 row



## Count number of "follow" edges in the graph

In [9]:
n_follow = g1.edges.filter("relationship == 'follow'").count()
print(f"Number of 'follow' edges: {n_follow}")

Number of 'follow' edges: 4


## *Motif finding* - find all users which follow each other

In [10]:
g1.find("(a)-[]->(b); (b)-[]->(a)").show()



+----------------+----------------+
|               a|               b|
+----------------+----------------+
|[c, Charlie, 30]|    [b, Bob, 36]|
|    [b, Bob, 36]|[c, Charlie, 30]|
+----------------+----------------+



                                                                                

## **!!** ***Motif finding* - find chains of 4 users in which at least 2 of the 3 connections are 'friend'**

In [11]:
fourInARow = g1.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3); (v3)-[e3]->(v4)")
fourInARow.show()

                                                                                

+----------------+--------------+----------------+--------------+----------------+--------------+----------------+
|              v1|            e1|              v2|            e2|              v3|            e3|              v4|
+----------------+--------------+----------------+--------------+----------------+--------------+----------------+
|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|
| [e, Esther, 32]|[e, d, friend]|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|
|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, d, friend]|  [d, David, 29]|
|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|
|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|
|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36

In [15]:
def cond2Of3(e1, e2, e3):
    a = (e1.relationship == "friend")
    b = (e2.relationship == "friend")
    c = (e3.relationship == "friend")
    
    return (int(a)+int(b)+int(c) >= 2)

from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf

#udf(cond2Of3, BooleanType())
spark.udf.register('cond', cond2Of3, BooleanType())

fourInARow.filter('cond(fourInARow.e1, fourInARow.e2, fourInARow.e3)').show()

22/12/17 10:00:56 WARN analysis.SimpleFunctionRegistry: The function cond replaced a previously registered function.


AnalysisException: "cannot resolve '`fourInARow.e1`' given input columns: [e2, v4, v3, v1, e3, e1, v2]; line 1 pos 5;\n'Filter 'cond('fourInARow.e1, 'fourInARow.e2, 'fourInARow.e3)\n+- Project [v1#121, e1#119, v2#123, e2#144, v3#146, e3#179, v4#181]\n   +- Join Inner, (e3#179.dst = v4#181.id)\n      :- Join Inner, (e3#179.src = v3#146.id)\n      :  :- Join Inner, (e2#144.dst = v3#146.id)\n      :  :  :- Join Inner, (e2#144.src = v2#123.id)\n      :  :  :  :- Join Inner, (e1#119.dst = v2#123.id)\n      :  :  :  :  :- Join Inner, (e1#119.src = v1#121.id)\n      :  :  :  :  :  :- Project [named_struct(src, src#6, dst, dst#7, relationship, relationship#8) AS e1#119]\n      :  :  :  :  :  :  +- LogicalRDD [src#6, dst#7, relationship#8], false\n      :  :  :  :  :  +- Project [named_struct(id, id#0, name, name#1, age, age#2L) AS v1#121]\n      :  :  :  :  :     +- LogicalRDD [id#0, name#1, age#2L], false\n      :  :  :  :  +- Project [named_struct(id, id#0, name, name#1, age, age#2L) AS v2#123]\n      :  :  :  :     +- LogicalRDD [id#0, name#1, age#2L], false\n      :  :  :  +- Project [named_struct(src, src#6, dst, dst#7, relationship, relationship#8) AS e2#144]\n      :  :  :     +- LogicalRDD [src#6, dst#7, relationship#8], false\n      :  :  +- Project [named_struct(id, id#0, name, name#1, age, age#2L) AS v3#146]\n      :  :     +- LogicalRDD [id#0, name#1, age#2L], false\n      :  +- Project [named_struct(src, src#6, dst, dst#7, relationship, relationship#8) AS e3#179]\n      :     +- LogicalRDD [src#6, dst#7, relationship#8], false\n      +- Project [named_struct(id, id#0, name, name#1, age, age#2L) AS v4#181]\n         +- LogicalRDD [id#0, name#1, age#2L], false\n"

In [None]:
## Why won't this work???
