# Spark GraphFrames

In [None]:
import numpy as np

In [None]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every Spark call below (getOrCreate reuses an
# already-running session when there is one).
spark = SparkSession.builder.getOrCreate()

In [None]:
from pyspark.sql.types import *
from graphframes import *

In [None]:
import graphframes

**Notes on installing GraphFrames**

I needed to put the GraphFrames jar file into the Spark jars directory for it to work.

```bash
cd /usr/local/spark/jars/
wget http://dl.bintray.com/spark-packages/maven/graphframes/graphframes/0.7.0-spark2.4-s_2.11/graphframes-0.7.0-spark2.4-s_2.11.jar
```

In [None]:
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Load the Les Miserables example graph that ships with NetworkX.
G = nx.les_miserables_graph()

In [None]:
# Draw the network with a Kamada-Kawai layout on a wide 14x6 canvas,
# labelling every node.
plt.figure(figsize=(14, 6))
nx.draw_kamada_kawai(G, with_labels=True, node_size=500, node_color='lightblue')

In [None]:
# Seed NumPy's global RNG so the random attributes generated in this notebook
# are identical on every Restart & Run All (later np.random draws in a
# top-to-bottom run continue from this seeded state).
np.random.seed(42)

# One (src, dst, weight) tuple per edge of G. The weight is a random integer
# in {1, 2, 3}: randint's upper bound is exclusive.
edges = [(a, b, np.random.randint(1, 4)) for (a, b) in nx.edges(G)]

In [None]:
# One (id, name, age) record per node of G. The node's name is used for both
# the id and the name field; age is a random integer from 15 to 59 inclusive.
nodes = [
    (character, character, np.random.randint(15, 60))
    for character in nx.nodes(G)
]

In [None]:
# Peek at the first three (src, dst, weight) tuples.
edges[:3]

In [None]:
# Peek at the first three (id, name, age) tuples.
nodes[:3]

## Creation of a GraphFrame

- Need to provide a DataFrame of vertices, and a DataFrame of edges

### Vertices must contain an id column

In [None]:
# Vertex DataFrame built from the node tuples; columns are id, name and age.
v = spark.createDataFrame(
    nodes,
    schema=['id', 'name', 'age'],
)

### Edges must have src and dst columns that contain vertex ids

In [None]:
# Edge DataFrame built from the edge tuples; src/dst hold vertex ids and wt is
# the edge weight.
e = spark.createDataFrame(
    edges,
    schema=['src', 'dst', 'wt'],
)

In [None]:
# Combine the vertex DataFrame (v) and edge DataFrame (e) into a GraphFrame.
g = GraphFrame(v, e)

## Basic information

In [None]:
# Display the GraphFrame's repr.
g

In [None]:
# First five rows of the vertex DataFrame.
g.vertices.show(5)

In [None]:
# First five rows of the edge DataFrame.
g.edges.show(5)

## Search

In [None]:
from pyspark.sql.functions import col

In [None]:
# Show the first five vertices again as a reference for the queries below.
g.vertices.show(5)

### Can use regular DataFrame functions

In [None]:
# Vertices whose name begins with 'C' (first five rows).
(
    g.vertices
    .filter(col('name').startswith('C'))
    .show(5)
)

In [None]:
# Count how many edges carry each weight value.
(
    g.edges
    .groupBy('wt')
    .count()
    .show()
)

## Motifs

In [None]:
# Simplest motif: a single named vertex, a.
g.find('(a)').show(5)

In [None]:
# Motif for one directed edge e going from vertex a to vertex b.
g.find('(a)-[e]->(b)').show(5, truncate=False)

In [None]:
# Two-edge chains a -> b -> c; keep only the two edge columns for display.
g.find('(a)-[e1]->(b); (b)-[e2]->(c)').select('e1', 'e2').show(5, truncate=False)

In [None]:
# Pairs with an edge a -> b but no edge back from b to a ('!' negates a term;
# '[]' leaves the edge unnamed).
g.find('(a)-[]->(b); !(b)-[]->(a)').show(5, truncate=False)

In [None]:
# Pairs connected in both directions: a -> b and b -> a.
g.find('(a)-[]->(b); (b)-[]->(a)').show(5, truncate=False)

## Extracting subgraphs

In [None]:
# Subgraph of vertices older than 30 joined by edges of weight below 3, with
# vertices left without any edge removed.
# NOTE(review): these filter helpers may need a newer GraphFrames release than
# the 0.7.0 jar mentioned in the install notes - confirm.
g1 = (
    g
    .filterVertices("age > 30")
    .filterEdges('wt < 3')
    .dropIsolatedVertices()
)

In [None]:
# Vertex count of the full graph, for comparison with the subgraph.
g.vertices.count()

In [None]:
# Vertex count of the filtered subgraph.
g1.vertices.count()

## Pathfinding

In [None]:
# Breadth-first search starting at the vertex named 'Cosette' and ending at
# vertices that satisfy age > 30.
path = g.bfs("name = 'Cosette'", "age > 30")

In [None]:
# Show the BFS result without truncating column contents.
path.show(truncate=False)

In [None]:
# Same search, but the target must have age exactly 50. Ages are randomly
# generated above, so there may be no such vertex.
path = g.bfs("name = 'Cosette'", "age = 50")

In [None]:
# Show the BFS result without truncating column contents.
path.show(truncate=False)

In [None]:
# Shortest-path computation using 'Cosette' as the only landmark vertex.
paths = g.shortestPaths(landmarks=['Cosette'])

In [None]:
# Show the shortestPaths result computed in the previous cell. The original
# printed `path`, the stale BFS result, instead of `paths`.
paths.show(truncate=False)

## Centrality

In [None]:
# Degree of each vertex (first five rows).
g.degrees.show(5)

In [None]:
# In-degree of each vertex (first five rows). The GraphFrame property is
# spelled `inDegrees`; `indegrees` raises AttributeError.
g.inDegrees.show(5)

In [None]:
# Out-degree of each vertex (first five rows). The GraphFrame property is
# spelled `outDegrees`; `outdegrees` raises AttributeError.
g.outDegrees.show(5)

In [None]:
# PageRank over the whole graph: reset probability 0.15, fixed at 5 iterations.
res = g.pageRank(resetProbability=0.15, maxIter=5)

In [None]:
# pageRank returns a GraphFrame, which has no .show() of its own; the scores
# live on its vertex DataFrame, so display that.
res.vertices.show(truncate=False)

In [None]:
# Personalized PageRank from several source vertices at once. pageRank() only
# accepts a single `sourceId`; the multi-source `sourceIds` argument belongs to
# parallelPersonalizedPageRank(). The second id is spelled 'Napoleon' to match
# the vertex name in the Les Miserables graph.
res = g.parallelPersonalizedPageRank(
    resetProbability=0.15,
    maxIter=5,
    sourceIds=['Cosette', 'Napoleon'],
)

In [None]:
# The PageRank result is a GraphFrame, which has no .show() of its own; display
# its vertex DataFrame, where the scores are stored.
res.vertices.show(truncate=False)

## Community detection

In [None]:
# Point Spark at a checkpoint directory (relative path "checkpoint").
# NOTE(review): presumably required by connectedComponents() below, which
# checkpoints intermediate results - confirm against the GraphFrames docs.
sc = spark.sparkContext
sc.setCheckpointDir("checkpoint")

In [None]:
# Keep only the edges of weight 1. Note that this rebinds g1, replacing the
# subgraph built in the "Extracting subgraphs" section.
g1 = g.filterEdges('wt = 1')

In [None]:
# Connected components of the weight-1 subgraph.
res = g1.connectedComponents()

In [None]:
# Show the connected-components result without truncating column contents.
res.show(truncate=False)

In [None]:
# Strongly connected components of the weight-1 subgraph, limited to 5 iterations.
res = g1.stronglyConnectedComponents(maxIter=5)

In [None]:
# Show the strongly-connected-components result without truncating column contents.
res.show(truncate=False)

In [None]:
# Triangle counts computed on the full graph g (not the g1 subgraph).
res = g.triangleCount()

In [None]:
# Show the triangle-count result without truncating column contents.
res.show(truncate=False)

In [None]:
# Label propagation on the full graph g, limited to 5 iterations.
res = g.labelPropagation(maxIter=5)

In [None]:
# Show the label-propagation result without truncating column contents.
res.show(truncate=False)