In [1]:
# Importing Libraries
from pyspark import SparkContext,SQLContext
from graphframes import *
from pyspark.sql import functions as f

In [2]:
# Tell PySpark which extra package to pull in when it launches the JVM.
# This has to be in the environment before the SparkContext is created.
import os

GRAPHFRAMES_PACKAGE = "graphframes:graphframes:0.8.0-spark2.4-s_2.11"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages  {} pyspark-shell".format(GRAPHFRAMES_PACKAGE)

In [3]:
# Reuse the running SparkContext (or start one) and wrap it in a SQLContext,
# which is the entry point used below for reading the CSV files.
sc = SparkContext.getOrCreate()
sqlcontext = SQLContext(sparkContext=sc)

In [4]:
# Load the station and trip CSV files; the first row of each file holds the column names.
# No schema is inferred, so every column is read as a string.
station_df = sqlcontext.read.csv('201508_station_data.csv', header=True)
trips_df = sqlcontext.read.csv('201508_trip_data.csv', header=True)

In [5]:
# Preview the first rows of the station data (long values are truncated for display).
station_df.show(n=20, truncate=True)

+----------+--------------------+---------+-----------+---------+------------+------------+
|station_id|                name|      lat|       long|dockcount|    landmark|installation|
+----------+--------------------+---------+-----------+---------+------------+------------+
|         2|San Jose Diridon ...|37.329732|-121.901782|       27|    San Jose|    8/6/2013|
|         3|San Jose Civic Ce...|37.330698|-121.888979|       15|    San Jose|    8/5/2013|
|         4|Santa Clara at Al...|37.333988|-121.894902|       11|    San Jose|    8/6/2013|
|         5|    Adobe on Almaden|37.331415|  -121.8932|       19|    San Jose|    8/5/2013|
|         6|    San Pedro Square|37.336721|-121.894074|       15|    San Jose|    8/7/2013|
|         7|Paseo de San Antonio|37.333798|-121.886943|       15|    San Jose|    8/7/2013|
|         8| San Salvador at 1st|37.330165|-121.885831|       15|    San Jose|    8/5/2013|
|         9|           Japantown|37.348742|-121.894715|       15|    San Jose|  

In [6]:
# Preview the first rows of the trip data (long values are truncated for display).
trips_df.show(n=20, truncate=True)

+-------+--------+---------------+--------------------+--------------+---------------+--------------------+------------+------+---------------+--------+
|Trip ID|Duration|     Start Date|       Start Station|Start Terminal|       End Date|         End Station|End Terminal|Bike #|Subscriber Type|Zip Code|
+-------+--------+---------------+--------------------+--------------+---------------+--------------------+------------+------+---------------+--------+
| 913460|     765|8/31/2015 23:26|Harry Bridges Pla...|            50|8/31/2015 23:39|San Francisco Cal...|          70|   288|     Subscriber|    2139|
| 913459|    1036|8/31/2015 23:11|San Antonio Shopp...|            31|8/31/2015 23:28|Mountain View Cit...|          27|    35|     Subscriber|   95032|
| 913455|     307|8/31/2015 23:13|      Post at Kearny|            47|8/31/2015 23:18|   2nd at South Park|          64|   468|     Subscriber|   94107|
| 913454|     409|8/31/2015 23:10|  San Jose City Hall|            10|8/31/2015 23

In [7]:
# Vertices and Edges
# The graph is keyed by station name: the station "name" column becomes the vertex
# "id", and each trip's start/end station names become the edge "src"/"dst".
# NOTE(review): the trip data spells a station "Post at Kearny", while the
# triangle-count output further down lists a vertex "Post at Kearney" with count 0.
# Station names may not match exactly between the two files — verify.
vertices = (
    station_df
    .withColumnRenamed("name", "id")
    .distinct()
)
edges = (
    trips_df
    .withColumnRenamed("Start Station", "src")
    .withColumnRenamed("End Station", "dst")
)

In [8]:
# Preview the vertex DataFrame (stations, keyed by "id").
vertices.show(n=20, truncate=True)

+----------+--------------------+---------+-----------+---------+-------------+------------+
|station_id|                  id|      lat|       long|dockcount|     landmark|installation|
+----------+--------------------+---------+-----------+---------+-------------+------------+
|        51|Embarcadero at Fo...|37.791464|-122.391034|       19|San Francisco|   8/20/2013|
|        58|San Francisco Cit...| 37.77865|-122.418235|       19|San Francisco|   8/21/2013|
|        60|Embarcadero at Sa...| 37.80477|-122.403234|       15|San Francisco|   8/21/2013|
|        65|     Townsend at 7th|37.771058|-122.402717|       15|San Francisco|   8/22/2013|
|        63|       Howard at 2nd|37.786978|-122.398108|       19|San Francisco|   8/22/2013|
|        33|Rengstorff Avenue...|37.400241|-122.099076|       15|Mountain View|   8/16/2013|
|        25|Stanford in Redwo...| 37.48537|-122.203288|       15| Redwood City|   8/12/2013|
|        71|Powell at Post (U...|37.788446|-122.408499|       19|San F

In [9]:
# Preview the edge DataFrame (trips, with "src" and "dst" station names).
edges.show(n=20, truncate=True)

+-------+--------+---------------+--------------------+--------------+---------------+--------------------+------------+------+---------------+--------+
|Trip ID|Duration|     Start Date|                 src|Start Terminal|       End Date|                 dst|End Terminal|Bike #|Subscriber Type|Zip Code|
+-------+--------+---------------+--------------------+--------------+---------------+--------------------+------------+------+---------------+--------+
| 913460|     765|8/31/2015 23:26|Harry Bridges Pla...|            50|8/31/2015 23:39|San Francisco Cal...|          70|   288|     Subscriber|    2139|
| 913459|    1036|8/31/2015 23:11|San Antonio Shopp...|            31|8/31/2015 23:28|Mountain View Cit...|          27|    35|     Subscriber|   95032|
| 913455|     307|8/31/2015 23:13|      Post at Kearny|            47|8/31/2015 23:18|   2nd at South Park|          64|   468|     Subscriber|   94107|
| 913454|     409|8/31/2015 23:10|  San Jose City Hall|            10|8/31/2015 23

# Graph Creation

In [10]:
# Build the GraphFrame from the station vertices and the trip edges.
graph = GraphFrame(v=vertices, e=edges)

In [11]:
# Checking the graph created: the bare expression displays the GraphFrame's
# summary of its vertex and edge columns.
graph

GraphFrame(v:[id: string, station_id: string ... 5 more fields], e:[src: string, dst: string ... 9 more fields])

# Triangle Count

It computes the number of triangles passing through each vertex.

It returns a DataFrame of the vertices with a new column "count".

In [12]:
# Run triangle counting on the station graph; the result is displayed in the next cell.
triangleCount = graph.triangleCount()

In [13]:
# Show each station alongside its triangle count.
triangle_counts_by_station = triangleCount.select("id", "count")
triangle_counts_by_station.show()

+--------------------+-----+
|                  id|count|
+--------------------+-----+
|       2nd at Folsom|  496|
|California Ave Ca...|   23|
|Washington at Kea...|    0|
|Powell at Post (U...|  496|
| Golden Gate at Polk|  496|
|Yerba Buena Cente...|  496|
|   Market at Sansome|  496|
|         MLK Library|   90|
|     Spear at Folsom|  496|
|           Japantown|   77|
|Commercial at Mon...|  496|
|Paseo de San Antonio|   81|
|Rengstorff Avenue...|   23|
| San Salvador at 1st|   61|
|     Townsend at 7th|  496|
|Civic Center BART...|  496|
|         Ryland Park|   41|
|San Jose Diridon ...|   90|
|San Jose Civic Ce...|   63|
|     Post at Kearney|    0|
+--------------------+-----+
only showing top 20 rows



# Shortest Path w.r.t Landmark

It gives the shortest paths from each vertex to the given set of landmark vertices, where landmarks are specified by vertex ID.

It returns a DataFrame of the vertices with a new column "distances".

In [14]:
# Landmark stations are given by vertex id, i.e. by station name.
landmark_stations = ["San Jose City Hall", "St James Park"]
shortestPath = graph.shortestPaths(landmarks=landmark_stations)

In [16]:
# Show each station with its distances to the landmark stations.
landmark_distances = shortestPath.select("id", "distances")
landmark_distances.show()

+--------------------+--------------------+
|                  id|           distances|
+--------------------+--------------------+
|         MLK Library|[St James Park ->...|
|Santa Clara Count...|[St James Park ->...|
|   2nd at South Park|                  []|
|California Ave Ca...|                  []|
|       2nd at Folsom|                  []|
|    Adobe on Almaden|[San Jose City Ha...|
|Mechanics Plaza (...|                  []|
|       Howard at 2nd|                  []|
|          Mezes Park|                  []|
|    Davis at Jackson|                  []|
|Broadway St at Ba...|                  []|
|       Park at Olive|                  []|
|     Beale at Market|                  []|
|Embarcadero at Br...|                  []|
|   Franklin at Maple|                  []|
|Redwood City Calt...|                  []|
|Civic Center BART...|                  []|
|San Antonio Shopp...|                  []|
|       St James Park|[St James Park ->...|
|      Market at 10th|          

# Page Rank

It works by counting the number and quality of links to a vertex to produce a rough estimate of how important that vertex is.

It accepts the following parameters:
    resetProbability - the probability of resetting to a random vertex.
    sourceId - (optional) the source vertex for a personalized PageRank.
    maxIter – If set, the algorithm is run for a fixed number of iterations. This may not be set if the tol parameter is set.
    tol – If set, the algorithm is run until the given tolerance. This may not be set if the maxIter parameter is set.
    
 Returns a GraphFrame with a new vertex column "pagerank" and a new edge column "weight".


In [17]:
# Run PageRank until convergence within the given tolerance (no fixed iteration count).
pageRank = graph.pageRank(
    resetProbability=0.15,
    tol=0.01,
)

In [18]:
# Show the PageRank score per station, then the weight assigned to each trip edge.
ranked_stations = pageRank.vertices.select("id", "pagerank")
weighted_trips = pageRank.edges.select("src", "dst", "weight")
ranked_stations.show()
weighted_trips.show()

+--------------------+------------------+
|                  id|          pagerank|
+--------------------+------------------+
|         MLK Library|0.7773824929442228|
|Santa Clara Count...|0.4980209797772844|
|   2nd at South Park| 0.655950281002495|
|California Ave Ca...|0.9108234167101544|
|       2nd at Folsom|0.6019976457886538|
|    Adobe on Almaden|0.5363219791463285|
|Mechanics Plaza (...|0.7253474780348989|
|       Howard at 2nd|0.8054001544389847|
|          Mezes Park|0.2624675157029054|
|    Davis at Jackson|0.7741963042010733|
|Broadway St at Ba...|0.6542773822104648|
|       Park at Olive|0.7002431931818809|
|     Beale at Market|0.7386610420274133|
|Embarcadero at Br...|0.7764332479171439|
|   Franklin at Maple|0.4924961201086168|
|Redwood City Calt...|1.5344637511069898|
|Civic Center BART...|0.7328209397912441|
|San Antonio Shopp...|0.8947155902287536|
|       St James Park|0.7637060969041818|
|      Market at 10th|1.0527872000946072|
+--------------------+------------

# Saving the Graphs

In [20]:
# Save the PageRank vertices and edges as CSV, one output file each (coalesce(1)),
# into the "vertices" and "edges" directories.
# mode="overwrite" keeps this cell re-runnable: with the default save mode the
# write raises an error as soon as those directories exist from a previous run.
pageRank.vertices.coalesce(1).write.csv("vertices", mode="overwrite")
pageRank.edges.coalesce(1).write.csv("edges", mode="overwrite")

# Label Propagation

LPA (Label Propagation Algorithm) is a standard community detection algorithm for graphs. It runs the static Label Propagation Algorithm to detect communities in networks.

In [21]:
# Run label propagation for 5 iterations; the resulting labels are displayed in the next cell.
lpaResult = graph.labelPropagation(maxIter=5)

In [22]:
# Show each station with the community label it was assigned.
station_labels = lpaResult.select("id", "label")
station_labels.show()

+--------------------+-------------+
|                  id|        label|
+--------------------+-------------+
|         MLK Library| 292057776128|
|Santa Clara Count...|1365799600128|
|   2nd at South Park|1657857376256|
|California Ave Ca...| 429496729600|
|       2nd at Folsom|1657857376256|
|    Adobe on Almaden| 292057776128|
|Mechanics Plaza (...|1657857376256|
|       Howard at 2nd|1657857376256|
|          Mezes Park| 730144440320|
|    Davis at Jackson|1657857376256|
|Broadway St at Ba...|1657857376256|
|       Park at Olive| 429496729600|
|     Beale at Market|1657857376256|
|Embarcadero at Br...|1657857376256|
|   Franklin at Maple| 730144440320|
|Redwood City Calt...|1151051235328|
|Civic Center BART...|1657857376256|
|San Antonio Shopp...| 661424963584|
|       St James Park| 292057776128|
|      Market at 10th|1657857376256|
+--------------------+-------------+
only showing top 20 rows



# Breadth-first search (BFS)

Breadth-first search (BFS) finds the shortest path(s) from one vertex (or a set of vertices) to another vertex (or a set of vertices). The beginning and end vertices are specified as Spark DataFrame expressions

In [23]:
# Search from "St James Park" for stations with dock count < 15.
# NOTE(review): the CSVs are read without schema inference, so "dockcount" is a
# string column and this comparison presumably relies on Spark's implicit cast — confirm.
paths = graph.bfs(
    fromExpr="id = 'St James Park'",
    toExpr="dockcount < 15",
)

In [24]:
# Preview the paths found by the BFS (start vertex, intermediate edges/vertices, end vertex).
paths.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                from|                  e0|                  v1|                  e1|                  to|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|[13, St James Par...|[910027, 209, 8/2...|[11, MLK Library,...|[899253, 1174, 8/...|[4, Santa Clara a...|
|[13, St James Par...|[910027, 209, 8/2...|[11, MLK Library,...|[899252, 1186, 8/...|[4, Santa Clara a...|
|[13, St James Par...|[910027, 209, 8/2...|[11, MLK Library,...|[899251, 1170, 8/...|[4, Santa Clara a...|
|[13, St James Par...|[910027, 209, 8/2...|[11, MLK Library,...|[859900, 624, 7/2...|[4, Santa Clara a...|
|[13, St James Par...|[902538, 268, 8/2...|[11, MLK Library,...|[899253, 1174, 8/...|[4, Santa Clara a...|
|[13, St James Par...|[902538, 268, 8/2...|[11, MLK Library,...|[899252, 1186, 8/...|[4, Santa Clara a...|
|[13, St James Par...|[902538, 268, 8