In [1]:
import os
# Bootstrap PySpark by running Spark's own interactive-shell script inside
# this kernel; per its banner it leaves a ready SparkSession named `spark`.
_sparkHome = os.environ["SPARK_HOME"]
execfile(os.path.join(_sparkHome, 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.12 (default, Nov 19 2016 06:48:10)
SparkSession available as 'spark'.


In [2]:
from pyspark.sql import SparkSession
# Obtain the session used by the rest of the notebook (Hive support enabled).
# The master URL must be written exactly as "local[2]" (two local worker
# threads); a space before the bracket is not a valid Spark master string.
sparkSession = SparkSession.builder \
    .enableHiveSupport() \
    .master("local[2]") \
    .getOrCreate()

## Triangle Count
![](./img/triangleCount.jpg)

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import Row

In [4]:
# Edges of the example graph as (source, destination) vertex-id pairs.
edgeList = [
    (1, 2), (1, 3), (1, 4),
    (2, 3), (2, 5),
    (3, 4), (3, 5), (3, 6), (3, 7),
]

# Distribute the edges as an RDD holding one two-field Row per edge.
edgeRDD = sparkSession.sparkContext.parallelize(edgeList)
graphData = edgeRDD.map(lambda edge: Row(*edge))

In [5]:
# Edge table with columns (A, B): one row per edge A -> B.
# NOTE(review): B is declared StringType although edgeList holds ints --
# confirm the string typing is intentional.
graphSchemaAB = StructType() \
    .add('A', IntegerType(), nullable=False) \
    .add('B', StringType(), nullable=False)

ab = sparkSession.createDataFrame(graphData, schema=graphSchemaAB)
ab.show()

+---+---+
|  A|  B|
+---+---+
|  1|  2|
|  1|  3|
|  1|  4|
|  2|  3|
|  2|  5|
|  3|  4|
|  3|  5|
|  3|  6|
|  3|  7|
+---+---+



In [6]:
# The same edge data again, this time labelled (B, C1) so it can be joined
# to the (A, B) table on the shared column B.
# NOTE(review): C1 is declared StringType although edgeList holds ints --
# confirm the string typing is intentional.
graphSchemaBC1 = StructType() \
    .add('B', IntegerType(), nullable=False) \
    .add('C1', StringType(), nullable=False)

bc1 = sparkSession.createDataFrame(graphData, schema=graphSchemaBC1)

In [7]:
# The same edge data a third time, labelled (A, C2): every edge leaving A.
# NOTE(review): C2 is declared StringType although edgeList holds ints --
# confirm the string typing is intentional.
graphSchemaAC2 = StructType() \
    .add('A', IntegerType(), nullable=False) \
    .add('C2', StringType(), nullable=False)

ac2 = sparkSession.createDataFrame(graphData, schema=graphSchemaAC2)

In [8]:
# Chain two edges through their shared vertex B: every resulting row is a
# two-edge path A -> B -> C1.
abc1 = ab.join(bc1, on='B', how='inner')
abc1.show()

+---+---+---+
|  B|  A| C1|
+---+---+---+
|  3|  1|  4|
|  3|  1|  5|
|  3|  1|  6|
|  3|  1|  7|
|  3|  2|  4|
|  3|  2|  5|
|  3|  2|  6|
|  3|  2|  7|
|  2|  1|  3|
|  2|  1|  5|
+---+---+---+



In [9]:
# Attach to each path A -> B -> C1 every edge A -> C2 that leaves the same
# start vertex A, so C1 and C2 can be compared row by row.
abc1c2 = abc1.join(ac2, on='A', how='inner')
abc1c2.show()

+---+---+---+---+
|  A|  B| C1| C2|
+---+---+---+---+
|  1|  3|  4|  2|
|  1|  3|  4|  3|
|  1|  3|  4|  4|
|  1|  3|  5|  2|
|  1|  3|  5|  3|
|  1|  3|  5|  4|
|  1|  3|  6|  2|
|  1|  3|  6|  3|
|  1|  3|  6|  4|
|  1|  3|  7|  2|
|  1|  3|  7|  3|
|  1|  3|  7|  4|
|  1|  2|  3|  2|
|  1|  2|  3|  3|
|  1|  2|  3|  4|
|  1|  2|  5|  2|
|  1|  2|  5|  3|
|  1|  2|  5|  4|
|  2|  3|  4|  3|
|  2|  3|  4|  5|
+---+---+---+---+
only showing top 20 rows



In [10]:
from pyspark.sql.functions import array, col, explode

# Per-vertex triangle count.
#   1. Keep rows where the path A -> B -> C1 is closed by an edge A -> C2
#      with C2 == C1; each such row is one triangle (A, B, C1).
#   2. Pack the three corners into an array and explode it, giving one row
#      per (triangle, vertex) membership.
#   3. Group by vertex and count the memberships.
# The DataFrame is bound to `vertexTriangle` first and displayed afterwards:
# show() returns None, so ending the chain with it would leave the name
# holding None instead of the result.
vertexTriangle = abc1c2 \
    .filter("C1 = C2") \
    .select(
        array(col('A'), col('B'), col('C1')).alias('triangleVertices')) \
    .select(
        explode('triangleVertices').alias('triangleVertex')) \
    .groupBy('triangleVertex') \
    .count() \
    .orderBy('triangleVertex')

vertexTriangle.show()

+--------------+-----+
|triangleVertex|count|
+--------------+-----+
|             1|    2|
|             2|    2|
|             3|    3|
|             4|    1|
|             5|    1|
+--------------+-----+



## Triangle Count: Method 2 (broadcast edge set + UDF)

In [11]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType

# Broadcast the edges as a set so the UDF can do a membership test on the
# executors without another join.
edgeListBC = sparkSession.sparkContext.broadcast(set(edgeList))


def isInEdgeList(src, dst):
    """Return True if the edge (src, dst) is present in the broadcast edge set.

    Both arguments are converted with int() before the lookup, so vertex ids
    that arrive as numeric strings match the integer pairs in the set.
    """
    return (int(src), int(dst)) in edgeListBC.value


# The function yields a Python bool, so the UDF is registered with
# BooleanType; declaring StringType would make the resulting column a string
# rather than a boolean.
udf_isInEdgeList = udf(isInEdgeList, BooleanType())

In [12]:
# Flag each two-edge path A -> B -> C1 with whether the closing edge
# (A, C1) is present in the edge list, then display the result.
abc1 \
    .withColumn('isTriangle', udf_isInEdgeList('A', 'C1')) \
    .show()

+---+---+---+----------+
|  B|  A| C1|isTriangle|
+---+---+---+----------+
|  3|  1|  4|      true|
|  3|  1|  5|     false|
|  3|  1|  6|     false|
|  3|  1|  7|     false|
|  3|  2|  4|     false|
|  3|  2|  5|      true|
|  3|  2|  6|     false|
|  3|  2|  7|     false|
|  2|  1|  3|      true|
|  2|  1|  5|     false|
+---+---+---+----------+

