In [1]:
import os
# Bootstrap the PySpark shell inside this kernel by executing the shell.py
# script shipped under $SPARK_HOME (raises KeyError if SPARK_HOME is unset).
# The script is read inside a `with` block so the file handle is closed
# deterministically instead of being left for the garbage collector.
with open(os.path.join(os.environ['SPARK_HOME'], 'python/pyspark/shell.py')) as shellScript:
    exec(shellScript.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0.cloudera2
      /_/

Using Python version 3.4.3 (default, Nov 17 2016 01:08:31)
SparkSession available as 'spark'.


In [2]:
# Partition count applied to Spark SQL shuffles (joins / aggregations) for
# the rest of this session; named here so the value is easy to find and tune.
shufflePartitions = 10000
spark.conf.set("spark.sql.shuffle.partitions", shufflePartitions)

In [3]:
# Base location for every input and output of this notebook (relative to the
# kernel's working directory). Other paths are built by plain string
# concatenation, so this value must end with a path separator.
dataPath = "./"
# Location of the raw training graph; loaded below as tab-delimited CSV.
graphPath = dataPath + "trainGraph/"
# Path of the prediction CSV; not read by any of the cells shown here.
usersToPredictPath = dataPath + "prediction.csv"

In [4]:
# One-off conversion: read the raw tab-delimited graph, give the first two
# positional columns meaningful names, and persist the result as a
# 50-partition Parquet dataset that the following cells read from.
(
    spark.read
    .format("csv")
    .option("delimiter", "\t")
    .load(graphPath)
    .withColumnRenamed("_c0", "user")
    .withColumnRenamed("_c1", "friendsString")
    .repartition(50)
    .write
    .mode("overwrite")
    .parquet(dataPath + "trainGraphRepartitioned")
)

In [11]:
from pyspark.sql.functions import abs, col, explode, collect_list, sort_array, size, split, concat_ws, lit
from pyspark.sql.types import *

In [12]:
from pyspark.sql.functions import udf

def cutStartEndBrackets(s):
    """Drop the first two and the last two characters of ``s``.

    Used to peel the outer wrapper off the serialized friends list so the
    remainder can be split on ``),(``. The wrapper is presumably ``{(`` ...
    ``)}`` -- TODO confirm against the raw data.

    Args:
        s: the raw friends string, or None.

    Returns:
        ``s[2:-2]``, or None when ``s`` is None. Spark hands SQL NULL values
        to Python UDFs as None; slicing None would raise TypeError and fail
        the whole job, so NULL is propagated instead.
    """
    if s is None:
        return None
    return s[2:-2]

# Expose cutStartEndBrackets to Spark SQL as a column function that yields strings.
cutStartEndBracketsUDF = udf(f=cutStartEndBrackets, returnType=StringType())

In [13]:
# Load the repartitioned graph written by the conversion cell
# (columns: user, friendsString).
data = spark.read.parquet(dataPath + "trainGraphRepartitioned")

# Flatten the graph to one (user, friend) row per edge:
#   1. strip the outer wrapper of friendsString and split it on "),(" --
#      the pattern is a regex, written as a raw string so the backslashes
#      are not treated as (invalid) Python escape sequences;
#   2. explode the resulting array into one row per friend entry;
#   3. keep only the first comma-separated field of each entry as the friend
#      id (presumably entries look like "friendId,mask" -- TODO confirm);
#   4. cast both ids to integer.
userFriend = (
    data
    .select(
        col("user"),
        split(cutStartEndBracketsUDF(col("friendsString")), r"\),\(").alias("friendsMasks"),
    )
    .withColumn("friendMask", explode('friendsMasks'))
    .withColumn("friend", split(col("friendMask"), ",")[0])
    .select(col("user").cast("integer"), col("friend").cast("integer"))
)

# For every friend, gather all users that list it, and keep only the groups
# with at least two users (a single user cannot form a pair).
# NOTE(review): collect_list keeps duplicates; if a user can list the same
# friend twice, that user appears twice in the group -- verify the input.
usersWithCommonFriend = (
    userFriend
    .groupBy("friend")
    .agg(collect_list("user").alias("usersWithCommonFriend"))
    .where(size(col("usersWithCommonFriend")) >= 2)
    .select(col("usersWithCommonFriend"))
)

In [14]:
from pyspark.sql.functions import udf

def pairsWithCommonFriend(usersWithCommonFriend):
    """Build every ordered pair of entries taken from two different positions.

    Both orientations are produced, so for users ``a`` and ``b`` the result
    contains ``(a, b)`` as well as ``(b, a)``. Positions, not values, are
    compared: a value that occurs twice in the input is paired with itself.

    Args:
        usersWithCommonFriend: list of user ids that share one friend.

    Returns:
        List of ``(user1, user2)`` tuples, ordered by first position and then
        by second position; empty when the input has fewer than two entries.
    """
    return [
        (firstUser, secondUser)
        for firstPos, firstUser in enumerate(usersWithCommonFriend)
        for secondPos, secondUser in enumerate(usersWithCommonFriend)
        if firstPos != secondPos
    ]

# Return type of the pairing UDF: an array of (user1, user2) structs whose
# two integer fields are both declared non-nullable.
schema = ArrayType(
    StructType([
        StructField(fieldName, IntegerType(), False)
        for fieldName in ("user1", "user2")
    ])
)

# Spark column function wrapping pairsWithCommonFriend.
pairsWithCommonFriendUdf = udf(f=pairsWithCommonFriend, returnType=schema)

# Output location of the intermediate per-friend pair lists.
pairsPath = dataPath + "pairs"

# Expand each group of users sharing a friend into its list of user pairs,
# drop groups that produced no pairs, and persist the lists as Parquet.
# NOTE(review): the name below is bound to whatever .parquet() returns
# (presumably None), not to a DataFrame of counts -- the assignment is kept
# only so the set of notebook variables stays unchanged.
commonFriendsCounts = (
    usersWithCommonFriend
    .select(pairsWithCommonFriendUdf("usersWithCommonFriend").alias("pairsWithCommonFriend"))
    .where(size(col("pairsWithCommonFriend")) > 0)
    .write
    .mode("overwrite")
    .parquet(pairsPath)
)

In [19]:
# Count, for every (user1, user2) pair, how many per-friend lists it occurs
# in -- i.e. the number of friends the two users have in common -- and store
# the result as Parquet (columns: pairWithCommonFriend, count).
# mode("overwrite") matches the other writes in this notebook; without it the
# cell fails on any re-run because the output path already exists.
(
    spark.read.parquet(pairsPath)
    .withColumn("pairWithCommonFriend", explode("pairsWithCommonFriend"))
    .drop(col("pairsWithCommonFriend"))
    .groupBy(col("pairWithCommonFriend"))
    .count()
    .write
    .mode("overwrite")
    .parquet(dataPath + "pairsCount")
)
