In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, row_number, abs, count
from pyspark.sql import Window, types

# One Spark session in local mode with Hive support enabled; getOrCreate()
# hands back an already-running session instead of starting a second one.
spark_session = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

# Parquet dataset that the analysis cell reads.
graph_path = "/data/sample264"

In [None]:
def norm(df, key1, key2, field, n):
    """Keep the top-``n`` rows per ``key1`` and normalise ``field`` within each group.

    For every distinct value of ``key1`` the ``n`` rows with the largest
    ``field`` are retained.  ``field`` is then divided by the sum of ``field``
    over the retained rows of the same ``key1`` group.

    Args:
        df: Spark DataFrame containing the columns ``key1``, ``key2`` and
            ``field``.
        key1: Name of the column that defines the groups.
        key2: Name of the second key column.  Used only as a tie-breaker when
            several rows of a group share the same ``field`` value, so that
            the top-``n`` selection is reproducible between runs.
        field: Name of the numeric column to rank by and to normalise.
        n: Maximum number of rows kept for each ``key1`` value.

    Returns:
        A cached DataFrame holding the retained rows of ``df`` plus two extra
        columns: ``'sum_' + field`` (group total over the retained rows) and
        ``'norm_' + field`` (``field`` divided by that total).
    """
    # Rank rows inside each key1 group, largest `field` first.  Ordering by
    # `field` alone leaves ties at the cut-off to be resolved arbitrarily, so
    # key2 is added as a deterministic secondary sort key.
    window = Window.partitionBy(key1).orderBy(col(field).desc(), col(key2))

    tops_df = df.withColumn('row_number', row_number().over(window))\
        .filter(col('row_number') <= n)\
        .drop('row_number')

    # NOTE: `sum` is pyspark.sql.functions.sum (file-level import), not the builtin.
    # groupBy() already emits the grouping column, so it is not repeated inside
    # agg(); repeating it would give the aggregate two `key1` columns.
    totals_df = tops_df.groupBy(col(key1)).agg(sum(col(field)).alias('sum_' + field))

    # Joining on the column *name* keeps a single key1 column in the result.
    normalized_df = tops_df.join(totals_df, key1, 'inner')\
        .withColumn('norm_' + field, col(field) / col('sum_' + field))\
        .cache()

    return normalized_df

In [None]:
# Two plays by the same user are paired when their timestamps differ by less
# than this amount.  Same units as the `timestamp` column — presumably seconds
# (420 s = 7 min); TODO confirm against the dataset description.
MAX_TIMESTAMP_GAP = 420
# Used both as the per-track top-n passed to norm() and as the number of
# pairs printed.
TOP_N = 40

graph = spark_session.read.parquet(graph_path)
# Second, independent read of the same data so the two sides of the self-join
# carry distinct column references (graph.userId vs copy.userId).
copy = spark_session.read.parquet(graph_path)

# Count, for every ordered pair of different tracks, how often one user played
# both within MAX_TIMESTAMP_GAP of each other.
unnorm = (
    graph
    .join(copy, graph.userId == copy.userId, 'inner')
    .where(graph.trackId != copy.trackId)
    .where(abs(graph.timestamp - copy.timestamp) < MAX_TIMESTAMP_GAP)
    .select(graph.trackId.alias('track'), copy.trackId.alias('similar'))
    .groupBy(col('track'), col('similar'))
    .count()
)

# The result gets its own name: assigning it back to `norm` would overwrite the
# norm() function and make this cell fail on a second run.
# Sort and limit while `norm_count` is still a column, then project down to
# the two columns that are printed.
top_pairs = (
    norm(unnorm, 'track', 'similar', 'count', TOP_N)
    .orderBy(col('norm_count').desc(), col('track'), col('similar'))
    .limit(TOP_N)
    .select(col('track'), col('similar'))
)

result = top_pairs.collect()

for track, similar in result:
    print('%s\t%s' % (track, similar))

798256 923706
798319 837992
798322 876562
798331 827364
798335 840741
798374 816874
798375 810685
798379 812055
798380 840113
798396 817687
798398 926302
798405 867217
798443 905923
798457 918918
798460 891840
798461 940379
798470 840814
798474 963162
798477 883244
798485 955521
798505 905671
798550 936295
798626 845438
798691 818279
798692 898823
798702 811440
798704 937570
798725 933147
798738 894170
798745 799665
798782 956938
798801 950802
798820 890393
798833 916319
798865 962662
798931 893574
798946 946408
799012 809997
799024 935246
799047 905199
