In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import collect_list, udf, col, max, size
from pyspark.ml.feature import CountVectorizer, MinHashLSH

In [5]:
def main(spark,
         ratings_path='/home/hl5679/capstone-project-cap-19/ml-latest/ratings.csv',
         output_path='top100pairs_all',
         min_ratings=5,
         max_distance=0.6,
         top_n=100,
         seed=42,
         write_mode='errorifexists'):
    """Find the most similar user pairs by rated-movie overlap using MinHash LSH.

    Pipeline:
      1. Read the ratings CSV and collect each user's movieIds into a list.
      2. Keep users with at least ``min_ratings`` rated movies.
      3. Turn the movieId lists into sparse vectors with CountVectorizer.
      4. Fit MinHashLSH and run an approximate self-similarity join.
      5. Keep each unordered pair once, take the ``top_n`` closest pairs and
         write them to ``output_path`` as Parquet with columns
         ``userIdA``, ``userIdB`` and ``JaccardDistance``.

    Args:
        spark: Active SparkSession used to read the input.
        ratings_path: Location of the ratings CSV
            (columns: userId, movieId, rating, timestamp).
        output_path: Destination directory for the Parquet output.
        min_ratings: Minimum number of rated movies a user needs to be kept.
        max_distance: Jaccard-distance threshold passed to the similarity join.
        top_n: Number of closest pairs to keep.
        seed: Seed for the MinHash hash functions, so repeated runs use the
            same hash tables.
        write_mode: Spark save mode for the output. The default,
            'errorifexists', fails when ``output_path`` already exists;
            pass 'overwrite' to replace a previous run's output.

    Returns:
        None. The result is written to ``output_path``.
    """
    # 1. Preprocessing data
    # header=True skips the column-name line at the top of the file
    # (MovieLens ratings.csv is expected to start with one); without it that
    # line is parsed as a data record with a null userId.
    ratings_df = spark.read.csv(
        ratings_path,
        schema='userId INT, movieId STRING, rating FLOAT, timestamp BIGINT',
        header=True,
    )

    # One row per user holding every movieId they rated, restricted to users
    # with enough ratings. Cached because it is scanned twice: once to fit the
    # vocabulary and once when the final write triggers the transform.
    user_movies_df = (
        ratings_df
        .groupBy("userId")
        .agg(collect_list("movieId").alias("movieIds"))
        .filter(size("movieIds") >= min_ratings)
        .cache()
    )

    try:
        # Vectorize movieIds. Fit on the same rows that get transformed so the
        # vocabulary only contains movies that actually occur in the features.
        cv = CountVectorizer(inputCol='movieIds', outputCol='features')
        cv_model = cv.fit(user_movies_df)
        features_df = cv_model.transform(user_movies_df)

        # 2. Applying MinHash
        mh = MinHashLSH(inputCol="features", outputCol="hashes",
                        numHashTables=5, seed=seed)
        lsh_model = mh.fit(features_df)

        print("Transformed Data\n")
        hashed_df = lsh_model.transform(features_df)
        similar_pairs = lsh_model.approxSimilarityJoin(
            hashed_df, hashed_df, max_distance, distCol="JaccardDistance"
        )

        print(f"{top_n} similarity pairs\n")
        print("Simplified df\n")
        top_pairs = (
            similar_pairs
            .select(
                col("datasetA.userId").alias("userIdA"),
                col("datasetB.userId").alias("userIdB"),
                "JaccardDistance",
            )
            # The self-join yields (u, u) and both (u, v) and (v, u); keeping
            # userIdA < userIdB retains each unordered pair exactly once.
            .filter(col("userIdA") < col("userIdB"))
            # Secondary sort keys make the choice among equal distances
            # deterministic instead of depending on partition order.
            .orderBy("JaccardDistance", "userIdA", "userIdB")
            .limit(top_n)
        )

        print("Write simplified df to Parquet\n")
        # The write is the action that actually executes the whole pipeline.
        top_pairs.write.mode(write_mode).parquet(output_path)
    finally:
        # Release the cached blocks once the job is done (or has failed).
        user_movies_df.unpersist()

In [6]:
# Script entry point: runs only when this code is executed as __main__.
if __name__ == "__main__":

    # Build (or reuse) a session with default settings.
    # An earlier, currently inactive experiment additionally configured
    # spark.executor.memory / spark.driver.memory = 16g, the Kryo serializer
    # and -XX:+UseG1GC for the driver and executor JVMs.
    session_builder = SparkSession.builder.appName('minHash')
    spark = session_builder.getOrCreate()

    # Run the MinHash pipeline with that session
    main(spark)

24/05/10 17:17:47 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_7_163 in memory.
24/05/10 17:17:47 WARN MemoryStore: Not enough space to cache rdd_7_163 in memory! (computed 384.0 B so far)
24/05/10 17:17:47 WARN BlockManager: Persisting block rdd_7_163 to disk instead.
24/05/10 17:17:47 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_7_165 in memory.
24/05/10 17:17:47 WARN MemoryStore: Not enough space to cache rdd_7_165 in memory! (computed 384.0 B so far)
24/05/10 17:17:47 WARN BlockManager: Persisting block rdd_7_165 to disk instead.
24/05/10 17:17:48 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_7_167 in memory.
24/05/10 17:17:48 WARN MemoryStore: Not enough space to cache rdd_7_167 in memory! (computed 384.0 B so far)
24/05/10 17:17:48 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing bl

Transformed Data

100 similarity pairs

Simplified df

Write simplified df to Parquet



24/05/10 17:18:00 WARN MemoryStore: Not enough space to cache rdd_7_10 in memory! (computed 2.7 MiB so far)
24/05/10 17:18:00 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_7_11 in memory.
24/05/10 17:18:00 WARN MemoryStore: Not enough space to cache rdd_7_11 in memory! (computed 384.0 B so far)
24/05/10 17:18:00 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_7_12 in memory.
24/05/10 17:18:00 WARN MemoryStore: Not enough space to cache rdd_7_12 in memory! (computed 384.0 B so far)
24/05/10 17:18:01 WARN MemoryStore: Not enough space to cache rdd_7_13 in memory! (computed 2.5 MiB so far)
24/05/10 17:18:01 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_7_14 in memory.
24/05/10 17:18:01 WARN MemoryStore: Not enough space to cache rdd_7_14 in memory! (computed 384.0 B so far)
24/05/10 17:18:01 WARN MemoryStore: Failed to reserve initial

24/05/10 17:18:55 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_7_15 in memory.
24/05/10 17:18:55 WARN MemoryStore: Not enough space to cache rdd_7_15 in memory! (computed 384.0 B so far)
24/05/10 17:18:55 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_7_16 in memory.
24/05/10 17:18:55 WARN MemoryStore: Not enough space to cache rdd_7_16 in memory! (computed 384.0 B so far)
24/05/10 17:18:55 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_7_17 in memory.
24/05/10 17:18:55 WARN MemoryStore: Not enough space to cache rdd_7_17 in memory! (computed 384.0 B so far)
24/05/10 17:18:55 WARN MemoryStore: Not enough space to cache rdd_7_18 in memory! (computed 2.9 MiB so far)
24/05/10 17:18:56 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_7_19 in memory.
24/05/10 17:18:56 WARN MemoryStore: 

In [None]:
# Reference only: a pasted 20-row table kept as a bare string literal (no code runs here).
# Its columns (datasetA, datasetB, JaccardDistance) match the output of the
# approxSimilarityJoin call in main() — presumably captured from a
# similar_pairs.show() during development; TODO confirm which run produced it.
# Rows where both sides carry the same userId have distance 0.0 (a user joined
# with itself); main() keeps only pairs whose first userId is smaller than the second.
'''
+--------------------+--------------------+------------------+
|            datasetA|            datasetB|   JaccardDistance|
+--------------------+--------------------+------------------+
|{159361, [50, 318...|{6820, [318, 4995...|0.5555555555555556|
|{112094, [1196, 1...|{112094, [1196, 1...|               0.0|
|{254602, [48, 150...|{254602, [48, 150...|               0.0|
|{257335, [260, 31...|{257335, [260, 31...|               0.0|
|{288760, [1, 47, ...|{288760, [1, 47, ...|               0.0|
|{214820, [2028, 2...|{214820, [2028, 2...|               0.0|
|{77749, [223, 296...|{77749, [223, 296...|               0.0|
|{26650, [318, 527...|{85346, [260, 527...|0.5151515151515151|
|{131756, [3578, 3...|{214189, [260, 52...|0.5227272727272727|
|{185669, [318, 35...|{185669, [318, 35...|               0.0|
|{278829, [10, 18,...|{278829, [10, 18,...|               0.0|
|{220550, [260, 29...|{220550, [260, 29...|               0.0|
|{137155, [19, 165...|{137155, [19, 165...|               0.0|
|{93424, [260, 356...|{93424, [260, 356...|               0.0|
|{131505, [260, 29...|{131505, [260, 29...|               0.0|
|{271048, [6, 50, ...|{271048, [6, 50, ...|               0.0|
|{200191, [47, 185...|{200191, [47, 185...|               0.0|
|{10131, [44, 70, ...|{10131, [44, 70, ...|               0.0|
|{276465, [277, 34...|{276465, [277, 34...|               0.0|
|{98347, [2302], (...|{319979, [1639, 2...|               0.5|
+--------------------+--------------------+------------------+
only showing top 20 rows
'''

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session for the export step
spark = SparkSession.builder \
    .appName("Read and Process Parquet Files") \
    .getOrCreate()

try:
    # Load the similar-user pairs previously written as Parquet
    df = spark.read.parquet("top100pairs_all")

    # Row order is not guaranteed when the files are read back, so sort by
    # distance again before taking the 100 closest pairs
    result_df = df.orderBy("JaccardDistance", ascending=True).limit(100)

    # Write the result as CSV with a header row (nothing is displayed here)
    result_df.write.csv('q1_results', header=True)
finally:
    # Always stop the session, even if the read or write above fails
    spark.stop()


24/05/10 19:26:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
