## Q1

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import collect_list, udf, col, max, size
from pyspark.ml.feature import CountVectorizer, MinHashLSH

In [None]:
def main(spark):
    """Find the 100 closest user pairs by Jaccard distance using MinHash LSH.

    Pipeline:
      1. Load the ratings CSV and collect the movie ids of every user.
      2. Keep users with at least 5 collected movie ids and turn each user's
         list into a count vector.
      3. Fit a MinHashLSH model (5 hash tables) and run an approximate
         self-similarity join with a Jaccard-distance threshold of 0.6.
      4. Keep each pair once (userId of A < userId of B, which also removes
         self-pairs), take the 100 smallest distances and write
         (userIdA, userIdB, JaccardDistance) to the Parquet directory
         'top100pairs_all'.

    Args:
        spark: active SparkSession used for reading and processing.

    Returns:
        None. Progress messages are printed and the result is written to disk.
    """
    # ---- 1. Preprocessing ----
    ratings_path = '/home/hl5679/capstone-project-cap-19/ml-latest/ratings.csv'
    ratings_schema = 'userId INT, movieId STRING, rating FLOAT, timestamp BIGINT'
    raw_ratings = (
        spark.read
        .csv(ratings_path, schema=ratings_schema)
        .repartition(256, "timestamp")
    )

    # One row per user holding the list of all movie ids that user rated
    movies_per_user = (
        raw_ratings
        .groupBy("userId")
        .agg(collect_list("movieId").alias("movieIds"))
        .cache()
    )
    active_users = movies_per_user.filter(size("movieIds") >= 5)

    # Vectorize movieIds
    vectorizer = CountVectorizer(inputCol='movieIds', outputCol='features')
    user_vectors = vectorizer.fit(active_users).transform(active_users)

    # ---- 2. MinHash ----
    lsh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
    lsh_model = lsh.fit(user_vectors)

    print("Transformed Data\n")
    hashed_users = lsh_model.transform(user_vectors)
    candidate_pairs = lsh_model.approxSimilarityJoin(
        hashed_users, hashed_users, 0.6, distCol="JaccardDistance"
    )

    print("100 similarity pairs\n")
    closest_pairs = (
        candidate_pairs
        .filter("datasetA.userId < datasetB.userId")
        .orderBy("JaccardDistance", ascending=True)
        .limit(100)
    )

    print("Simplified df\n")
    user_a = col("datasetA.userId").alias("userIdA")
    user_b = col("datasetB.userId").alias("userIdB")
    pair_summary = closest_pairs.select(user_a, user_b, "JaccardDistance")

    print("Write simplified df to Parquet\n")
    # Persist only the flat (userIdA, userIdB, JaccardDistance) columns
    pair_summary.write.parquet('top100pairs_all')

In [None]:
# Entry point: runs only when this code is executed as the main program
if __name__ == "__main__":

    # Build (or fetch the existing) Spark session for the MinHash job:
    # 16g executor/driver memory, Kryo serialization and the G1 collector.
    spark = (
        SparkSession.builder
        .appName("minHash")
        .config("spark.executor.memory", "16g")
        .config("spark.driver.memory", "16g")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
        .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
        .getOrCreate()
    )

    # Run the pipeline defined in main()
    main(spark)

In [3]:
'''convert parquet to csv and store result'''
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("Read and Process Parquet Files") \
    .getOrCreate()

try:
    # Read the Parquet directory produced by the Q1 job
    df = spark.read.parquet("top100pairs_all")

    # Sort by the two user ids and keep at most 100 records
    result_df = df.orderBy("userIdA", "userIdB", ascending=True).limit(100)

    # Write the result as CSV (with a header line) into the 'q1_results' directory
    result_df.write.csv('q1_results', header=True)
finally:
    # Always release the session, even when the read or the write fails
    spark.stop()


### Q1 test output

In [None]:
'''
+--------------------+--------------------+------------------+
|            datasetA|            datasetB|   JaccardDistance|
+--------------------+--------------------+------------------+
|{159361, [50, 318...|{6820, [318, 4995...|0.5555555555555556|
|{112094, [1196, 1...|{112094, [1196, 1...|               0.0|
|{254602, [48, 150...|{254602, [48, 150...|               0.0|
|{257335, [260, 31...|{257335, [260, 31...|               0.0|
|{288760, [1, 47, ...|{288760, [1, 47, ...|               0.0|
|{214820, [2028, 2...|{214820, [2028, 2...|               0.0|
|{77749, [223, 296...|{77749, [223, 296...|               0.0|
|{26650, [318, 527...|{85346, [260, 527...|0.5151515151515151|
|{131756, [3578, 3...|{214189, [260, 52...|0.5227272727272727|
|{185669, [318, 35...|{185669, [318, 35...|               0.0|
|{278829, [10, 18,...|{278829, [10, 18,...|               0.0|
|{220550, [260, 29...|{220550, [260, 29...|               0.0|
|{137155, [19, 165...|{137155, [19, 165...|               0.0|
|{93424, [260, 356...|{93424, [260, 356...|               0.0|
|{131505, [260, 29...|{131505, [260, 29...|               0.0|
|{271048, [6, 50, ...|{271048, [6, 50, ...|               0.0|
|{200191, [47, 185...|{200191, [47, 185...|               0.0|
|{10131, [44, 70, ...|{10131, [44, 70, ...|               0.0|
|{276465, [277, 34...|{276465, [277, 34...|               0.0|
|{98347, [2302], (...|{319979, [1639, 2...|               0.5|
+--------------------+--------------------+------------------+
only showing top 20 rows
'''

## Q2

### Q2 test output (small)

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import DenseMatrix, Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import col, concat_ws

import numpy as np

In [73]:
'''small top 100 correlation'''

def main(spark):
    """Print the Pearson rating correlation of each similar-user pair (small dataset).

    Reads the user pairs from ``small.csv`` and the ratings from
    ``ml-latest-small/ratings.csv``. For every pair (userIdA, userIdB) the
    ratings the two users gave to the same movie are paired up, and one Pearson
    correlation is computed per pair. The per-pair dictionary and the average
    over all non-NaN correlations are printed.

    Args:
        spark: active SparkSession used for all reads and computations.

    Returns:
        None. Results are printed.
    """
    # 1. Read & transform the data
    similarity_df = spark.read.csv(
        '/home/hl5679/capstone-project-cap-19/small_top100_csv/small.csv',
        schema='userIdA INT, userIdB INT, JaccardDistance FLOAT')
    # Rows whose ids do not parse as INT (e.g. a header line, if the file has
    # one) come back as nulls; they can never match a user, so drop them here
    # instead of carrying them into the per-pair loop below.
    similarity_df = similarity_df.drop('JaccardDistance').dropna()

    # Every user id that appears on either side of a pair (column name: userIdA)
    sim_user_df = (similarity_df.select("userIdA")
                   .union(similarity_df.select("userIdB"))
                   .distinct())

    ratings_df = spark.read.csv(
        '/home/hl5679/capstone-project-cap-19/ml-latest-small/ratings.csv',
        schema='userId INT, movieId STRING, rating FLOAT, timestamp BIGINT')
    ratings_df = ratings_df.drop('timestamp')

    # Ratings restricted to the users that occur in at least one pair
    sim_filtered_ratings_df = (
        ratings_df
        .join(sim_user_df, ratings_df.userId == sim_user_df.userIdA, "inner")
        .drop('userIdA')
        .cache()
    )

    # One row per (pair, movie rated by both users) with the two ratings
    common_ratings_df = (
        sim_filtered_ratings_df.alias("r1")
        .join(sim_filtered_ratings_df.alias("r2"),
              (col("r1.movieId") == col("r2.movieId")) &
              (col("r1.userId") != col("r2.userId")))
        .join(similarity_df,
              (col("r1.userId") == col("userIdA")) & (col("r2.userId") == col("userIdB")))
        .select(col("userIdA"), col("userIdB"),
                col("r1.rating").alias("rating1"), col("r2.rating").alias("rating2"))
    )

    # Assemble ratings into vectors for correlation calculation
    vector_assembler = VectorAssembler(inputCols=["rating1", "rating2"], outputCol="features")
    # Cached because the loop below filters this frame once per pair
    vector_ratings_df = (
        vector_assembler.transform(common_ratings_df)
        .select("userIdA", "userIdB", "features")
        .cache()
    )

    # 2. Calculate one correlation per pair
    correlations = {}
    for row in similarity_df.collect():
        userA, userB = row.userIdA, row.userIdB
        pair_data = vector_ratings_df.filter((col("userIdA") == userA) & (col("userIdB") == userB))
        # Correlation.corr needs at least two rows; skip pairs with fewer
        # than two commonly rated movies instead of failing on them.
        if len(pair_data.head(2)) < 2:
            continue
        corr_matrix = Correlation.corr(pair_data, "features", "pearson").head()[0]
        if corr_matrix is not None:
            # Off-diagonal element = correlation between rating1 and rating2
            correlations[(userA, userB)] = corr_matrix[0, 1]

    print(correlations)
    # NaN appears when one user's common ratings are all identical; leave
    # those pairs out so that a single NaN does not turn the average into NaN.
    valid_corrs = [corr for corr in correlations.values() if not np.isnan(corr)]
    average_corr = np.mean(valid_corrs) if valid_corrs else float("nan")
    print(f"Average Pearson Correlation: {average_corr}")

In [75]:
# Entry point: runs only when this code is executed as the main program
if __name__ == "__main__":

    # Build (or fetch the existing) Spark session for the correlation job
    spark = (
        SparkSession.builder
        .appName("correlation")
        .config("spark.executor.memory", "16g")
        .config("spark.driver.memory", "16g")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.sql.broadcastTimeout", "7200")
        .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
        .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
        .getOrCreate()
    )

    # Run the routine defined in main()
    main(spark)

24/05/11 10:53:59 WARN CacheManager: Asked to cache already cached data.


+-------+-------+---------+
|userIdA|userIdB| features|
+-------+-------+---------+
|      8|    446|[4.0,3.0]|
|      8|    347|[4.0,3.0]|
|      8|     94|[4.0,4.0]|
|      8|    446|[2.0,3.0]|
|      8|    347|[2.0,4.0]|
|      8|     94|[2.0,3.0]|
|      8|    446|[4.0,4.0]|
|      8|    347|[4.0,4.0]|
|      8|     94|[4.0,3.0]|
|      8|    446|[4.0,3.0]|
|      8|    379|[4.0,2.0]|
|      8|     94|[4.0,3.0]|
|      8|    446|[3.0,4.0]|
|      8|    347|[3.0,3.0]|
|      8|     94|[3.0,5.0]|
|      8|    446|[5.0,5.0]|
|      8|    379|[5.0,5.0]|
|      8|    347|[5.0,4.0]|
|      8|     94|[5.0,4.0]|
|      8|    446|[3.0,3.0]|
+-------+-------+---------+
only showing top 20 rows

{(130, 145): -0.2966896119691211, (130, 574): 0.18051650034429462, (130, 468): 0.02099802627829038, (126, 379): 0.22243612372734356, (242, 468): 0.15144775094307247, (150, 270): 0.3138439323974279, (126, 130): -0.05551876851617387, (94, 347): -0.04621630369226584, (81, 126): 0.11490421517543613, (46, 

### Q2 - ordered top 100 correlation

In [8]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import DenseMatrix, Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import col, concat_ws, rand

import numpy as np

'''all ordered top 100 correlation'''

def main(spark):
    """Print the Pearson rating correlation of each pair in the ordered top-100 file.

    Reads the user pairs from ``q1_allJD0_ordered100.csv`` and the ratings from
    ``ml-latest/ratings.csv``. For every pair (userIdA, userIdB) the ratings
    the two users gave to the same movie are paired up, and one Pearson
    correlation is computed per pair. The per-pair dictionary and the average
    over all non-NaN correlations are printed.

    Args:
        spark: active SparkSession used for all reads and computations.

    Returns:
        None. Results are printed.
    """
    # 1. Read & transform the data
    similarity_df = spark.read.csv(
        '/home/hl5679/capstone-project-cap-19/q1_all_atleast5movies_allJD0/q1_allJD0_ordered100.csv',
        schema='userIdA INT, userIdB INT, JaccardDistance FLOAT')
    # Rows whose ids do not parse as INT (e.g. a header line, if the file has
    # one) come back as nulls; they can never match a user, so drop them here
    # instead of carrying them into the per-pair loop below.
    similarity_df = similarity_df.drop('JaccardDistance').dropna()

    # Every user id that appears on either side of a pair (column name: userIdA)
    sim_user_df = (similarity_df.select("userIdA")
                   .union(similarity_df.select("userIdB"))
                   .distinct())

    ratings_df = spark.read.csv(
        '/home/hl5679/capstone-project-cap-19/ml-latest/ratings.csv',
        schema='userId INT, movieId STRING, rating FLOAT, timestamp BIGINT')
    ratings_df = ratings_df.drop('timestamp')

    # Ratings restricted to the users that occur in at least one pair
    sim_filtered_ratings_df = (
        ratings_df
        .join(sim_user_df, ratings_df.userId == sim_user_df.userIdA, "inner")
        .drop('userIdA')
        .cache()
    )

    # One row per (pair, movie rated by both users) with the two ratings
    sim_common_ratings_df = (
        sim_filtered_ratings_df.alias("r1")
        .join(sim_filtered_ratings_df.alias("r2"),
              (col("r1.movieId") == col("r2.movieId")) &
              (col("r1.userId") != col("r2.userId")))
        .join(similarity_df,
              (col("r1.userId") == col("userIdA")) & (col("r2.userId") == col("userIdB")))
        .select(col("userIdA"), col("userIdB"),
                col("r1.rating").alias("rating1"), col("r2.rating").alias("rating2"))
    )

    # Assemble ratings into vectors for correlation calculation
    vector_assembler = VectorAssembler(inputCols=["rating1", "rating2"], outputCol="features")
    # Cached because the loop below filters this frame once per pair
    sim_vector_ratings_df = (
        vector_assembler.transform(sim_common_ratings_df)
        .select("userIdA", "userIdB", "features")
        .cache()
    )

    # 2. Calculate one correlation per pair
    sim_correlations = {}
    for row in similarity_df.collect():
        userA, userB = row.userIdA, row.userIdB
        pair_data = sim_vector_ratings_df.filter((col("userIdA") == userA) & (col("userIdB") == userB))
        # Correlation.corr needs at least two rows; skip pairs with fewer
        # than two commonly rated movies instead of failing on them.
        if len(pair_data.head(2)) < 2:
            continue
        corr_matrix = Correlation.corr(pair_data, "features", "pearson").head()[0]
        if corr_matrix is not None:
            # Off-diagonal element = correlation between rating1 and rating2
            sim_correlations[(userA, userB)] = corr_matrix[0, 1]

    print(sim_correlations)
    # NaN appears when one user's common ratings are all identical; such pairs
    # are excluded from the average.
    sim_valid_corrs = [corr for corr in sim_correlations.values() if not np.isnan(corr)]
    # Avoid averaging an empty list (numpy would warn and return NaN anyway)
    sim_average_corr = np.mean(sim_valid_corrs) if sim_valid_corrs else float("nan")
    print(f"top100 Average Pearson Correlation: {sim_average_corr}")

In [9]:
# Entry point: runs only when this code is executed as the main program
if __name__ == "__main__":

    # Build (or fetch the existing) Spark session for the correlation job
    spark = (
        SparkSession.builder
        .appName("correlation all similar")
        .config("spark.executor.memory", "16g")
        .config("spark.driver.memory", "16g")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.sql.broadcastTimeout", "7200")
        .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
        .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
        .getOrCreate()
    )

    # Run the routine defined in main()
    main(spark)

24/05/14 07:55:28 WARN CacheManager: Asked to cache already cached data.
24/05/14 07:55:42 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/14 07:55:57 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/14 07:55:57 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/14 07:55:57 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/14 07:55:58 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/14 07:55:58 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/14 07:55:59 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/14 07:55:59 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/14 07:56:00 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/14 07:56:00 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/14 07:56:01

{(32, 35883): 0.14976057446828847, (32, 55181): 0.5126322784957922, (32, 71690): 0.04670993664969137, (32, 84047): -0.3855907133328033, (32, 130852): 0.1292785010551571, (32, 165473): 0.2792592697663689, (32, 290077): 0.2140935739950792, (47, 8182): 0.30523384783367985, (47, 19126): -0.19462473604038072, (47, 30403): -0.10910894511799621, (47, 38669): -1.1944098013747937e-17, (47, 47033): 7.166458808248763e-17, (47, 77618): 0.8451542547285166, (47, 81753): -0.5436067275445352, (47, 82924): 0.16529490122682158, (47, 164361): -0.256917497769354, (47, 199185): 0.38729833462074165, (47, 215796): 0.3651483716701107, (47, 221493): -0.37096452512212347, (47, 223838): 0.7905694150420947, (47, 237656): 0.46625240412015695, (47, 253373): -3.670284103903167e-17, (47, 256684): 0.4637388957601683, (47, 260423): -0.8460990892938031, (47, 305650): -3.288196813495083e-17, (47, 330557): -0.5288119308086269, (158, 107199): 0.08722404533150648, (158, 129659): -0.015653689050413578, (158, 148103): 0.79372

### Q2 - not ordered top 100 correlation

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import DenseMatrix, Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import col, concat_ws, rand

import numpy as np



def main(spark):
    """Print the Pearson rating correlation of each pair in the top-100 file.

    Reads the user pairs from ``q1_results.csv`` and the ratings from
    ``ml-latest/ratings.csv``. For every pair (userIdA, userIdB) the ratings
    the two users gave to the same movie are paired up, and one Pearson
    correlation is computed per pair. The per-pair dictionary and the average
    over all non-NaN correlations are printed.

    Args:
        spark: active SparkSession used for all reads and computations.

    Returns:
        None. Results are printed.
    """
    # 1. Read & transform the data
    similarity_df = spark.read.csv(
        '/home/hl5679/capstone-project-cap-19/q1_all_atleast5movies_top100/q1_results.csv',
        schema='userIdA INT, userIdB INT, JaccardDistance FLOAT')
    # Rows whose ids do not parse as INT (e.g. a header line, if the file has
    # one) come back as nulls; they can never match a user, so drop them here
    # instead of carrying them into the per-pair loop below.
    similarity_df = similarity_df.drop('JaccardDistance').dropna()

    # Every user id that appears on either side of a pair (column name: userIdA)
    sim_user_df = (similarity_df.select("userIdA")
                   .union(similarity_df.select("userIdB"))
                   .distinct())

    ratings_df = spark.read.csv(
        '/home/hl5679/capstone-project-cap-19/ml-latest/ratings.csv',
        schema='userId INT, movieId STRING, rating FLOAT, timestamp BIGINT')
    ratings_df = ratings_df.drop('timestamp')

    # Ratings restricted to the users that occur in at least one pair
    sim_filtered_ratings_df = (
        ratings_df
        .join(sim_user_df, ratings_df.userId == sim_user_df.userIdA, "inner")
        .drop('userIdA')
        .cache()
    )

    # One row per (pair, movie rated by both users) with the two ratings
    sim_common_ratings_df = (
        sim_filtered_ratings_df.alias("r1")
        .join(sim_filtered_ratings_df.alias("r2"),
              (col("r1.movieId") == col("r2.movieId")) &
              (col("r1.userId") != col("r2.userId")))
        .join(similarity_df,
              (col("r1.userId") == col("userIdA")) & (col("r2.userId") == col("userIdB")))
        .select(col("userIdA"), col("userIdB"),
                col("r1.rating").alias("rating1"), col("r2.rating").alias("rating2"))
    )

    # Assemble ratings into vectors for correlation calculation
    vector_assembler = VectorAssembler(inputCols=["rating1", "rating2"], outputCol="features")
    # Cached because the loop below filters this frame once per pair
    sim_vector_ratings_df = (
        vector_assembler.transform(sim_common_ratings_df)
        .select("userIdA", "userIdB", "features")
        .cache()
    )

    # 2. Calculate one correlation per pair
    sim_correlations = {}
    for row in similarity_df.collect():
        userA, userB = row.userIdA, row.userIdB
        pair_data = sim_vector_ratings_df.filter((col("userIdA") == userA) & (col("userIdB") == userB))
        # Correlation.corr needs at least two rows; skip pairs with fewer
        # than two commonly rated movies instead of failing on them.
        if len(pair_data.head(2)) < 2:
            continue
        corr_matrix = Correlation.corr(pair_data, "features", "pearson").head()[0]
        if corr_matrix is not None:
            # Off-diagonal element = correlation between rating1 and rating2
            sim_correlations[(userA, userB)] = corr_matrix[0, 1]

    # The dictionary built above is sim_correlations (there is no `correlations` here)
    print(sim_correlations)
    # NaN appears when one user's common ratings are all identical; such pairs
    # are excluded from the average.
    sim_valid_corrs = [corr for corr in sim_correlations.values() if not np.isnan(corr)]
    # Avoid averaging an empty list (numpy would warn and return NaN anyway)
    sim_average_corr = np.mean(sim_valid_corrs) if sim_valid_corrs else float("nan")
    print(f"top100 Average Pearson Correlation: {sim_average_corr}")

In [5]:
# Entry point: runs only when this code is executed as the main program
if __name__ == "__main__":

    # Build (or fetch the existing) Spark session for the correlation job
    spark = (
        SparkSession.builder
        .appName("correlation all similar")
        .config("spark.executor.memory", "16g")
        .config("spark.driver.memory", "16g")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.sql.broadcastTimeout", "7200")
        .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
        .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
        .getOrCreate()
    )

    # Run the routine defined in main()
    main(spark)

24/05/11 12:44:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/11 12:44:36 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/05/11 12:44:36 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/05/11 12:45:02 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/11 12:45:11 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/11 12:45:33 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/11 12:45:35 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.


{(126365, 212700): -0.2041241452319315, (19955, 221069): -0.7954951288348662, (157346, 243115): 0.30541548060657264, (116983, 193321): -0.060999428133041905, (11821, 290387): -0.2672612419124244, (6282, 42342): 0.27116307227332015, (11821, 256886): -0.17496355305594133, (126365, 164050): -0.26404445070327603, (19955, 129257): -0.5570860145311556, (123839, 254507): 0.20806259464411975, (19955, 29009): -0.5570860145311556, (226098, 278393): 0.47540983606557385, (19955, 269153): 0.0, (11308, 302349): -0.06843859108291546, (100351, 174117): -0.008261359951610172, (60507, 166734): 0.7637626158259734, (45553, 297891): 0.6507913734559682, (125979, 241804): -0.4391550328268399, (186881, 266885): 0.2581988897471611, (88779, 314723): -0.884651736929383, (126365, 314990): -0.4227971623322353, (76751, 117965): -0.42379344841321, (90557, 161092): -0.2536857024815635, (177400, 267170): -0.3306500356330541, (126365, 281537): 0.19094065395649326, (188488, 289956): 0.316227766016838, (165829, 307269): 

### Q2 random 100 correlation 

In [59]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.ml.linalg import DenseMatrix, Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import col, concat_ws, rand, collect_list, size

'''Random 100 correlation'''

def main(spark):
    """Print the Pearson rating correlation of 100 randomly drawn user pairs.

    Reads ``ml-latest-small/ratings.csv``, keeps users with at least 5
    collected movie ids, draws up to 100 random pairs (userIdA < userIdB) and,
    for each pair, computes the Pearson correlation of the two users' ratings
    on the movies both have rated. The sampled pairs, the per-pair
    dictionary and the average over all non-NaN correlations are printed.

    Args:
        spark: active SparkSession used for all reads and computations.

    Returns:
        None. Results are printed.
    """
    # numpy is needed for the final averaging but is not part of this cell's
    # import list, so bring it into scope here rather than relying on an
    # earlier cell having imported it.
    import numpy as np

    # 1. Read & transform the data
    ratings_df = spark.read.csv(
        '/home/hl5679/capstone-project-cap-19/ml-latest-small/ratings.csv',
        schema='userId INT, movieId STRING, rating FLOAT, timestamp BIGINT')
    ratings_df = ratings_df.drop('timestamp')

    # Create random user pairs from users with at least 5 collected movie ids
    ratings_df_filtered = (
        ratings_df
        .groupBy("userId")
        .agg(collect_list("movieId").alias("movieIds"))
        .filter(size("movieIds") >= 5)
    )
    user_ids = ratings_df_filtered.select("userId").distinct()
    user_pairs = (
        user_ids.alias("a").crossJoin(user_ids.alias("b"))
        .filter("a.userId < b.userId")
        .select(col("a.userId").alias("userIdA"), col("b.userId").alias("userIdB"))
    )

    # Draw the sample exactly once and rebuild it as a fixed local DataFrame.
    # A lazy orderBy(rand()).limit(100) would be re-evaluated (cross join and
    # sort included) by every later use, and Spark documents rand() as
    # non-deterministic in the general case, so later uses are not guaranteed
    # to see the same pairs.
    sampled_rows = user_pairs.orderBy(rand()).limit(100).collect()
    random_pairs = spark.createDataFrame(sampled_rows, schema=user_pairs.schema)
    random_pairs.show()

    # Every user id that appears on either side of a sampled pair
    ran_user_df = (random_pairs.select("userIdA")
                   .union(random_pairs.select("userIdB"))
                   .distinct()
                   .filter("userIdA is not null"))

    # Ratings restricted to the sampled users
    ran_filtered_ratings_df = (
        ratings_df
        .join(ran_user_df, ratings_df.userId == ran_user_df.userIdA, "inner")
        .drop('userIdA')
        .cache()
    )

    # One row per (pair, movie rated by both users) with the two ratings
    ran_common_ratings_df = (
        ran_filtered_ratings_df.alias("r1")
        .join(ran_filtered_ratings_df.alias("r2"),
              (col("r1.movieId") == col("r2.movieId")) &
              (col("r1.userId") != col("r2.userId")))
        .join(random_pairs,
              (col("r1.userId") == col("userIdA")) & (col("r2.userId") == col("userIdB")))
        .select(col("userIdA"), col("userIdB"),
                col("r1.rating").alias("rating1"), col("r2.rating").alias("rating2"))
    )

    # Assemble ratings into vectors for correlation calculation
    vector_assembler = VectorAssembler(inputCols=["rating1", "rating2"], outputCol="features")
    # Cached because the loop below filters this frame once per pair
    ran_vector_ratings_df = (
        vector_assembler.transform(ran_common_ratings_df)
        .select("userIdA", "userIdB", "features")
        .cache()
    )
    ran_vector_ratings_df.show()

    # 2. Calculate one correlation per pair
    ran_correlations = {}
    for row in sampled_rows:
        userA, userB = row.userIdA, row.userIdB
        ran_pair_data = ran_vector_ratings_df.filter((col("userIdA") == userA) & (col("userIdB") == userB))
        # Correlation.corr requires at least two rows to compute the covariance
        # matrix; fetching at most two rows answers that without a full count.
        if len(ran_pair_data.head(2)) < 2:
            continue
        corr_matrix = Correlation.corr(ran_pair_data, "features", "pearson").head()[0]
        if corr_matrix is not None:
            # Off-diagonal element = correlation between rating1 and rating2
            ran_correlations[(userA, userB)] = corr_matrix[0, 1]

    print(ran_correlations)
    # NaN appears when one user's common ratings are all identical; such pairs
    # are excluded from the average.
    ran_valid_corrs = [corr for corr in ran_correlations.values() if not np.isnan(corr)]
    # Avoid averaging an empty list (numpy would warn and return NaN anyway)
    ran_average_corr = np.mean(ran_valid_corrs) if ran_valid_corrs else float("nan")
    print(f"ran100 Average Pearson Correlation: {ran_average_corr}")

In [60]:
# Entry point: runs only when this code is executed as the main program
if __name__ == "__main__":

    # Build (or fetch the existing) Spark session for the correlation job
    spark = (
        SparkSession.builder
        .appName("correlation all similar")
        .config("spark.executor.memory", "16g")
        .config("spark.driver.memory", "16g")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.sql.broadcastTimeout", "7200")
        .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
        .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
        .getOrCreate()
    )

    # Run the routine defined in main()
    main(spark)

+-------+-------+
|userIdA|userIdB|
+-------+-------+
|      6|    130|
|     53|    480|
|     86|    321|
|    328|    555|
|    118|    251|
|     22|    269|
|    221|    452|
|     51|     89|
|     46|    308|
|    124|    549|
|    186|    269|
|    172|    379|
|    157|    546|
|     90|    116|
|      5|    507|
|      8|    419|
|     46|    388|
|    519|    574|
|    112|    363|
|    156|    417|
+-------+-------+
only showing top 20 rows



                                                                                

+-------+-------+---------+
|userIdA|userIdB| features|
+-------+-------+---------+
|    328|    555|[3.5,4.0]|
|      6|    130|[2.0,4.0]|
|      8|    419|[4.0,5.0]|
|     17|    233|[5.0,4.0]|
|     42|    566|[5.0,4.0]|
|     58|    477|[5.0,4.5]|
|     67|    304|[4.5,5.0]|
|    118|    251|[4.0,5.0]|
|    140|    166|[4.0,4.5]|
|    156|    417|[5.0,5.0]|
|    173|    588|[1.0,5.0]|
|    192|    385|[3.0,4.0]|
|    275|    290|[5.0,5.0]|
|    290|    294|[5.0,4.0]|
|    328|    555|[5.0,4.0]|
|    330|    418|[4.0,4.5]|
|    334|    353|[4.0,5.0]|
|    359|    609|[4.0,4.0]|
|    404|    416|[3.0,4.0]|
|    411|    606|[5.0,5.0]|
+-------+-------+---------+
only showing top 20 rows



24/05/11 13:49:39 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/11 13:49:50 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/11 13:53:45 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
24/05/11 13:53:55 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.


{(6, 130): -0.2533406866391176, (53, 480): nan, (86, 321): -0.4264014327112209, (328, 555): -0.04175932386355734, (118, 251): nan, (221, 452): 0.030987092168430933, (51, 89): -0.1804311640889395, (46, 308): 0.15196658067558388, (124, 549): 0.1889822365046136, (186, 269): 0.0, (172, 379): -0.9999999999999998, (90, 116): 0.9999999999999998, (5, 507): 0.26038690306103024, (8, 419): 0.2842676218074807, (112, 363): -0.1590958925924491, (156, 417): 0.11411646470852956, (274, 320): -0.2770979616076669, (359, 609): 0.46291004988627577, (95, 558): 0.777713771047819, (156, 361): -0.13873210524018872, (196, 570): -0.08799637965199711, (411, 606): 0.03383518421552754, (32, 112): 0.4631820399782618, (425, 443): 0.06465130348292425, (314, 435): 0.9759000729485333, (44, 195): -0.5477225575051662, (267, 301): -0.06565321642986127, (392, 534): 0.7196763181246417, (77, 351): 0.36261333437825766, (17, 233): 0.5749889084999453, (79, 372): 0.08189371934394457, (39, 350): 0.1856953381770519, (190, 426): 0.3