In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, rand

In [2]:
def train_val_test_split(df):

    filtered_unique_user = df.groupBy('userId').agg({'rating': 'count'}).withColumnRenamed('count(rating)', 'rating_count')
    filtered_unique_user = filtered_unique_user.filter(col('rating_count')>=10).select('userId')
    
    filtered_unique_user = filtered_unique_user.withColumn('rand', rand(seed=2024))
    filtered_unique_user = filtered_unique_user.orderBy('rand')
    filtered_unique_user = filtered_unique_user.drop(col('rand'))
    
    split_point = int(filtered_unique_user.count() * 0.7)

    train_users = filtered_unique_user.limit(split_point)
    val_test_users = filtered_unique_user.subtract(train_users)

    val_test_users = val_test_users.withColumn('rand', rand(seed=2024))
    val_test_users = val_test_users.orderBy('rand')
    val_test_users = val_test_users.drop(col('rand'))
    
    split_point = int(val_test_users.count() * 0.5)

    val_users = val_test_users.limit(split_point)
    test_users = val_test_users.subtract(val_users)
    
    print(train_users.count())
    print(val_test_users.count())
    print(val_users.count())
    print(test_users.count())
    
    train_ratings = df.join(train_users, on='userId', how='inner')
    val_ratings = df.join(val_users, on='userId', how='inner')
    test_ratings = df.join(test_users, on='userId', how='inner')
    
    val_users_list = [row["userId"] for row in val_users.collect()]
    test_users_list = [row["userId"] for row in test_users.collect()]
    
    val_fractions = dict(zip(val_users_list, [0.5 for _ in range(len(val_users_list))]))
    test_fractions = dict(zip(test_users_list, [0.5 for _ in range(len(test_users_list))]))
        
    val_train = val_ratings.sampleBy('userId', fractions=val_fractions, seed=2024)
    validation = val_ratings.subtract(val_train)

    test_train = test_ratings.sampleBy('userId', fractions=test_fractions, seed=2024)
    test = test_ratings.subtract(test_train)
    
    train = train_ratings.union(val_train).union(test_train)
    
    return train, validation, test

In [3]:
def main(spark, userID):

    ratings_small = spark.read.csv('/scratch/sjm643/sp24_bigd/ml-latest-small/ratings.csv', schema='userId INT, movieId INT, rating FLOAT, timestamp INT')

    print('ratings_smll schema')
    ratings_small.printSchema()

    train_small, val_small, test_small = train_val_test_split(ratings_small)
    
    print('train small', train_small.count(), train_small.select('userId').distinct().count())
    print('val small', val_small.count(), val_small.select('userId').distinct().count())
    print('test small', test_small.count(), test_small.select('userId').distinct().count())
    
    print('Save as parquet')
    train_small.write.parquet('/scratch/sjm643/sp24_bigd/rec_small/train.parquet')
    val_small.write.parquet('/scratch/sjm643/sp24_bigd/rec_small/val.parquet')
    test_small.write.parquet('/scratch/sjm643/sp24_bigd/rec_small/test.parquet')

In [4]:
if __name__ == "__main__":

    spark = SparkSession.builder.appName('q3').getOrCreate()

    userID = os.environ['USER']

    main(spark, userID)

24/05/11 15:28:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


ratings_smll schema
root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)
 |-- timestamp: integer (nullable = true)



                                                                                

427
183
91
92
train small 83554 610


                                                                                

val small 9923 91
test small 7359 92
Save as parquet


24/05/11 15:28:47 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/05/11 15:28:47 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/05/11 15:28:47 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [5]:
def main(spark, userID):

    ratings = spark.read.csv('/scratch/sjm643/sp24_bigd/ml-latest/ratings.csv', schema='userId INT, movieId INT, rating FLOAT, timestamp INT')

    print('ratings schema')
    ratings.printSchema()

    train, val, test = train_val_test_split(ratings)        
    
    print('train', train.count(), train.select('userId').distinct().count())
    print('val', val.count(), val.select('userId').distinct().count())
    print('test', test.count(), test.select('userId').distinct().count())  
    
    print('Save as parquet')
    train.write.parquet('/scratch/sjm643/sp24_bigd/rec/train.parquet')
    val.write.parquet('/scratch/sjm643/sp24_bigd/rec/val.parquet')
    test.write.parquet('/scratch/sjm643/sp24_bigd/rec/test.parquet')

In [6]:
if __name__ == "__main__":

    spark = SparkSession.builder.appName('q3').getOrCreate()

    userID = os.environ['USER']

    main(spark, userID)

ratings schema
root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)
 |-- timestamp: integer (nullable = true)



                                                                                

195739


                                                                                

83889


                                                                                

41944


                                                                                

41945


24/05/11 15:29:43 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
24/05/11 15:30:06 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
24/05/11 15:30:19 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
                                                                                

train 28605216 279624


24/05/11 15:30:27 WARN DAGScheduler: Broadcasting large task binary with size 1007.9 KiB
24/05/11 15:30:37 WARN DAGScheduler: Broadcasting large task binary with size 1055.0 KiB
24/05/11 15:30:43 WARN DAGScheduler: Broadcasting large task binary with size 1055.0 KiB
24/05/11 15:30:50 WARN DAGScheduler: Broadcasting large task binary with size 1007.9 KiB
24/05/11 15:31:00 WARN DAGScheduler: Broadcasting large task binary with size 1055.0 KiB
24/05/11 15:31:05 WARN DAGScheduler: Broadcasting large task binary with size 1057.7 KiB
24/05/11 15:31:09 WARN DAGScheduler: Broadcasting large task binary with size 1060.7 KiB
                                                                                

val 2506881 41944


24/05/11 15:31:25 WARN DAGScheduler: Broadcasting large task binary with size 1080.7 KiB
24/05/11 15:31:36 WARN DAGScheduler: Broadcasting large task binary with size 1063.6 KiB
24/05/11 15:31:45 WARN DAGScheduler: Broadcasting large task binary with size 1115.4 KiB
24/05/11 15:32:04 WARN DAGScheduler: Broadcasting large task binary with size 1080.7 KiB
24/05/11 15:32:16 WARN DAGScheduler: Broadcasting large task binary with size 1063.6 KiB
24/05/11 15:32:25 WARN DAGScheduler: Broadcasting large task binary with size 1124.7 KiB
                                                                                

test 2486459 41942
Save as parquet


24/05/11 15:32:43 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
24/05/11 15:32:44 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/05/11 15:32:44 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/05/11 15:32:44 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/05/11 15:32:44 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/05/11 15:32:44 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/05/11 15:32:44 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 58

24/05/11 15:32:55 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/05/11 15:32:55 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/05/11 15:32:55 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 58.46% for 13 writers
24/05/11 15:32:55 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/05/11 15:32:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/05/11 15:32:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/05/11 15:32:56 WARN MemoryManager: Total allocation exceeds 9