# Model Experiments

In [33]:
import sys
import os
%load_ext autoreload
%autoreload 2

sys.path.append(os.path.abspath("../src"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Spark + Data Initialisation

In [27]:
from utils.spark_session import get_spark
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, FloatType

# Session object from the project helper; only its `.read` interface is used here.
spark = get_spark()

# Explicit column types for the ratings file; every column is declared nullable.
ratings_schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in [
        ("userId", IntegerType()),
        ("movieId", IntegerType()),
        ("rating", FloatType()),
        ("timestamp", TimestampType()),
    ]
])

# Read the raw ratings CSV (first row is a header) using the schema above.
ratings = (
    spark.read
    .schema(ratings_schema)
    .option("header", True)
    .csv("../data/raw/rating.csv")
)

# Quick sanity check: print the applied schema and the first five rows.
ratings.printSchema()
ratings.show(5)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)
 |-- timestamp: timestamp (nullable = true)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+
only showing top 5 rows



## Data Split
To minimise data leakage, the ratings will be split chronologically so that a user's future preferences aren't erroneously used to train the model that predicts their earlier ones.

To avoid cold-start problems, each user will be represented in both the training and test sets. The high minimum of 20 ratings per user makes this possible.

In [49]:
# No manual importlib.reload here: `%autoreload 2` is enabled in the setup cell,
# so edits to data/split_data.py are picked up automatically before each run.
from data.split_data import chron_user_tt_split

# Fraction passed to the splitter as its last argument.
# NOTE(review): presumably the per-user training share -- confirm against
# chron_user_tt_split's signature.
TRAIN_FRACTION = 0.8

# Split `ratings` into train/test, keyed on the user and timestamp columns.
train, test = chron_user_tt_split(ratings, 'userId', 'timestamp', TRAIN_FRACTION)

In [52]:
# Realised split: share of all rows that landed in the training set.
# Each count() triggers a Spark job, so both are taken once and reused.
c_train, c_test = train.count(), test.count()
c_total = c_train + c_test
c_train / c_total

                                                                                

0.8021461517781041