# 1. Data Loading & Formatting

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml as ml

# Point both the Spark driver and its workers at the interpreter running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)
from pyspark.sql.types import StringType, IntegerType

In [2]:
# Reuse the active Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("YourAppName")
    .getOrCreate()
)

In [3]:
def load_dfs(path="./Data/cleaned_data_1/unified_rating_features.csv", encoding="latin1"):
    """Load the unified rating-features CSV into a cached Spark DataFrame.

    The file is read with a header row and without schema inference, so every
    column arrives as a string and has to be cast by the caller.

    Args:
        path: Location of the CSV file; defaults to the notebook's dataset.
        encoding: Character encoding used to decode the file.

    Returns:
        The cached DataFrame. For backward compatibility it is also bound to
        the module-level name ``result`` and registered as the temporary view
        ``result_info``.
    """
    global result
    result = spark.read.csv(path, encoding=encoding, header=True).cache()
    result.createOrReplaceTempView("result_info")
    return result

In [4]:
# Populate the module-level `result` DataFrame and register the "result_info" temp view.
load_dfs()

In [5]:
# Inspect the loaded columns: the CSV is read without schema inference, so every column is a string.
result.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- time_stamp: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- region: string (nullable = true)
 |-- avg_rating8: string (nullable = true)
 |-- avg_rating9: string (nullable = true)
 |-- watched_movies: string (nullable = true)
 |-- avg_rating_for_Action: string (nullable = true)
 |-- avg_rating_for_Adventure: string (nullable = true)
 |-- avg_rating_for_Animation: string (nullable = true)
 |-- avg_rating_for_Children's: string (nullable = true)
 |-- avg_rating_for_Comedy: string (nullable = true)
 |-- avg_rating_for_Crime: string (nullable = true)
 |-- avg_rating_for_Documentary: string (nullable = true)
 |-- avg_rating_for_Drama: string (nullable = true)
 |-- avg_rating_for_Fantasy: string (nullable = true)
 |-- avg_rating_for_Film-Noir: string (nullable = true)
 |--

In [6]:
# Columns that hold whole-number values; every other column is cast to float.
# Names listed here that are not present in the frame are simply never matched.
INTEGER_COLUMNS = {
    "movie_id", "year", "watches", "user_id",
    "gender", "age", "region", "watched_movies",
}

# Cast all columns in a single projection. Calling withColumn once per column
# stacks one projection per call and can produce very large query plans.
# NOTE(review): values that cannot be parsed as numbers become null after the
# cast — confirm that columns such as "gender" are already numerically coded.
result = result.select([
    result[name].cast(IntegerType() if name in INTEGER_COLUMNS else "float").alias(name)
    for name in result.columns
])

In [7]:
# Identifiers and the raw timestamp are removed before the feature vector is assembled.
NON_FEATURE_COLUMNS = ("movie_id", "user_id", "time_stamp")
result = result.drop(*NON_FEATURE_COLUMNS)

In [8]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

# Map each distinct occupation value to a numeric category index.
indexer = StringIndexer(inputCol="occupation", outputCol="occupation_index")
indexer_model = indexer.fit(result)
result = indexer_model.transform(result)

# Expand the category index into a one-hot encoded vector column.
encoder = OneHotEncoder(inputCol="occupation_index", outputCol="occupation_encoded")
encoder_model = encoder.fit(result)
result = encoder_model.transform(result)

In [9]:
# Only "occupation_encoded" is kept; the index and the original column are no longer needed.
SUPERSEDED_OCCUPATION_COLUMNS = ("occupation_index", "occupation")
result = result.drop(*SUPERSEDED_OCCUPATION_COLUMNS)

In [10]:
from pyspark.ml.feature import VectorAssembler

target_column = "rating"
# Every column except the label is packed into the single "features" vector.
features = [name for name in result.columns if name != target_column]
assembler = VectorAssembler(inputCols=features, outputCol="features")
data = assembler.transform(result)

In [11]:
# Check the column types after casting and feature assembly.
data.printSchema()

root
 |-- rating: float (nullable = true)
 |-- gender: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- avg_rating8: float (nullable = true)
 |-- avg_rating9: float (nullable = true)
 |-- watched_movies: integer (nullable = true)
 |-- avg_rating_for_Action: float (nullable = true)
 |-- avg_rating_for_Adventure: float (nullable = true)
 |-- avg_rating_for_Animation: float (nullable = true)
 |-- avg_rating_for_Children's: float (nullable = true)
 |-- avg_rating_for_Comedy: float (nullable = true)
 |-- avg_rating_for_Crime: float (nullable = true)
 |-- avg_rating_for_Documentary: float (nullable = true)
 |-- avg_rating_for_Drama: float (nullable = true)
 |-- avg_rating_for_Fantasy: float (nullable = true)
 |-- avg_rating_for_Film-Noir: float (nullable = true)
 |-- avg_rating_for_Horror: float (nullable = true)
 |-- avg_rating_for_Musical: float (nullable = true)
 |-- avg_rating_for_Mystery: float (nullable = true)
 |-- avg_rating_fo

# 2. Train-Test Split

In [12]:
# Roughly 80% of rows for training and 20% for testing. The fixed seed keeps the
# split — and every metric computed from it — the same on each run.
(training, test) = data.randomSplit([0.8, 0.2], seed=42)

# 3. Decision Tree Fitting

In [13]:
from pyspark.ml.feature import VectorAssembler

target_column = "rating"
# Every column except the label is packed into the single "features" vector.
features = [name for name in result.columns if name != target_column]
assembler = VectorAssembler(inputCols=features, outputCol="features")
data = assembler.transform(result)

# Roughly 80% of rows for training and 20% for testing. The fixed seed keeps the
# split — and the RMSE values reported below — the same on each run.
(training, test) = data.randomSplit([0.8, 0.2], seed=42)

In [14]:
# Baseline regressor with library-default hyperparameters, reading the assembled "features" vector.
dt = ml.regression.DecisionTreeRegressor(
    labelCol=target_column,
    featuresCol="features",
)

In [15]:
# Fit the baseline tree on the training split only.
dt_model = dt.fit(training)

In [16]:
# Score both splits with the fitted tree; each result gains a "prediction" column.
train_predictions, test_predictions = (
    dt_model.transform(split) for split in (training, test)
)

In [17]:
# Spot-check true ratings against predictions on the training split.
train_predictions.select("rating", "prediction").show()

+------+------------------+
|rating|        prediction|
+------+------------------+
|   1.0| 2.767195767195767|
|   1.0|3.2054069119286512|
|   1.0| 4.258020415603354|
|   1.0|3.9740315173037852|
|   1.0| 2.287023918659888|
|   1.0| 2.287023918659888|
|   1.0|3.6702985127857626|
|   1.0| 1.997671904880685|
|   1.0|4.1774410774410775|
|   1.0| 2.767195767195767|
|   1.0| 1.997671904880685|
|   1.0| 2.287023918659888|
|   1.0| 2.287023918659888|
|   1.0| 2.287023918659888|
|   1.0| 3.238777908343126|
|   1.0| 3.238777908343126|
|   1.0|3.6702985127857626|
|   1.0| 3.247998418503509|
|   1.0|3.9740315173037852|
|   1.0|2.4873834962491475|
+------+------------------+
only showing top 20 rows



In [18]:
import pyspark.ml.evaluation as evaluation

# Root-mean-squared error between the "rating" label and the model's "prediction".
evaluator = evaluation.RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

train_rmse, test_rmse = (
    evaluator.evaluate(predictions)
    for predictions in (train_predictions, test_predictions)
)

print(f"For Decision Tree method, RMSE for training = {train_rmse}, while for testing = {test_rmse}" )

For Decision Tree method, RMSE for training = 0.9226861131843943, while for testing = 0.9223384917219788


## 3.1. Grid Search for Hyperparameters

In [19]:
import gc

# Per-depth metrics, stored column-wise so they can be turned into a table afterwards.
results = {
    "maxDepth": [],
    "train_rmse": [],
    "test_rmse": []
}

# Tree depths to evaluate.
MAX_DEPTH_GRID = [3, 4, 5, 6, 7, 8, 9, 10]


def _rmse_for_max_depth(max_depth):
    """Fit a decision tree of the given depth on `training`.

    Args:
        max_depth: Value passed to the regressor's ``maxDepth`` parameter.

    Returns:
        A ``(train_rmse, test_rmse)`` tuple computed with the notebook's
        ``evaluator`` on the ``training`` and ``test`` splits.
    """
    # Everything here is local, so the notebook-level `dt`, `train_predictions`
    # and `test_predictions` from the baseline cells are left intact and the
    # per-depth estimator, model and prediction frames are released on return.
    tree = ml.regression.DecisionTreeRegressor(
        featuresCol="features", labelCol=target_column, maxDepth=max_depth
    )
    fitted = tree.fit(training)
    return (
        evaluator.evaluate(fitted.transform(training)),
        evaluator.evaluate(fitted.transform(test)),
    )


# NOTE(review): the best depth is later picked by its test RMSE, so that value
# is not an unbiased estimate of generalisation error — consider a validation split.
for maxDepth in MAX_DEPTH_GRID:
    depth_train_rmse, depth_test_rmse = _rmse_for_max_depth(maxDepth)

    results["maxDepth"].append(maxDepth)
    results["train_rmse"].append(depth_train_rmse)
    results["test_rmse"].append(depth_test_rmse)

    # Reclaim any lingering wrapper objects before the next fit.
    gc.collect()

In [20]:
# Turn the per-depth lists into a table with one row per maxDepth.
# Note: this rebinds `results` from a dict to a pandas DataFrame.
results = pd.DataFrame(results)
results

Unnamed: 0,maxDepth,train_rmse,test_rmse
0,3,0.951204,0.951319
1,4,0.933169,0.933082
2,5,0.922686,0.922338
3,6,0.915845,0.915822
4,7,0.910959,0.911669
5,8,0.905905,0.907761
6,9,0.900314,0.904538
7,10,0.89371,0.901793


In [21]:
# Show the row(s) whose test RMSE is the lowest of the sweep.
best_test_rmse = results["test_rmse"].min()
results.loc[results["test_rmse"].eq(best_test_rmse)]

Unnamed: 0,maxDepth,train_rmse,test_rmse
7,10,0.89371,0.901793


In [22]:
# Save the sweep table to the current working directory; the row index is
# omitted so the file holds only the maxDepth / train_rmse / test_rmse columns.
results.to_csv("./decision_tree.csv", header=True, index=False)