# 1. Data Loading & Formatting

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml as ml

# Make PySpark launch its Python processes (PYSPARK_PYTHON) and its driver
# (PYSPARK_DRIVER_PYTHON) with the same interpreter that runs this kernel.
os.environ.update({
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})
from pyspark.sql.types import StringType, IntegerType

In [2]:
# Reuse the active Spark session if there is one, otherwise start a new one.
# NOTE(review): "YourAppName" looks like a template placeholder - consider a
# descriptive application name.
spark = (
    SparkSession.builder
    .appName("YourAppName")
    .getOrCreate()
)

In [3]:
def load_dfs():
    """Load the movie, user and rating tables into global Spark DataFrames.

    Side effects:
        * Rebinds the module-level names ``movies``, ``users`` and ``ratings``
          to cached DataFrames read through the global ``spark`` session.
        * Registers them as the temporary SQL views ``movies_info``,
          ``users_info`` and ``ratings_info``.

    The two pivoted feature files are read with their header row. The ratings
    file is read without one, so its four columns are named explicitly. No
    schema is supplied, so every column is loaded as a string.
    """
    global movies, users, ratings

    # Options shared by the two pre-computed feature files.
    pivoted_opts = {"encoding": "latin1", "header": True}

    movies = spark.read.csv("./Data/cleaned_data/pivoted_movies_features.csv", **pivoted_opts).cache()
    users = spark.read.csv("./Data/cleaned_data/pivoted_users_features.csv", **pivoted_opts).cache()

    # Headerless "::"-separated file: assign the column names by position.
    raw_ratings = spark.read.csv("./Data/movieLens/ratings.dat", sep="::", encoding="latin1")
    ratings = raw_ratings.toDF("user_id", "movie_id", "rating", "time_stamp").cache()

    for view_name, frame in (
        ("movies_info", movies),
        ("users_info", users),
        ("ratings_info", ratings),
    ):
        frame.createOrReplaceTempView(view_name)

In [4]:
# Populate the global `movies`, `users` and `ratings` DataFrames and register
# their temporary SQL views.
load_dfs()

In [5]:
# Inspect the schema as loaded: the CSV was read without a schema, so every
# column is still a string here.
movies.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- year: string (nullable = true)
 |-- watches: string (nullable = true)
 |-- avg_rating: string (nullable = true)
 |-- popularity_per_Action: string (nullable = true)
 |-- popularity_per_Adventure: string (nullable = true)
 |-- popularity_per_Animation: string (nullable = true)
 |-- popularity_per_Children's: string (nullable = true)
 |-- popularity_per_Comedy: string (nullable = true)
 |-- popularity_per_Crime: string (nullable = true)
 |-- popularity_per_Documentary: string (nullable = true)
 |-- popularity_per_Drama: string (nullable = true)
 |-- popularity_per_Fantasy: string (nullable = true)
 |-- popularity_per_Film-Noir: string (nullable = true)
 |-- popularity_per_Horror: string (nullable = true)
 |-- popularity_per_Musical: string (nullable = true)
 |-- popularity_per_Mystery: string (nullable = true)
 |-- popularity_per_Romance: string (nullable = true)
 |-- popularity_per_Sci-Fi: string (nullable = true)
 |-- popularity_per_Thr

In [6]:
# Cast the string-typed CSV columns to numeric types in ONE projection.
# Calling withColumn() in a loop adds a projection per column and inflates the
# query plan. A single select() produces the same columns, in the same order,
# and does not leave a stray loop variable in the notebook namespace.
movie_int_columns = {"movie_id", "year", "watches"}

movies = movies.select([
    # Identifier/count columns become integers; every other column is a float.
    movies[name].cast(IntegerType() if name in movie_int_columns else "float").alias(name)
    for name in movies.columns
])

In [7]:
# Give the movie-side columns unambiguous names before the join; the users
# table has an `avg_rating` column of its own.
movies = (
    movies
    .withColumnRenamed("avg_rating", "avg_movie_rating")
    .withColumnRenamed("year", "production_year")
)

In [8]:
# Inspect the users schema as loaded; the columns are all strings at this point.
users.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- region: string (nullable = true)
 |-- avg_rating: string (nullable = true)
 |-- watched_movies: string (nullable = true)
 |-- avg_rating_for_Action: string (nullable = true)
 |-- avg_rating_for_Adventure: string (nullable = true)
 |-- avg_rating_for_Animation: string (nullable = true)
 |-- avg_rating_for_Children's: string (nullable = true)
 |-- avg_rating_for_Comedy: string (nullable = true)
 |-- avg_rating_for_Crime: string (nullable = true)
 |-- avg_rating_for_Documentary: string (nullable = true)
 |-- avg_rating_for_Drama: string (nullable = true)
 |-- avg_rating_for_Fantasy: string (nullable = true)
 |-- avg_rating_for_Film-Noir: string (nullable = true)
 |-- avg_rating_for_Horror: string (nullable = true)
 |-- avg_rating_for_Musical: string (nullable = true)
 |-- avg_rating_for_Mystery: string (nullable = true)
 |-- avg_rating_for_Romance: string (nullable = t

In [9]:
# Cast the string-typed CSV columns to numeric types in ONE projection
# (a withColumn() loop adds a projection per column and inflates the plan).
# The original list repeated "age" and "region"; a set states each name once.
# NOTE(review): assumes `gender` and `region` are numerically encoded in the
# cleaned CSV - a non-numeric string would turn into null on this cast; confirm.
user_int_columns = {"user_id", "gender", "age", "region", "watched_movies"}

users = users.select([
    # Identifier/categorical/count columns become integers; the rest are floats.
    users[name].cast(IntegerType() if name in user_int_columns else "float").alias(name)
    for name in users.columns
])

In [10]:
# Give the user-side columns unambiguous names before the join with movies.
# NOTE(review): no `year` column appears in the visible part of the users
# schema printout; withColumnRenamed() is a no-op when the source column is
# absent, so confirm that this second rename actually applies.
users = (
    users
    .withColumnRenamed("avg_rating", "avg_user_rating")
    .withColumnRenamed("year", "favorite_movies_year")
)

In [11]:
# Inspect the ratings schema as loaded: four string columns named via toDF().
ratings.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- movie_id: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- time_stamp: string (nullable = true)



In [12]:
# Every ratings column is an integer. Cast them all in ONE projection rather
# than one withColumn() call (and one extra projection) per column.
ratings = ratings.select([
    ratings[name].cast(IntegerType()).alias(name)
    for name in ratings.columns
])

In [13]:
# The views registered inside load_dfs() still point at the original
# all-string DataFrames. Re-register them so that SQL queries see the cast and
# renamed versions.
for view_name, frame in (
    ("movies_info", movies),
    ("users_info", users),
    ("ratings_info", ratings),
):
    frame.createOrReplaceTempView(view_name)

In [14]:
# Build one row per rating, enriched with the rater's and the movie's feature
# columns. Joining with USING keeps a single `user_id` / `movie_id` column
# instead of one copy per side.
query = """
        SELECT *
        FROM (ratings_info INNER JOIN users_info USING (user_id)) INNER JOIN movies_info USING (movie_id)
"""

# Inner joins: ratings without a matching user or movie row are dropped.
result = spark.sql(query)
result.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- time_stamp: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- avg_user_rating: float (nullable = true)
 |-- watched_movies: integer (nullable = true)
 |-- avg_rating_for_Action: float (nullable = true)
 |-- avg_rating_for_Adventure: float (nullable = true)
 |-- avg_rating_for_Animation: float (nullable = true)
 |-- avg_rating_for_Children's: float (nullable = true)
 |-- avg_rating_for_Comedy: float (nullable = true)
 |-- avg_rating_for_Crime: float (nullable = true)
 |-- avg_rating_for_Documentary: float (nullable = true)
 |-- avg_rating_for_Drama: float (nullable = true)
 |-- avg_rating_for_Fantasy: float (nullable = true)
 |-- avg_rating_for_Film-Noir: float (nullable = true)
 |-- avg_rating_for_Horror: float (nullable = true)
 |-- avg_rating_for_Musical: float (

In [15]:
# Identifiers and the timestamp are removed here so that they are not picked
# up as model inputs when the remaining columns are assembled into features.
non_feature_columns = ("movie_id", "user_id", "time_stamp")
result = result.drop(*non_feature_columns)

In [16]:
from pyspark.ml.feature import VectorAssembler

# The label column; every other remaining column is used as a model input.
target_column = "rating"
features = [name for name in result.columns if name != target_column]

# Pack the input columns into the single vector column "features".
assembler = VectorAssembler(inputCols=features, outputCol="features")
data = assembler.transform(result)

In [17]:
# Check the schema after assembling; the VectorAssembler output is added as
# the column "features" alongside the original columns.
data.printSchema()

root
 |-- rating: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- avg_user_rating: float (nullable = true)
 |-- watched_movies: integer (nullable = true)
 |-- avg_rating_for_Action: float (nullable = true)
 |-- avg_rating_for_Adventure: float (nullable = true)
 |-- avg_rating_for_Animation: float (nullable = true)
 |-- avg_rating_for_Children's: float (nullable = true)
 |-- avg_rating_for_Comedy: float (nullable = true)
 |-- avg_rating_for_Crime: float (nullable = true)
 |-- avg_rating_for_Documentary: float (nullable = true)
 |-- avg_rating_for_Drama: float (nullable = true)
 |-- avg_rating_for_Fantasy: float (nullable = true)
 |-- avg_rating_for_Film-Noir: float (nullable = true)
 |-- avg_rating_for_Horror: float (nullable = true)
 |-- avg_rating_for_Musical: float (nullable = true)
 |-- avg_rating_for_Mystery: float (nullable = true)
 |-- avg_rating_for_Romance: float (nullable = true)
 

# 2. Train-Test Split

In [18]:
# 80/20 train/test split. Without a seed, randomSplit() draws a different
# partition on every run, so the metrics below could not be reproduced.
# NOTE(review): a fixed seed gives identical splits only while the upstream
# data and its partitioning stay the same.
SPLIT_SEED = 42
(training, test) = data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# 3. Linear Regression Fitting

In [19]:
# Baseline linear regression with an elastic-net penalty:
# regParam is the penalty strength, elasticNetParam the L1/L2 mix
# (0 = pure L2, 1 = pure L1).
lr = ml.regression.LinearRegression(
    featuresCol="features",
    labelCol=target_column,
    regParam=0.3,
    elasticNetParam=0.8,
)

In [20]:
# Fit the baseline model on the training split only.
lr_model = lr.fit(training)

In [21]:
# Score both splits with the baseline model; transform() adds a "prediction"
# column to each DataFrame.
train_predictions, test_predictions = (
    lr_model.transform(split) for split in (training, test)
)

In [22]:
# Eyeball the first rows of true label versus model output.
train_predictions.select("rating", "prediction").show()

+------+------------------+
|rating|        prediction|
+------+------------------+
|     1|3.1845266396817546|
|     1|3.8419226683188663|
|     1|3.7902606321505443|
|     1| 3.798528255468794|
|     1| 3.776635867538622|
|     1|3.4523816691381777|
|     1| 3.644434201423679|
|     1| 3.545338504311424|
|     1|  3.30996552347474|
|     1|3.5581640914754376|
|     1|3.6206323214036846|
|     1|3.6986150880706763|
|     1|3.7308130594719744|
|     1|  2.99173583143161|
|     1| 3.074394189793842|
|     1|3.0176331016721676|
|     1| 2.917751027789741|
|     1| 3.332640367246767|
|     1|3.0128860106958744|
|     1| 3.042390650308441|
+------+------------------+
only showing top 20 rows



In [23]:
import pyspark.ml.evaluation as evaluation

# RMSE between the true "rating" and the model's "prediction" column.
evaluator = evaluation.RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

# Evaluate the baseline model on both splits (training first, then test).
train_rmse, test_rmse = (
    evaluator.evaluate(predictions)
    for predictions in (train_predictions, test_predictions)
)

print(f"For Linear regression method, RMSE for training = {train_rmse}, while for testing = {test_rmse}")

For Linear regression method, RMSE for training = 0.9712509151951974, while for testing = 0.9715031030097021


## 3.1. Grid search for Regularization

In [24]:
# Exhaustive grid search over the elastic-net hyper-parameters.
# Both parameters sweep the same candidate values (8 x 8 = 64 fits).
grid_values = [0, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1]

# Column-oriented accumulator: one entry per grid point in each list.
results = {
    "regularization": [],
    "elasticNet": [],
    "train_rmse": [],
    "test_rmse": []
}

# NOTE(review): the best setting is later chosen by test RMSE, so the reported
# test error is optimistic; consider a separate validation split.
for regularization in grid_values:
    for elasticNet in grid_values:
        # Use a dedicated estimator name and score inline, so the baseline
        # `lr`, `train_predictions` and `test_predictions` from the earlier
        # cells are not silently overwritten by the last grid point.
        grid_lr = ml.regression.LinearRegression(featuresCol="features",
                                                 labelCol=target_column,
                                                 regParam=regularization,
                                                 elasticNetParam=elasticNet)

        model = grid_lr.fit(training)

        results["regularization"].append(regularization)
        results["elasticNet"].append(elasticNet)
        # `evaluator` is the RMSE evaluator created for the baseline model.
        results["train_rmse"].append(evaluator.evaluate(model.transform(training)))
        results["test_rmse"].append(evaluator.evaluate(model.transform(test)))

In [25]:
# Turn the collected lists into a pandas DataFrame (one row per grid point)
# and display it.
results = pd.DataFrame(results)
results

Unnamed: 0,regularization,elasticNet,train_rmse,test_rmse
0,0.0,0.00,0.907988,0.907408
1,0.0,0.01,0.907988,0.907408
2,0.0,0.05,0.907988,0.907408
3,0.0,0.10,0.907988,0.907408
4,0.0,0.25,0.907988,0.907408
...,...,...,...,...
59,1.0,0.10,0.960985,0.961378
60,1.0,0.25,1.010064,1.010573
61,1.0,0.50,1.101791,1.102478
62,1.0,0.75,1.116678,1.117423


In [26]:
# Show every grid point that ties for the lowest test RMSE.
results[results["test_rmse"].eq(results["test_rmse"].min())]

Unnamed: 0,regularization,elasticNet,train_rmse,test_rmse
0,0.0,0.0,0.907988,0.907408
1,0.0,0.01,0.907988,0.907408
2,0.0,0.05,0.907988,0.907408
3,0.0,0.1,0.907988,0.907408
4,0.0,0.25,0.907988,0.907408
5,0.0,0.5,0.907988,0.907408
6,0.0,0.75,0.907988,0.907408
7,0.0,1.0,0.907988,0.907408


In [27]:
# Persist the grid-search table to the current working directory, with a
# header row and without the pandas index column.
results.to_csv("./linear_regression.csv", header=True, index=False)