# 1. Data Loading & Formatting

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml as ml

# Point both PySpark interpreter variables at the Python binary running this
# kernel (sys.executable).
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)
from pyspark.sql.types import StringType, IntegerType

In [2]:
# Obtain the notebook-wide Spark session (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName("YourAppName")
    .getOrCreate()
)

In [3]:
def load_dfs(
    movies_path="./Data/cleaned_data/pivoted_movies_features.csv",
    users_path="./Data/cleaned_data/pivoted_users_features.csv",
    ratings_path="./Data/movieLens/ratings.dat",
):
    """Load the movies, users and ratings data into the global DataFrames.

    Sets the module-level names ``movies``, ``users`` and ``ratings``. Each
    DataFrame is cached and registered as a temp view (``movies_info``,
    ``users_info``, ``ratings_info``) through the global ``spark`` session.

    The defaults are the paths this notebook originally hard-coded, so a bare
    ``load_dfs()`` call behaves exactly as before.

    Args:
        movies_path: CSV file with a header row, read with latin1 encoding.
        users_path: CSV file with a header row, read with latin1 encoding.
        ratings_path: Headerless file whose fields are separated by ``::``;
            its four columns are named user_id, movie_id, rating, time_stamp.

    Returns:
        None. Results are published through the three globals and the views.

    Note:
        No schema or ``inferSchema`` option is passed to the reader, so the
        column types are left to Spark's CSV defaults; cast them afterwards.
    """
    global movies, users, ratings

    movies = spark.read.csv(movies_path, encoding="latin1", header=True).cache()
    movies.createOrReplaceTempView("movies_info")

    users = spark.read.csv(users_path, encoding="latin1", header=True).cache()
    users.createOrReplaceTempView("users_info")

    # The ratings file has no header row, so the columns are named explicitly.
    ratings = spark.read.csv(ratings_path, sep="::", encoding="latin1")
    ratings = ratings.toDF("user_id", "movie_id", "rating", "time_stamp").cache()
    ratings.createOrReplaceTempView("ratings_info")

In [4]:
# Populate the global `movies`, `users` and `ratings` DataFrames and register
# their temp views.
load_dfs()

In [5]:
# Inspect the column names and types of the freshly loaded movies DataFrame.
movies.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- year: string (nullable = true)
 |-- watches: string (nullable = true)
 |-- avg_rating: string (nullable = true)
 |-- popularity_per_Action: string (nullable = true)
 |-- popularity_per_Adventure: string (nullable = true)
 |-- popularity_per_Animation: string (nullable = true)
 |-- popularity_per_Children's: string (nullable = true)
 |-- popularity_per_Comedy: string (nullable = true)
 |-- popularity_per_Crime: string (nullable = true)
 |-- popularity_per_Documentary: string (nullable = true)
 |-- popularity_per_Drama: string (nullable = true)
 |-- popularity_per_Fantasy: string (nullable = true)
 |-- popularity_per_Film-Noir: string (nullable = true)
 |-- popularity_per_Horror: string (nullable = true)
 |-- popularity_per_Musical: string (nullable = true)
 |-- popularity_per_Mystery: string (nullable = true)
 |-- popularity_per_Romance: string (nullable = true)
 |-- popularity_per_Sci-Fi: string (nullable = true)
 |-- popularity_per_Thr

In [8]:
# Cast every movie column in ONE projection instead of calling withColumn once
# per column: repeated withColumn calls each add a projection to the plan.
# Identifier/count columns become integers; all remaining columns become float.
MOVIE_INT_COLUMNS = {"movie_id", "year", "watches"}
movies = movies.select(
    *[
        movies[name]
        .cast(IntegerType() if name in MOVIE_INT_COLUMNS else "float")
        .alias(name)  # keep the original column name and position
        for name in movies.columns
    ]
)

In [13]:
# Give the movie-side columns unambiguous names; the users table also has an
# `avg_rating` column, so the two would otherwise clash once joined.
movies = (
    movies
    .withColumnRenamed("avg_rating", "avg_movie_rating")
    .withColumnRenamed("year", "production_year")
)

In [9]:
# Inspect the column names and types of the freshly loaded users DataFrame.
users.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- region: string (nullable = true)
 |-- avg_rating: string (nullable = true)
 |-- watched_movies: string (nullable = true)
 |-- avg_rating_for_Action: string (nullable = true)
 |-- avg_rating_for_Adventure: string (nullable = true)
 |-- avg_rating_for_Animation: string (nullable = true)
 |-- avg_rating_for_Children's: string (nullable = true)
 |-- avg_rating_for_Comedy: string (nullable = true)
 |-- avg_rating_for_Crime: string (nullable = true)
 |-- avg_rating_for_Documentary: string (nullable = true)
 |-- avg_rating_for_Drama: string (nullable = true)
 |-- avg_rating_for_Fantasy: string (nullable = true)
 |-- avg_rating_for_Film-Noir: string (nullable = true)
 |-- avg_rating_for_Horror: string (nullable = true)
 |-- avg_rating_for_Musical: string (nullable = true)
 |-- avg_rating_for_Mystery: string (nullable = true)
 |-- avg_rating_for_Romance: string (nullable = t

In [10]:
# Cast every user column in ONE projection instead of calling withColumn once
# per column: repeated withColumn calls each add a projection to the plan.
# The listed columns become integers; all remaining columns become float.
# (The original list repeated "age" and "region"; the duplicates had no effect.)
USER_INT_COLUMNS = {"user_id", "gender", "age", "region", "watched_movies"}
users = users.select(
    *[
        users[name]
        .cast(IntegerType() if name in USER_INT_COLUMNS else "float")
        .alias(name)  # keep the original column name and position
        for name in users.columns
    ]
)

In [14]:
# Give the user-side columns unambiguous names; the movies table also has an
# `avg_rating` column.
# NOTE(review): `year` does not appear in the visible (truncated) part of the
# printed users schema — confirm the column exists; renaming a missing column
# does nothing.
users = (
    users
    .withColumnRenamed("avg_rating", "avg_user_rating")
    .withColumnRenamed("year", "favorite_movies_year")
)

In [15]:
# Inspect the column names and types of the ratings DataFrame.
# NOTE(review): the saved output shows integer columns although the integer
# cast only happens in the next cell — the output was probably captured after
# that cell had already run; re-check with Restart & Run All.
ratings.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- movie_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- time_stamp: integer (nullable = true)



In [16]:
# Cast all rating columns to integer in ONE projection rather than one
# withColumn call per column (each call adds a projection to the plan).
ratings = ratings.select(
    *[ratings[name].cast(IntegerType()).alias(name) for name in ratings.columns]
)

In [17]:
# Re-register the temp views: the views created inside load_dfs() were bound
# to the DataFrames as loaded, before the casts and renames above.
for view_name, frame in (
    ("movies_info", movies),
    ("users_info", users),
    ("ratings_info", ratings),
):
    frame.createOrReplaceTempView(view_name)

In [18]:
# Build one wide table: each rating row joined with its user's columns and its
# movie's columns. The inner joins keep only ratings that match on both sides.
# The printed schema shows user_id and movie_id once each in the result.
query = """
        SELECT *
        FROM (ratings_info INNER JOIN users_info USING (user_id)) INNER JOIN movies_info USING (movie_id)
"""

result = spark.sql(query)
result.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- time_stamp: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- avg_user_rating: float (nullable = true)
 |-- watched_movies: integer (nullable = true)
 |-- avg_rating_for_Action: float (nullable = true)
 |-- avg_rating_for_Adventure: float (nullable = true)
 |-- avg_rating_for_Animation: float (nullable = true)
 |-- avg_rating_for_Children's: float (nullable = true)
 |-- avg_rating_for_Comedy: float (nullable = true)
 |-- avg_rating_for_Crime: float (nullable = true)
 |-- avg_rating_for_Documentary: float (nullable = true)
 |-- avg_rating_for_Drama: float (nullable = true)
 |-- avg_rating_for_Fantasy: float (nullable = true)
 |-- avg_rating_for_Film-Noir: float (nullable = true)
 |-- avg_rating_for_Horror: float (nullable = true)
 |-- avg_rating_for_Musical: float (

In [19]:
# Remove the two join keys and the timestamp from the joined table, leaving
# the rating plus the user and movie columns.
result = result.drop("movie_id", "user_id", "time_stamp")

# 2. Train-Test Split

In [20]:
# 80/20 train/test split. A fixed seed makes the split repeatable across
# kernel restarts; without it every run draws a different partition.
# NOTE(review): this splits the raw `ratings` table, not the joined feature
# table `result` built above — confirm which one the models should train on.
SPLIT_SEED = 42
training, test = ratings.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [22]:
# Exploratory: list the names available in pyspark.ml.regression.
dir(ml.regression)

['ABCMeta',
 'AFTSurvivalRegression',
 'AFTSurvivalRegressionModel',
 'Any',
 'DataFrame',
 'DecisionTreeRegressionModel',
 'DecisionTreeRegressor',
 'Dict',
 'FMRegressionModel',
 'FMRegressor',
 'GBTRegressionModel',
 'GBTRegressor',
 'GeneralJavaMLWritable',
 'GeneralizedLinearRegression',
 'GeneralizedLinearRegressionModel',
 'GeneralizedLinearRegressionSummary',
 'GeneralizedLinearRegressionTrainingSummary',
 'Generic',
 'HasAggregationDepth',
 'HasElasticNetParam',
 'HasFeaturesCol',
 'HasFitIntercept',
 'HasLabelCol',
 'HasLoss',
 'HasMaxBlockSizeInMB',
 'HasMaxIter',
 'HasPredictionCol',
 'HasRegParam',
 'HasSeed',
 'HasSolver',
 'HasStandardization',
 'HasStepSize',
 'HasTol',
 'HasTrainingSummary',
 'HasVarianceCol',
 'HasWeightCol',
 'IsotonicRegression',
 'IsotonicRegressionModel',
 'JM',
 'JavaEstimator',
 'JavaMLReadable',
 'JavaMLWritable',
 'JavaModel',
 'JavaPredictionModel',
 'JavaPredictor',
 'JavaTransformer',
 'JavaWrapper',
 'LinearRegression',
 'LinearRegressionM

In [None]:
# Presumably a shortlist of regression classes to try next. The bare names
# were never imported, so the original cell would raise NameError; they are
# referenced through the `ml` alias here so the cell runs on a fresh kernel.
# NOTE(review): the last three are fitted-model classes; the estimators that
# expose .fit() are LinearRegression, RandomForestRegressor and
# DecisionTreeRegressor — confirm which were intended.
candidate_regressors = [
    ml.regression.GeneralizedLinearRegression,
    ml.regression.LinearRegressionModel,
    ml.regression.RandomForestRegressionModel,
    ml.regression.DecisionTreeRegressionModel,
]