In [None]:
# (1) Import the required Python dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import DenseVector
from pyspark.mllib.linalg.distributed import RowMatrix

In [None]:
# (2) Instantiate a Spark Context
# Runs Spark in local mode and requests 4g of driver memory.
conf = (
    SparkConf()
    .setMaster("local")
    .setAppName("Principal Component Analysis - Movie Ratings")
    .set("spark.driver.memory", "4g")
)
# getOrCreate() keeps this cell re-runnable: constructing SparkContext(conf=conf) a second
# time in the same kernel raises because only one context may be active at once.
# NOTE: if a context already exists it is reused as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

In [None]:
# (3) Read the pivoted user/movie ratings file into a Spark DataFrame and report its shape.
# The file is pipe-delimited with a header row; column types are inferred by the reader.
user_movie_ratings_df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .option('header', 'true')
    .option('inferschema', 'true')
    .option('delimiter', '|')
    .load('./data/movie-ratings-data/user-movie-ratings.csv')
)
# Printed as a (row count, column count) tuple
print((
    user_movie_ratings_df.count(),
    len(user_movie_ratings_df.columns),
))

In [None]:
# (4) Assemble every column except userId (one column per movie) into a single
# 'features' vector column, keeping userId alongside it
feature_columns = list(user_movie_ratings_df.columns)
feature_columns.remove('userId')  # raises ValueError if the userId column is missing
vector_assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)
user_movie_ratings_features_df = (
    vector_assembler
    .transform(user_movie_ratings_df)
    .select(['userId', 'features'])
)
user_movie_ratings_features_df.show()

In [None]:
# (5) Standardise the feature vectors: centre each dimension on its mean and scale it
# to unit standard deviation, writing the result to a new 'std_features' column
standardizer = StandardScaler(
    inputCol='features',
    outputCol='std_features',
    withMean=True,
    withStd=True,
)
# Fit learns the per-dimension statistics; transform applies them to the same DataFrame
standardizer_model = standardizer.fit(user_movie_ratings_features_df)
user_movie_ratings_standardized_features_df = standardizer_model.transform(
    user_movie_ratings_features_df
)
user_movie_ratings_standardized_features_df.show()

In [None]:
# (6) Generate a RowMatrix (distributed Matrix with no index where each Row is a vector) from the scaled features DataFrame
# Each DataFrame Row holds a single 'std_features' vector, which is converted to a plain
# Python list so that RowMatrix can build its own vectors from it.
scaled_features_rows_rdd = user_movie_ratings_standardized_features_df.select("std_features").rdd
scaled_features_matrix = RowMatrix(scaled_features_rows_rdd.map(lambda row: row[0].tolist()))
print("Scaled Features Matrix Dimensions: \n")
print((scaled_features_matrix.numRows(), scaled_features_matrix.numCols()))
print("\nScaled Features Matrix (1st Row/Vector with 3000 elements): \n")
# Only the first row is displayed, so fetch just that row with first() instead of
# collect()-ing the entire distributed matrix into driver memory.
scaled_features_matrix_first_row = scaled_features_matrix.rows.first()
print(scaled_features_matrix_first_row)

In [None]:
# (7) Ask the RowMatrix for its leading principal components.
# The number of components to keep is held in a variable because later cells reuse it.
number_principal_components = 300
principal_components = scaled_features_matrix.computePrincipalComponents(
    k=number_principal_components
)
print("Top %d Principal Components: \n" % (number_principal_components,))
print(principal_components)

In [None]:
# (8) Project the original User Movie Ratings dataset from 3000 dimensions into 300 dimensions
# (via Matrix multiplication of the scaled features matrix with the matrix of principal components)
projected_matrix = scaled_features_matrix.multiply(principal_components)
print("Projected Matrix Dimensions: \n")
print((projected_matrix.numRows(), projected_matrix.numCols()))
print("\nProjected Matrix (1st Row/Vector with 300 elements): \n")
# Only the first projected row is displayed, so fetch just that row with first()
# instead of collect()-ing every projected row into driver memory.
projected_matrix_first_row = projected_matrix.rows.first()
print(projected_matrix_first_row)

In [None]:
# (9) Alternative route: run the DataFrame-based PCA estimator on the standardised features.
# It reads 'std_features', keeps number_principal_components components and writes the
# projected vectors to a new 'pca_features' column.
pca = PCA(
    k=number_principal_components,
    inputCol="std_features",
    outputCol="pca_features",
)
pca_model = pca.fit(user_movie_ratings_standardized_features_df)
user_movie_ratings_pca_df = pca_model.transform(
    user_movie_ratings_standardized_features_df
)
user_movie_ratings_pca_df.show()

In [None]:
# (10) Extract the Explained Variance (vector of proportions of variance explained) for each Principal Component
# Left as a bare final expression so the notebook displays the value as the cell output.
# It reads from the PCA model fitted in step (9), so that cell must have been run first.
pca_model.explainedVariance

In [None]:
# (11) Stop the Spark Context
# Shuts down the context created in step (2); cells that use `sc`, `sqlContext` or any
# DataFrame/RDD derived from them cannot be re-run after this without re-running step (2).
sc.stop()