In [1]:
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator




In [2]:
# Text-classification stages: tokenize "text" into "words", hash the words into
# "features", then fit a logistic regression on them.
# NOTE(review): none of these objects is referenced by the other visible cells.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)
lr = LogisticRegression()
pipeline = Pipeline(
    stages=[tokenizer, hashingTF, lr],
)
# Explicit-feedback ALS recommender with non-negative factors.
# coldStartStrategy="drop": during cross-validation the held-out fold can contain
# users/movies that are absent from the training folds; ALS predicts NaN for
# those rows, which would turn the RMSE into NaN. Dropping them keeps the metric
# finite. Note that transform() will then omit rows for unseen users/movies.
# The explicit seed makes the factor initialisation reproducible between runs.
als = ALS(
    userCol="User",
    itemCol="Movie",
    ratingCol="rating",
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop",
    seed=42,
)

In [3]:
# Hyper-parameter grid for the ALS estimator. Each list holds a single value,
# so the grid contains exactly one combination.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [8])
    .addGrid(als.maxIter, [24])
    .addGrid(als.regParam, [0.2])
    .build()
)

In [4]:
# RMSE between the "rating" label column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)



In [5]:
# 3-fold cross-validation of the ALS estimator over `param_grid`, scored with
# `evaluator`. The explicit seed fixes the fold assignment so that results are
# reproducible after a kernel restart.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

In [6]:
# Pull every public name from the project-local `blend` module into the notebook
# namespace; `make_datasets` is used here.
# NOTE(review): `calculate_rmse` (used further down) presumably also comes from
# this star import; consider importing the needed names explicitly.
from blend import *
# make_datasets() returns four objects, unpacked in this order.
# NOTE(review): their exact types/contents are defined in `blend`; confirm there.
data_train,data_test,data_actual_train,data_actual_predict = make_datasets()

In [None]:
# Write `data_test` to "spark_test.csv" in the working directory.
# NOTE(review): no visible cell reads this file back; the Spark load below uses
# "actual_train.csv" instead.
data_test.to_csv("spark_test.csv")

In [7]:
# Load the training ratings from CSV, treating the first row as the header.
# No schema is given, so every column arrives as a string.
# NOTE(review): `spark` is not created in this notebook; presumably the kernel
# provides the session.
data = (
    spark.read.format("csv")
    .option("header", "true")
    .load("actual_train.csv")
)

In [None]:
# Peek at the first rows to sanity-check the load. The bare attribute
# `data.sample` was never called, so it only displayed the bound method.
data.show(5)

In [8]:
from pyspark.sql.types import IntegerType

# The CSV reader produced string columns; ALS and the evaluator need numeric
# ones, so cast the user id, movie id and rating columns to integers.
for column_name in ("User", "Movie", "rating"):
    data = data.withColumn(column_name, data[column_name].cast(IntegerType()))


In [9]:
# Remove the "_c0" column (no-op if it does not exist).
# NOTE(review): presumably this is the unnamed index column of the CSV; confirm.
data = data.drop("_c0")

In [None]:
# Display the DataFrame's repr to check the columns after the casts and drop.
data

In [None]:
# Run the cross-validation on the loaded training frame and keep the fitted model.
cvModel = cv.fit(data)

In [None]:
# Show the parameter grid the cross-validator searched. The method must be
# called; without the parentheses only the bound method object was displayed.
cv.getEstimatorParamMaps()

In [None]:
# Keep a handle on the best model selected by the cross-validation.
bestmodel = cvModel.bestModel

In [None]:
# Display the rank (number of latent factors) of the selected model.
bestmodel.rank

In [None]:
# Display the regularisation parameter of the selected model by going through
# the underlying Java object and its parent estimator.
# NOTE(review): `_java_obj` is a private attribute and may change between
# Spark versions.
(bestmodel
    ._java_obj     # Get Java object
    .parent()      # Get parent (ALS estimator)
    .getRegParam())

In [None]:
# Add a "prediction" column by applying the fitted model to `data`.
# NOTE(review): `data` is the same frame the model was fitted on
# ("actual_train.csv"), yet the predictions are later compared against
# `data_test["Rating"]`; confirm the rows actually correspond.
x = cvModel.transform(data)

In [None]:
# Collect the predictions to the driver as a pandas DataFrame (loads every row
# into local memory).
pd_df = x.toPandas()

In [None]:
# Predicted ratings as an array.
predict = pd_df["prediction"].values

In [None]:
# Reference ratings taken from the "Rating" column of `data_test`.
# NOTE(review): assumes the row order and length match `predict`; verify.
target = data_test["Rating"].values

In [None]:
# Display the RMSE between the predictions and the reference ratings.
# NOTE(review): `calculate_rmse` is not defined in this notebook; presumably it
# comes from `from blend import *`.
calculate_rmse(predict,target)

In [None]:
# NOTE(review): `testi` is not defined in any visible cell, so this raises
# NameError on a fresh kernel; it also overwrites `target` with predictions.
target = testi["prediction"].values