# bigdata-LA2 tutorial

## 1. Spark DataFrame API
### SparkSession

In [None]:
from pyspark.sql import SparkSession

# Get the existing session or create a new one for this tutorial, then
# report which Spark version it is running.
spark = (
    SparkSession.builder
    .appName("bigdata-LA2")
    .getOrCreate()
)
print("spark version:", spark.version)

### Create a DataFrame

In [None]:
# Toy ratings table: one (user, item, rating) triple per row.
df = spark.createDataFrame(
    [
        (0, 0, 4.0),
        (0, 1, 2.0),
        (1, 1, 3.0),
        (1, 2, 4.0),
        (2, 1, 1.0),
        (2, 2, 5.0),
    ],
    schema=["user", "item", "rating"],
)
df.show()

### DataFrame API

In [None]:
# Global average rating: aggregate the "rating" column with no grouping keys
# and pull the single value out of the resulting one-row frame.
mean = df.select("rating").groupBy().avg().first()[0]
print(mean)

# Average rating per user, exposed as the "user-mean" column.
user_mean = (
    df.groupBy("user")
    .avg("rating")
    .withColumnRenamed("avg(rating)", "user-mean")
)
user_mean.show()

# Average rating per item, exposed as the "item-mean" column.
item_mean = (
    df.groupBy("item")
    .avg("rating")
    .withColumnRenamed("avg(rating)", "item-mean")
)
item_mean.show()

def get_bias(df):
    """Return ``df`` extended with ``user-mean``, ``item-mean`` and ``bias``.

    The frame is joined with the module-level ``user_mean`` table on
    ``"user"`` and with ``item_mean`` on ``"item"``. Both joins use the
    default join type (inner), so rows whose user or item is missing from
    those tables do not appear in the result.

    ``bias`` is ``rating - (user-mean + item-mean - mean)``, where ``mean``
    is the module-level global average rating.
    """
    with_user_mean = df.join(user_mean, "user")
    joined = with_user_mean.join(item_mean, "item")
    bias = joined["rating"] - (joined["user-mean"] + joined["item-mean"] - mean)
    return joined.withColumn("bias", bias)

# Training ratings extended with the "user-mean", "item-mean" and "bias"
# columns produced by get_bias.
bias_df = get_bias(df)
bias_df.show()

## 2. Recommender systems

[Collaborative Filtering](http://spark.apache.org/docs/latest/ml-collaborative-filtering.html)

### Basic recommendation

In [None]:
# Test ratings: (user, item) pairs that do not occur in the training frame.
test = spark.createDataFrame(
    [(0, 2, 3.0), (1, 0, 3.0), (2, 0, 3.0)],
    schema=["user", "item", "rating"],
)
test.show()

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
# Column names are spelled out (they equal ALS's defaults) so the link to the
# DataFrame schema is explicit.
# coldStartStrategy="drop" removes rows whose user or item was not seen while
# fitting; with the default ("nan") such rows receive NaN predictions, which
# turns the RMSE computed later into NaN.
als = ALS(
    rank=10,
    maxIter=5,
    seed=0,
    userCol="user",
    itemCol="item",
    ratingCol="rating",
    coldStartStrategy="drop",
)
model = als.fit(df)

In [None]:
# Score the test (user, item) pairs with the fitted model; the result is
# evaluated later through its "prediction" column.
predictions = model.transform(test)
predictions.show()

In [None]:
# Root-mean-square error between the true "rating" and the model "prediction".
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("RMSE:", rmse)

### Recommendation with bias

In [None]:
# Attach "user-mean", "item-mean" and "bias" to the test set using the means
# computed from the training frame.
# NOTE: this rebinds `test`; re-running the cell would join the mean tables
# onto the already-extended frame a second time.
test = get_bias(test)
test.show()

In [None]:
# Fit ALS on the "bias" column of the extended training frame instead of the
# raw rating.
als = ALS(
    rank=10,
    maxIter=5,
    seed=0,
    ratingCol="bias",
)
model = als.fit(bias_df)

In [None]:
# Predict the bias for the test pairs; the frame passed in is the one that
# get_bias extended, so its mean columns are carried along.
predictions = model.transform(test)
predictions.show()

In [None]:
# Turn the predicted bias back into a rating by adding the baseline
# (user-mean + item-mean - mean), then score it against the true rating.
predictions_final = predictions.withColumn(
    "final_ratings",
    predictions["user-mean"]
    + predictions["item-mean"]
    - mean
    + predictions["prediction"],
)

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="final_ratings",
)
rmse = evaluator.evaluate(predictions_final)
predictions_final.show()
print("RMSE:", rmse)

In [None]:
# Stop the Spark session now that the tutorial is finished.
spark.stop()

[ALS Python docs](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.recommendation.ALS)