# Workflow demonstration with a recommender engine on a sampled dataset from Transactions.csv using ALS Model
### This is the notebook for deployment

### Importing the libraries and starting the Spark Session

In [21]:
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.context import SparkContext 
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator

In [22]:
sc =SparkContext.getOrCreate()

In [23]:
#from pyspark.context import 
spark = SparkSession(sc)

### Add asset from remote connection 

In [24]:
import dsx_core_utils, requests, os, io
from pyspark.sql import SparkSession

final_stat = None
dataSet = dsx_core_utils.get_remote_data_set_info('TRANSACTIONS')
dataSource = dsx_core_utils.get_data_source_info(dataSet['datasource'])
sparkSession = SparkSession(sc).builder.getOrCreate()
# Load JDBC data to Spark dataframe
dbTableOrQuery = '"' + (dataSet['schema'] + '"."' if(len(dataSet['schema'].strip()) != 0) else '') + dataSet['table'] + '"'
if (dataSet['query']):
    dbTableOrQuery = "(" + dataSet['query'] + ") TBL"
final_stat = sparkSession.read.format("jdbc").option("url", dataSource['URL']).option("dbtable", dbTableOrQuery).option("user",dataSource['user']).option("password",dataSource['password']).load()
final_stat.show(5)

+-------+--------+---------+
| ID_CTE|ID_CLAS1|FREQUENCY|
+-------+--------+---------+
|3973618|    1313|        1|
|3973618|    1319|        5|
|3973618|    1327|        2|
|3973618|    2142|        1|
|3973618|    2302|        1|
+-------+--------+---------+
only showing top 5 rows



### Preparing data for the model

In [25]:
ratings = (final_stat
    .select(
        'ID_CTE',
        'ID_CLAS1',
        'FREQUENCY',
    )
).cache()

In [26]:
ratings.dtypes

[('ID_CTE', 'int'), ('ID_CLAS1', 'int'), ('FREQUENCY', 'int')]

In [27]:
ratings.limit(10000)

DataFrame[ID_CTE: int, ID_CLAS1: int, FREQUENCY: int]

### Spliting the data set to test and train for measuring the performance of the ALS Model

In [28]:
(training, test) = ratings.randomSplit([0.8, 0.2])



### Build the recommendation model using ALS on the training data


In [29]:
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=2, regParam=0.01, 
          userCol="ID_CTE", itemCol="ID_CLAS1", ratingCol="FREQUENCY",
          coldStartStrategy="drop",
          implicitPrefs=True)

model = als.fit(ratings)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="FREQUENCY",
                                predictionCol="prediction")

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.9599291408031267


## Parameters of ALS Model in PySpark realization are following:

##### NumBlocks is the number of blocks the users and items will be partitioned into in order to parallelize computation.
##### rank is the number of latent factors in the model.
##### maxIter is the maximum number of iterations to run.
##### regParam specifies the regularization parameter in ALS.
##### implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data (defaults to false which means using explicit feedback).
##### alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations (defaults to 1.0)

###  Generate top 10 Item recommendations for each user



In [30]:
userRecs = model.recommendForAllUsers(10)
userRecs.count()



905251

In [31]:
userRecs.take(10)

[Row(ID_CTE=3974081, recommendations=[Row(ID_CLAS1=2229, rating=0.9145445227622986), Row(ID_CLAS1=2224, rating=0.9050465822219849), Row(ID_CLAS1=2295, rating=0.8862479329109192), Row(ID_CLAS1=2210, rating=0.32449960708618164), Row(ID_CLAS1=2248, rating=0.2460334151983261), Row(ID_CLAS1=2299, rating=0.2302532196044922), Row(ID_CLAS1=2240, rating=0.22628401219844818), Row(ID_CLAS1=1868, rating=0.2160385549068451), Row(ID_CLAS1=2244, rating=0.21348370611667633), Row(ID_CLAS1=1774, rating=0.20316047966480255)]),
 Row(ID_CTE=3974508, recommendations=[Row(ID_CLAS1=1380, rating=1.3829349279403687), Row(ID_CLAS1=1105, rating=1.1871720552444458), Row(ID_CLAS1=1102, rating=1.1801692247390747), Row(ID_CLAS1=2224, rating=1.1721549034118652), Row(ID_CLAS1=2229, rating=1.1496870517730713), Row(ID_CLAS1=1319, rating=1.1332440376281738), Row(ID_CLAS1=1110, rating=1.128908634185791), Row(ID_CLAS1=1310, rating=1.1223106384277344), Row(ID_CLAS1=1864, rating=1.1146060228347778), Row(ID_CLAS1=1861, rating=

### Display the recommendations and get them in the correct format

In [32]:
from pyspark.sql.functions import explode
userRecs1=userRecs.withColumn("recommendations", explode(userRecs.recommendations))
userRecs1.show()

+-------+-----------------+
| ID_CTE|  recommendations|
+-------+-----------------+
|3974081| [2229,0.9145445]|
|3974081| [2224,0.9050466]|
|3974081|[2295,0.88624793]|
|3974081| [2210,0.3244996]|
|3974081|[2248,0.24603342]|
|3974081|[2299,0.23025322]|
|3974081|[2240,0.22628401]|
|3974081|[1868,0.21603855]|
|3974081| [2244,0.2134837]|
|3974081|[1774,0.20316048]|
|3974508| [1380,1.3829349]|
|3974508|  [1105,1.187172]|
|3974508| [1102,1.1801692]|
|3974508| [2224,1.1721549]|
|3974508|  [2229,1.149687]|
|3974508|  [1319,1.133244]|
|3974508| [1110,1.1289086]|
|3974508| [1310,1.1223106]|
|3974508|  [1864,1.114606]|
|3974508| [1861,1.0966117]|
+-------+-----------------+
only showing top 20 rows



####  Breaking down reach recommendation to separate columns

In [33]:
import select as s


In [34]:

userRecs1= userRecs1 \
  .select('ID_CTE', 'recommendations.*')    
   

### Display the results

In [35]:
userRecs1.show() 

+-------+--------+----------+
| ID_CTE|ID_CLAS1|    rating|
+-------+--------+----------+
|3974081|    2229| 0.9145445|
|3974081|    2224| 0.9050466|
|3974081|    2295|0.88624793|
|3974081|    2210| 0.3244996|
|3974081|    2248|0.24603342|
|3974081|    2299|0.23025322|
|3974081|    2240|0.22628401|
|3974081|    1868|0.21603855|
|3974081|    2244| 0.2134837|
|3974081|    1774|0.20316048|
|3974508|    1380| 1.3829349|
|3974508|    1105|  1.187172|
|3974508|    1102| 1.1801692|
|3974508|    2224| 1.1721549|
|3974508|    2229|  1.149687|
|3974508|    1319|  1.133244|
|3974508|    1110| 1.1289086|
|3974508|    1310| 1.1223106|
|3974508|    1864|  1.114606|
|3974508|    1861| 1.0966117|
+-------+--------+----------+
only showing top 20 rows



### Writing the Output back to the Remote Datasource

In [36]:
new_table_name = 'RecommendationsResult'
userRecs1.coalesce(1).write \
   .format("jdbc") \
    .mode('overwrite') \
    .option("url", dataSource['URL']) \
    .option("dbtable", dataSet['schema']+"."+new_table_name) \
    .option("user", dataSource['user']) \
    .option("password", dataSource['password']) \
    .save()

In [37]:
from dsx_ml.ml import save


In [40]:
type(model)

pyspark.ml.recommendation.ALSModel

In [41]:
from pyspark.ml import Pipeline

In [42]:
pipeline = Pipeline(stages=[model])

In [43]:
model_alsWML = pipeline.fit(ratings)

In [46]:
save(name = 'PySparkRecommenderPipeline',
     model = model_alsWML,
     test_data = ratings,
     algorithm_type = 'Classification',
     source='PySparkRecommenderWithWorkflow.ipynb',
     description='Recommender using PySpark')

{'path': '/user-home/1003/DSX_Projects/Recommender-engine/models/PySparkRecommenderPipeline/2',
 'scoring_endpoint': 'https://dsxl-api/v3/project/score/Python35/spark-2.2/Recommender-engine/PySparkRecommenderPipeline/2'}