# Workflow demonstration with a recommender engine on a sampled dataset from Transactions.csv using ALS Model
### This is the notebook for deployment

In [None]:
# LIBRARIES
# Install PySpark from the notebook via an IPython shell escape (`!`); this
# line is notebook syntax, not plain Python.
!pip install pyspark

### Importing the libraries and starting the Spark Session

In [None]:
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.context import SparkContext 
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Reuse the SparkContext that is already running, or start one if none exists.
sc = SparkContext.getOrCreate()

In [None]:
# Build the SparkSession on top of the SparkContext created above.
spark = SparkSession(sc)

### Add asset from remote connection 

In [None]:
import dsx_core_utils, requests, os, io
from pyspark.sql import SparkSession

# `final_stat` is the input DataFrame for the rest of the notebook.
# NOTE(review): the remote-data-set loader below is commented out, so
# `final_stat` stays None and the later cell that calls `final_stat.select(...)`
# will fail until a DataFrame is assigned here. `dataSet` and `dataSource` are
# likewise left undefined while these lines stay commented out.
final_stat = None
#dataSet = dsx_core_utils.get_remote_data_set_info('TRANSACTIONS')
#dataSource = dsx_core_utils.get_data_source_info(dataSet['datasource'])
#sparkSession = SparkSession(sc).builder.getOrCreate()
## Load JDBC data to Spark dataframe
#dbTableOrQuery = '"' + (dataSet['schema'] + '"."' if(len(dataSet['schema'].strip()) != 0) else '') + dataSet['table'] + '"'
#if (dataSet['query']):
 #   dbTableOrQuery = "(" + dataSet['query'] + ") TBL"
#final_stat = sparkSession.read.format("jdbc").option("url", dataSource['URL']).option("dbtable", dbTableOrQuery).option("user",dataSource['user']).option("password",dataSource['password']).load()
#final_stat.show(5)

###### Let's try it locally with the BigQuery table dec16-nov17_Jalisco.csv

In [None]:
# Get data from GCP BigQuery.
# The bare string below only records the query text: it is not assigned to a
# name or passed to any call in this cell.
'''SELECT * 
FROM `rmf2gcp.RawData.Workflow_aggregado`
limit 31058860#310 588 606 ''' # runs on my local machine and weighs ... (size was left unstated in the original note)

### Preparing data for the model

In [None]:
# Project the input down to the user / item / rating columns that the ALS cell
# reads, and cache the result because several later cells reuse it.
ratings = final_stat.select('ID_CTE', 'ID_CLAS1', 'FREQUENCY').cache()

In [None]:
# Show each column's name together with its Spark data type.
ratings.dtypes

In [None]:
# Build a DataFrame capped at 10,000 rows for display in the notebook.
# NOTE(review): the result is not assigned, so `ratings` itself is unchanged —
# confirm whether `ratings = ratings.limit(10000)` was intended for sampling.
ratings.limit(10000)

### Splitting the data set into training and test sets to measure the performance of the ALS model

In [None]:
# Hold out roughly 20% of the ratings for evaluation and keep 80% for training.
# A fixed seed makes the random split repeatable for the same input, so the
# metric reported below can be compared between runs.
training, test = ratings.randomSplit([0.8, 0.2], seed=42)



### Build the recommendation model using ALS on the training data


In [None]:
# Cold-start strategy 'drop' discards prediction rows for users/items the
# fitted model has no factors for, so the evaluation metric does not become NaN.
als = ALS(
    maxIter=2,
    regParam=0.01,
    userCol="ID_CTE",
    itemCol="ID_CLAS1",
    ratingCol="FREQUENCY",
    coldStartStrategy="drop",
    implicitPrefs=True,
)

# Evaluate with a model fitted on the training split only. Fitting the evaluated
# model on the full `ratings` set would let it see the held-out `test` rows and
# make the reported error optimistically biased.
eval_model = als.fit(training)

# Compute the RMSE on the held-out test data.
# NOTE(review): with implicitPrefs=True the predictions are preference scores
# rather than predicted FREQUENCY values, so this RMSE is only a rough sanity
# check — consider a ranking metric instead.
predictions = eval_model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="FREQUENCY",
    predictionCol="prediction",
)

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# The model used by the following cells (recommendations, pipeline, save) is
# still fitted on all available ratings, as before.
model = als.fit(ratings)

## The parameters of the ALS model in the PySpark implementation are as follows:

##### numBlocks is the number of blocks the users and items will be partitioned into in order to parallelize computation.
##### rank is the number of latent factors in the model.
##### maxIter is the maximum number of iterations to run.
##### regParam specifies the regularization parameter in ALS.
##### implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data (defaults to false which means using explicit feedback).
##### alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations (defaults to 1.0)

###  Generate top 10 Item recommendations for each user



In [None]:
# Ask the fitted model for the 10 highest-scoring items for every user, then
# count the resulting rows (the count is displayed as the cell output).
userRecs = model.recommendForAllUsers(10)
userRecs.count()



In [None]:
# Fetch the first 10 recommendation rows for a quick look at their structure.
userRecs.take(10)

### Display the recommendations and get them in the correct format

In [None]:
from pyspark.sql.functions import explode

# Replace each user's array of recommendations with one row per recommendation.
userRecs1 = userRecs.withColumn("recommendations", explode("recommendations"))
userRecs1.show()

####  Breaking down each recommendation into separate columns

In [None]:
import select as s


In [None]:

# Expand the fields of the `recommendations` struct into top-level columns
# alongside ID_CTE.
userRecs1 = userRecs1.select('ID_CTE', 'recommendations.*')
   

### Display the results

In [None]:
# Print the first rows of the flattened recommendations table.
userRecs1.show() 

### Writing the Output back to the Remote Datasource

In [None]:
# Write the flattened recommendations to the remote data source over JDBC.
#
# The connection metadata is looked up here because the loader cell that would
# define `dataSet` / `dataSource` is commented out; without this lookup the
# write fails with a NameError. The calls mirror that commented-out loader.
dataSet = dsx_core_utils.get_remote_data_set_info('TRANSACTIONS')
dataSource = dsx_core_utils.get_data_source_info(dataSet['datasource'])

new_table_name = 'RecommendationsResult'

# Prefix the schema only when one is configured, matching the loader's
# empty-schema check; otherwise the target would be ".RecommendationsResult".
if dataSet['schema'].strip():
    target_table = dataSet['schema'] + "." + new_table_name
else:
    target_table = new_table_name

# coalesce(1) reduces the DataFrame to a single partition before writing;
# mode('overwrite') replaces any existing contents of the target table.
(userRecs1.coalesce(1).write
    .format("jdbc")
    .mode('overwrite')
    .option("url", dataSource['URL'])
    .option("dbtable", target_table)
    .option("user", dataSource['user'])
    .option("password", dataSource['password'])
    .save())

In [None]:
from dsx_ml.ml import save


In [None]:
# Display the class of the fitted model object.
type(model)

In [None]:
from pyspark.ml import Pipeline

In [None]:
# Wrap the already-fitted ALS model as the single stage of a Pipeline.
pipeline = Pipeline(stages=[model])

In [None]:
# Fit the one-stage pipeline on the ratings data; the resulting object is what
# gets passed to `save` in the final cell.
model_alsWML = pipeline.fit(ratings)

In [None]:
# Store the fitted pipeline through dsx_ml's `save`, tagged with the source
# notebook name and a short description.
# NOTE(review): algorithm_type is 'Classification' although the model is an ALS
# recommender — confirm this is the intended value for `save`.
save(
    name='PySparkRecommenderPipeline',
    model=model_alsWML,
    test_data=ratings,
    algorithm_type='Classification',
    source='PySparkRecommenderWithWorkflow.ipynb',
    description='Recommender using PySpark',
)