In [None]:
#!pip install pyspark
#!pip install --upgrade google-cloud-bigquery[pandas]
#!pip install pyspark[sql] # fast conversion from pandas to Spark SQL

### Importing the libraries and starting the Spark Session

In [None]:
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.context import SparkContext 
from pyspark.sql import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd

In [None]:
#!export ARROW_PRE_0_15_IPC_FORMAT=1
#!echo $ARROW_PRE_0_15_IPC_FORMAT

In [None]:
# Reuse the running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()

In [None]:
# Wrap the context in a SparkSession, the entry point for DataFrame work.
spark = SparkSession(sc)
# Turn on the Arrow option for this session.
# NOTE(review): presumably meant to speed up the pandas <-> Spark conversions done later — confirm.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [None]:
# Reuse the session that already exists instead of constructing a second one:
# a fresh SparkSession(sc) may not carry the Arrow option set on the first session
# (depends on the Spark version), whereas getOrCreate() hands back the live session.
spark = SparkSession.builder.getOrCreate()

### Add asset from remote connection 

In [None]:
# Placeholder; rebound further down to the query result.
final_stat = None

In [None]:
# Get data from GCP- BQ
from google.cloud import bigquery
import time
# Start of the wall-clock timer; the elapsed time is printed after the upload at the end.
t0 = time.time()

# Sample size; also embedded in the name of the output table.
porcentaje = 5
# Upper bound applied to id_table_dem in the query below.
# NOTE(review): 189857 is presumably the number of ids in 1% of the table — confirm.
limite = int(189857 * porcentaje)

def get_data_BQ(sql, client=None, job_config=None):
    """Run a query and return its result via ``to_dataframe()``.

    Parameters
    ----------
    sql : str
        Query text to execute.
    client : optional
        Object exposing ``query(sql, job_config=...)``, normally a
        ``bigquery.Client``. When omitted, a new ``bigquery.Client()`` is
        created for this call (the previous behavior).
    job_config : optional
        Forwarded unchanged to ``client.query``; lets callers pass query
        parameters instead of building the SQL text by concatenation.

    Returns
    -------
    The object produced by ``to_dataframe()`` on the query job.
    """
    if client is None:
        # Default path: build a client the same way the original code did.
        client = bigquery.Client()
    return client.query(sql, job_config=job_config).to_dataframe()
# Query text: user / family / frequency rows whose id_table_dem is at most `limite`.
# Author's notes: "310 588 606" (meaning not shown here); a 1% sample of the
# data ran on a local machine and weighed about 56 MB.
sql = (
    "SELECT USERID as ID_CTE, ID_FAM as ID_CLAS1, FREQUENCY as FREQUENCY\n"
    "FROM `rmf2gcp.RawData.Workflow_aggregado`\n"
    "WHERE id_table_dem <= {}"
).format(limite)
print(sql)

In [None]:
# Run the query; final_stat now holds whatever get_data_BQ returns for it.
final_stat = get_data_BQ(sql)

In [None]:
# Quick sanity check of the downloaded frame: column dtypes, then its shape.
print(final_stat.dtypes)
print(final_stat.shape)

In [None]:
# Turn the downloaded frame into a Spark DataFrame and preview five rows.
# Note: final_stat is rebound here, so re-running this cell without re-running the
# download cell hands a Spark DataFrame back to createDataFrame.
final_stat = spark.createDataFrame(final_stat)
final_stat.show(5)

In [None]:
# count() launches a Spark job; print the result so it is actually shown
# (a bare expression that is not the last line of a cell is silently discarded).
print(final_stat.count())
print(type(final_stat))

### Preparing data for the model

In [None]:
# Keep the user, item and rating columns and cache the result, since it is
# read again by the train/test split and by the model fit.
ratings = final_stat.select('ID_CTE', 'ID_CLAS1', 'FREQUENCY').cache()

### Splitting the dataset into train and test sets to measure the performance of the ALS model

In [None]:
# 80/20 split with a fixed seed so the split is repeatable from run to run
# (given the same input data and partitioning).
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

### Build the recommendation model using ALS on the training data


In [None]:
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=2, regParam=0.01,
          userCol="ID_CTE", itemCol="ID_CLAS1", ratingCol="FREQUENCY",
          coldStartStrategy="drop",
          implicitPrefs=True)

# Evaluation: fit a separate model on the training split only, so that the
# held-out rows in `test` were never seen during fitting. Fitting on the full
# `ratings` (which contains `test`) would make the RMSE below optimistic.
eval_model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = eval_model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="FREQUENCY",
                                predictionCol="prediction")

# NOTE(review): with implicitPrefs=True the predictions are preference scores rather
# than frequencies, so this RMSE is hard to interpret — consider a ranking metric.
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Model used for the recommendations further down: fitted on all ratings, as before.
model = als.fit(ratings)

In [None]:
# Preview the first five rows of the test-set predictions.
predictions.show(5)

## The parameters of the ALS model in the PySpark implementation are the following:

##### NumBlocks is the number of blocks the users and items will be partitioned into in order to parallelize computation.
##### rank is the number of latent factors in the model.
##### maxIter is the maximum number of iterations to run.
##### regParam specifies the regularization parameter in ALS.
##### implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data (defaults to false which means using explicit feedback).
##### alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations (defaults to 1.0)

###  Generate top 10 Item recommendations for each user



In [None]:
# Ask the model for 10 recommendations per user, print how many rows came
# back, then preview five of them.
userRecs = model.recommendForAllUsers(10)
print(userRecs.count())
userRecs.show(5)

In [None]:
# Bring the first two rows back to the driver to inspect their raw structure.
userRecs.take(2)

In [None]:
# Show only the recommendations column.
userRecs.select('recommendations').show()

In [None]:
# NOTE(review): stray scratch cell — it only echoes 1; looks safe to delete.
1

### Display the recommendations and get them in the correct format

In [None]:
from pyspark.sql.functions import explode
# One output row per element of the recommendations array; the array column is
# replaced in place by its individual elements.
userRecs1 = userRecs.withColumn("recommendations", explode("recommendations"))
userRecs1.show(4)

####  Breaking down each recommendation into separate columns

In [None]:
# Keep the user id and expand the recommendation struct into one column per field.
# Note: this rebinds userRecs1, so the cell cannot be re-run on its own — after the
# first run the 'recommendations' column no longer exists.
userRecs1= userRecs1.select('ID_CTE', 'recommendations.*')       

### Display the results

In [None]:
# Preview two rows of the flattened recommendations.
userRecs1.show(2) 

In [None]:
# Number of rows in the flattened recommendations.
userRecs1.count()

### Writing the Output back to the Remote Datasource

In [None]:
# Collect the flattened recommendations to the driver as a pandas DataFrame
# (final_stat is rebound once more here).
final_stat = userRecs1.toPandas()
# Release the cached ratings now that the model work is done. userRecs1 was never
# cached, so unpersisting it had no effect, while the ratings cache was never freed.
ratings.unpersist(True)

In [None]:
!pip install pandas_gbq

In [None]:
# Destination table name; the sample percentage is embedded in it.
table_id = 'Resultados.test_spark_0{}porciento_17_junio_2020'.format(porcentaje)
table_id

In [None]:
# Upload the recommendations to BigQuery, then print the seconds elapsed since t0.
# NOTE(review): no if_exists argument is passed, so presumably the upload fails when
# the table already exists (e.g. on a second run) — confirm.
final_stat.to_gbq(table_id, project_id='rmf2gcp')
t3 = time.time()
total = t3-t0
print(total)

In [None]:
#!mkdir test/

In [None]:
#final_stat.to_csv('test_spark_0'+str(porcentaje)+'porciento_17_junio_2020')

In [None]:
#!gsutil cp test_gcp_cluster_10_junio_2020.csv gs://resultadosrmf2/prueba_gcp_01porciento/test_local_10_junio_2020.csv

In [None]:
#!zip test_gcp_cluster_10_junio_2020.csv.zip test_gcp_cluster_10_junio_2020.csv

In [None]:
#!ls

In [None]:
#!gsutil cp test_gcp_cluster_10_junio_2020.csv.zip gs://resultadosrmf2/prueba_gcp_01porciento/test_local_10_junio_2020.csv.zip

In [None]:
##!rm -r test_modelos
#!mkdir test_modelos_gcp
#!chmod 777 test_modelos_gcp

In [None]:
#from pyspark.ml import Pipeline

In [None]:
#pipeline = Pipeline(stages=[model])

In [None]:
#model_alsWML = pipeline.fit(ratings)

In [None]:
#model_alsWML.save('/test_modelos_gcp/')

In [None]:
#!ls -la

In [None]:
#!gsutil cp test_gcp_cluster_10_junio_2020.csv.zip gs://resultadosrmf2/prueba_gcp_01porciento/test_local_10_junio_2020.csv.zip