In [1]:
#!pip install pyspark
#!pip install --upgrade google-cloud-bigquery[pandas]
#!pip install pyspark[sql] #PARSEO RAPIDO DE PANDAS A SPARK RDDSQL

### Importing the libraries and starting the Spark Session

In [2]:
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.context import SparkContext 
from pyspark.sql import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd

In [3]:
#!export ARROW_PRE_0_15_IPC_FORMAT=1
#!echo $ARROW_PRE_0_15_IPC_FORMAT

In [4]:
sc = SparkContext.getOrCreate()

In [5]:
spark = SparkSession(sc)
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [6]:
#from pyspark.context import 
spark = SparkSession(sc)

### Add asset from remote connection 

In [7]:
final_stat = None

In [8]:
# Get data from GCP- BQ
from google.cloud import bigquery
import time
t0 = time.time()

porcentaje = 5
limite = int(189857 * porcentaje)

def get_data_BQ(sql):
    client = bigquery.Client()
    df = client.query(sql).to_dataframe()
    return(df)
sql =  '''SELECT USERID as ID_CTE, ID_FAM as ID_CLAS1, FREQUENCY as FREQUENCY
FROM `rmf2gcp.RawData.Workflow_aggregado`
WHERE id_table_dem <= ''' + str(limite) #310 588 606 ''' # corre en mi local y pesa 56MB %1 del total de la muestra
print(sql)

SELECT USERID as ID_CTE, ID_FAM as ID_CLAS1, FREQUENCY as FREQUENCY
FROM `rmf2gcp.RawData.Workflow_aggregado`
WHERE id_table_dem <= 949285


In [None]:
final_stat = get_data_BQ(sql)

In [None]:
print(final_stat.dtypes)
print(final_stat.shape)

ID_CTE       int64
ID_CLAS1     int64
FREQUENCY    int64
dtype: object
(18297493, 3)


In [None]:
final_stat = spark.createDataFrame(final_stat)
final_stat.show(5)

  An error occurred while calling z:org.apache.spark.sql.api.python.PythonSQLUtils.readArrowStreamFromFile.
: java.lang.IllegalArgumentException
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessage(MessageSerializer.java:543)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.readNextBatch(ArrowConverters.scala:243)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.<init>(ArrowConverters.scala:229)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.getBatchesFromStream(ArrowConverters.scala:228)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.$anonfun$readArrowStreamFromFile$2(ArrowConverters.scala:216)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2543)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.readArrowStreamFromFile(ArrowConverters.scala:214)
	at org.apache.spark.sql.api.python.PythonSQLUtils$.readArrowStreamFromFile(PythonSQLUtils.scala

+-------+--------+---------+
| ID_CTE|ID_CLAS1|FREQUENCY|
+-------+--------+---------+
| 179537|  418276|        6|
| 330344|  102089|        6|
|3998222|  863047|        9|
|4484634|  102164|        6|
|4347906|  314064|        8|
+-------+--------+---------+
only showing top 5 rows



In [None]:
final_stat.count()
print(type(final_stat))

<class 'pyspark.sql.dataframe.DataFrame'>


### Preparing data for the model

In [None]:
ratings = (final_stat
    .select(
        'ID_CTE',
        'ID_CLAS1',
        'FREQUENCY',
    )
).cache()

### Spliting the data set to test and train for measuring the performance of the ALS Model

In [None]:
(training, test) = ratings.randomSplit([0.8, 0.2])

### Build the recommendation model using ALS on the training data


In [None]:
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=2, regParam=0.01, 
          userCol="ID_CTE", itemCol="ID_CLAS1", ratingCol="FREQUENCY",
          coldStartStrategy="drop",
          implicitPrefs=True)

model = als.fit(ratings)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="FREQUENCY",
                                predictionCol="prediction")

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.5660879622147785


In [None]:
predictions.show(5)

+-------+--------+---------+------------+
| ID_CTE|ID_CLAS1|FREQUENCY|  prediction|
+-------+--------+---------+------------+
| 182609|  212010|        1| 6.243836E-4|
|1170487|  212010|        1| 4.522957E-4|
|2916818|  212010|        1|4.1393048E-4|
|1784655|  212010|        1| 3.915355E-6|
| 340914|  212010|        1|1.5606423E-4|
+-------+--------+---------+------------+
only showing top 5 rows



## Parameters of ALS Model in PySpark realization are following:

##### NumBlocks is the number of blocks the users and items will be partitioned into in order to parallelize computation.
##### rank is the number of latent factors in the model.
##### maxIter is the maximum number of iterations to run.
##### regParam specifies the regularization parameter in ALS.
##### implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data (defaults to false which means using explicit feedback).
##### alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations (defaults to 1.0)

###  Generate top 10 Item recommendations for each user



In [None]:
userRecs = model.recommendForAllUsers(10)
print(userRecs.count())
userRecs.show(5)

949285
+------+--------------------+
|ID_CTE|     recommendations|
+------+--------------------+
| 10362|[[380283, 0.11308...|
| 11033|[[319059, 0.37997...|
| 11141|[[105068, 0.38442...|
| 12940|[[380283, 0.81013...|
| 13832|[[318009, 0.57493...|
+------+--------------------+
only showing top 5 rows



In [None]:
userRecs.take(2)

[Row(ID_CTE=10362, recommendations=[Row(ID_CLAS1=380283, rating=0.11308000981807709), Row(ID_CLAS1=670015, rating=0.09247514605522156), Row(ID_CLAS1=860049, rating=0.08445228636264801), Row(ID_CLAS1=314156, rating=0.0811886265873909), Row(ID_CLAS1=862009, rating=0.07975541800260544), Row(ID_CLAS1=224009, rating=0.07354674488306046), Row(ID_CLAS1=101028, rating=0.06908357888460159), Row(ID_CLAS1=701305, rating=0.0670555979013443), Row(ID_CLAS1=860048, rating=0.0660824403166771), Row(ID_CLAS1=315131, rating=0.06450476497411728)]),
 Row(ID_CTE=11033, recommendations=[Row(ID_CLAS1=319059, rating=0.3799772560596466), Row(ID_CLAS1=701305, rating=0.3051292896270752), Row(ID_CLAS1=314063, rating=0.27488651871681213), Row(ID_CLAS1=313152, rating=0.2649444341659546), Row(ID_CLAS1=318203, rating=0.2637958228588104), Row(ID_CLAS1=318073, rating=0.24786736071109772), Row(ID_CLAS1=380073, rating=0.24770447611808777), Row(ID_CLAS1=318009, rating=0.23988865315914154), Row(ID_CLAS1=317073, rating=0.220

In [None]:
userRecs[['recommendations']].show()

+--------------------+
|     recommendations|
+--------------------+
|[[380283, 0.11308...|
|[[319059, 0.37997...|
|[[105068, 0.38442...|
|[[380283, 0.81013...|
|[[318009, 0.57493...|
|[[860048, 0.41832...|
|[[318009, 0.43979...|
|[[224009, 0.27071...|
|[[224009, 0.54601...|
|[[313152, 1.26762...|
|[[313152, 0.60337...|
|[[860048, 0.71588...|
|[[862009, 0.16757...|
|[[102016, 0.20113...|
|[[860048, 0.39464...|
|[[229032, 0.65287...|
|[[313152, 0.64600...|
|[[106055, 0.26870...|
|[[319059, 0.95158...|
|[[224009, 0.68381...|
+--------------------+
only showing top 20 rows



In [None]:
1

1

### Display the recommendations and get them in the correct format

In [None]:
from pyspark.sql.functions import explode
userRecs1=userRecs.withColumn("recommendations", explode(userRecs.recommendations))
userRecs1.show(4)

+------+--------------------+
|ID_CTE|     recommendations|
+------+--------------------+
| 10362|[380283, 0.11308001]|
| 10362|[670015, 0.092475...|
| 10362|[860049, 0.08445229]|
| 10362|[314156, 0.08118863]|
+------+--------------------+
only showing top 4 rows



####  Breaking down reach recommendation to separate columns

In [None]:
userRecs1= userRecs1.select('ID_CTE', 'recommendations.*')       

### Display the results

In [None]:
userRecs1.show(2) 

+------+--------+-----------+
|ID_CTE|ID_CLAS1|     rating|
+------+--------+-----------+
| 10362|  380283| 0.11308001|
| 10362|  670015|0.092475146|
+------+--------+-----------+
only showing top 2 rows



In [None]:
userRecs1.count()

9492850

### Writing the Output back to the Remote Datasource

In [None]:
final_stat = userRecs1.toPandas()
userRecs1.unpersist(True)

DataFrame[ID_CTE: int, ID_CLAS1: int, rating: float]

In [None]:
#!pip install pandas_gbq



In [None]:
table_id = 'Resultados.test_spark_0'+str(porcentaje)+'porciento_17_junio_2020'
table_id

'Resultados.test_spark_05porciento_17_junio_2020'

In [None]:
final_stat.to_gbq(table_id, project_id='rmf2gcp')
t3 = time.time()
total = t3-t0
print(total)

1it [01:58, 118.90s/it]

2610.5999677181244





In [None]:
#!mkdir test/

In [None]:
#final_stat.to_csv('test_spark_0'+str(porcentaje)+'porciento_17_junio_2020')

In [None]:
#!gsutil cp test_gcp_cluster_10_junio_2020.csv gs://resultadosrmf2/prueba_gcp_01porciento/test_local_10_junio_2020.csv

In [None]:
#!zip test_gcp_cluster_10_junio_2020.csv.zip test_gcp_cluster_10_junio_2020.csv

In [None]:
#!ls

In [None]:
#!gsutil cp test_gcp_cluster_10_junio_2020.csv.zip gs://resultadosrmf2/prueba_gcp_01porciento/test_local_10_junio_2020.csv.zip

In [None]:
##!rm -r test_modelos
#!mkdir test_modelos_gcp
#!chmod 777 test_modelos_gcp

In [None]:
#from pyspark.ml import Pipeline

In [None]:
#pipeline = Pipeline(stages=[model])

In [None]:
#model_alsWML = pipeline.fit(ratings)

In [None]:
#model_alsWML.save('/test_modelos_gcp/')

In [None]:
#!ls -la

In [None]:
#!gsutil cp test_gcp_cluster_10_junio_2020.csv.zip gs://resultadosrmf2/prueba_gcp_01porciento/test_local_10_junio_2020.csv.zip