In [1]:
#!pip install pyspark
#!pip install --upgrade google-cloud-bigquery[pandas]
#!pip install pyspark[sql] #PARSEO RAPIDO DE PANDAS A SPARK RDDSQL

### Importing the libraries and starting the Spark Session

In [2]:
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.context import SparkContext 
from pyspark.sql import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd

In [3]:
#!export ARROW_PRE_0_15_IPC_FORMAT=1
#!echo $ARROW_PRE_0_15_IPC_FORMAT

In [4]:
sc = SparkContext.getOrCreate()

In [5]:
spark = SparkSession(sc)
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [6]:
#from pyspark.context import 
spark = SparkSession(sc)

### Add asset from remote connection 

In [7]:
final_stat = None

In [8]:
# Get data from GCP- BQ
from google.cloud import bigquery
import time
t0 = time.time()

porcentaje = 1
limite = int(189857 * porcentaje)

def get_data_BQ(sql):
    client = bigquery.Client()
    df = client.query(sql).to_dataframe()
    return(df)
sql =  '''SELECT USERID as ID_CTE, ID_FAM as ID_CLAS1, FREQUENCY as FREQUENCY
FROM `rmf2gcp.RawData.Workflow_aggregado`
WHERE id_table_dem <= ''' + str(limite) #310 588 606 ''' # corre en mi local y pesa 56MB %1 del total de la muestra
print(sql)

SELECT USERID as ID_CTE, ID_FAM as ID_CLAS1, FREQUENCY as FREQUENCY
FROM `rmf2gcp.RawData.Workflow_aggregado`
WHERE id_table_dem <= 189857


In [9]:
final_stat = get_data_BQ(sql)

In [10]:
print(final_stat.dtypes)
print(final_stat.shape)

ID_CTE       int64
ID_CLAS1     int64
FREQUENCY    int64
dtype: object
(3817915, 3)


In [11]:
final_stat = spark.createDataFrame(final_stat)
final_stat.show(5)

  An error occurred while calling z:org.apache.spark.sql.api.python.PythonSQLUtils.readArrowStreamFromFile.
: java.lang.IllegalArgumentException
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessage(MessageSerializer.java:543)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.readNextBatch(ArrowConverters.scala:243)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.<init>(ArrowConverters.scala:229)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.getBatchesFromStream(ArrowConverters.scala:228)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.$anonfun$readArrowStreamFromFile$2(ArrowConverters.scala:216)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2543)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.readArrowStreamFromFile(ArrowConverters.scala:214)
	at org.apache.spark.sql.api.python.PythonSQLUtils$.readArrowStreamFromFile(PythonSQLUtils.scala

+------+--------+---------+
|ID_CTE|ID_CLAS1|FREQUENCY|
+------+--------+---------+
|945723|  314156|        7|
|550294|  319062|        6|
|262907|  314064|        6|
|681339|  224009|        6|
|191358|  313152|        7|
+------+--------+---------+
only showing top 5 rows



In [12]:
final_stat.count()
print(type(final_stat))

<class 'pyspark.sql.dataframe.DataFrame'>


### Preparing data for the model

In [13]:
ratings = (final_stat
    .select(
        'ID_CTE',
        'ID_CLAS1',
        'FREQUENCY',
    )
).cache()

### Spliting the data set to test and train for measuring the performance of the ALS Model

In [14]:
(training, test) = ratings.randomSplit([0.8, 0.2])

### Build the recommendation model using ALS on the training data


In [15]:
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=2, regParam=0.01, 
          userCol="ID_CTE", itemCol="ID_CLAS1", ratingCol="FREQUENCY",
          coldStartStrategy="drop",
          implicitPrefs=True)

model = als.fit(ratings)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="FREQUENCY",
                                predictionCol="prediction")

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.5968087153882395


In [16]:
predictions.show(5)

+------+--------+---------+------------+
|ID_CTE|ID_CLAS1|FREQUENCY|  prediction|
+------+--------+---------+------------+
|408865|  212010|        1| 3.639931E-4|
|144771|  432399|        1| 0.002300021|
|652912|  432399|        1| 0.002555012|
|343446|  432399|        1|  0.00508264|
|837641|  432399|        2|0.0029049083|
+------+--------+---------+------------+
only showing top 5 rows



## Parameters of ALS Model in PySpark realization are following:

##### NumBlocks is the number of blocks the users and items will be partitioned into in order to parallelize computation.
##### rank is the number of latent factors in the model.
##### maxIter is the maximum number of iterations to run.
##### regParam specifies the regularization parameter in ALS.
##### implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data (defaults to false which means using explicit feedback).
##### alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations (defaults to 1.0)

###  Generate top 10 Item recommendations for each user



In [17]:
userRecs = model.recommendForAllUsers(10)
print(userRecs.count())
userRecs.show(5)

189857
+------+--------------------+
|ID_CTE|     recommendations|
+------+--------------------+
| 10362|[[318073, 0.09074...|
| 11033|[[701305, 0.65940...|
| 11141|[[102016, 0.27780...|
| 12940|[[313152, 0.74259...|
| 13832|[[318009, 0.51142...|
+------+--------------------+
only showing top 5 rows



In [18]:
userRecs.take(2)

[Row(ID_CTE=10362, recommendations=[Row(ID_CLAS1=318073, rating=0.09074781835079193), Row(ID_CLAS1=224009, rating=0.08243425190448761), Row(ID_CLAS1=862009, rating=0.07791068404912949), Row(ID_CLAS1=318203, rating=0.06855244934558868), Row(ID_CLAS1=229032, rating=0.05686267465353012), Row(ID_CLAS1=106003, rating=0.05195869132876396), Row(ID_CLAS1=106001, rating=0.05104609951376915), Row(ID_CLAS1=701305, rating=0.04754101485013962), Row(ID_CLAS1=229011, rating=0.04433450847864151), Row(ID_CLAS1=101028, rating=0.04307369515299797)]),
 Row(ID_CTE=11033, recommendations=[Row(ID_CLAS1=701305, rating=0.6594046950340271), Row(ID_CLAS1=862009, rating=0.5082660913467407), Row(ID_CLAS1=381009, rating=0.4194330871105194), Row(ID_CLAS1=423132, rating=0.34522974491119385), Row(ID_CLAS1=318009, rating=0.3161504566669464), Row(ID_CLAS1=319059, rating=0.31468671560287476), Row(ID_CLAS1=380073, rating=0.30177468061447144), Row(ID_CLAS1=380283, rating=0.30172455310821533), Row(ID_CLAS1=290059, rating=0.

In [19]:
userRecs[['recommendations']].show()

+--------------------+
|     recommendations|
+--------------------+
|[[318073, 0.09074...|
|[[701305, 0.65940...|
|[[102016, 0.27780...|
|[[313152, 0.74259...|
|[[318009, 0.51142...|
|[[862009, 0.54229...|
|[[701305, 0.64938...|
|[[102011, 0.25774...|
|[[295019, 0.55509...|
|[[701305, 1.21820...|
|[[314156, 0.54998...|
|[[318073, 0.71593...|
|[[106010, 0.08245...|
|[[102016, 0.24529...|
|[[862009, 0.46045...|
|[[224009, 0.67586...|
|[[314063, 0.59726...|
|[[106055, 0.22423...|
|[[862009, 1.05049...|
|[[862009, 0.63242...|
+--------------------+
only showing top 20 rows



In [20]:
1

1

### Display the recommendations and get them in the correct format

In [21]:
from pyspark.sql.functions import explode
userRecs1=userRecs.withColumn("recommendations", explode(userRecs.recommendations))
userRecs1.show(4)

+------+--------------------+
|ID_CTE|     recommendations|
+------+--------------------+
| 10362|[318073, 0.09074782]|
| 10362|[224009, 0.08243425]|
| 10362|[862009, 0.077910...|
| 10362|[318203, 0.06855245]|
+------+--------------------+
only showing top 4 rows



####  Breaking down reach recommendation to separate columns

In [22]:
userRecs1= userRecs1.select('ID_CTE', 'recommendations.*')       

### Display the results

In [23]:
userRecs1.show(2) 

+------+--------+----------+
|ID_CTE|ID_CLAS1|    rating|
+------+--------+----------+
| 10362|  318073|0.09074782|
| 10362|  224009|0.08243425|
+------+--------+----------+
only showing top 2 rows



In [24]:
userRecs1.count()

1898570

### Writing the Output back to the Remote Datasource

In [25]:
final_stat = userRecs1.toPandas()
userRecs1.unpersist(True)

DataFrame[ID_CTE: int, ID_CLAS1: int, rating: float]

In [26]:
!pip install pandas_gbq

Collecting pandas_gbq
  Downloading https://files.pythonhosted.org/packages/53/f3/3100eb9332c62c5e5ac486d5421965da23a0b92012825bfbb372b7f8d508/pandas_gbq-0.13.2-py3-none-any.whl
Collecting pydata-google-auth (from pandas_gbq)
  Downloading https://files.pythonhosted.org/packages/0b/dc/be321b769b761ec2640f1e4561c2953dd6a4a3efe6b10b5781774c71177a/pydata_google_auth-1.1.0-py2.py3-none-any.whl
Installing collected packages: pydata-google-auth, pandas-gbq
Successfully installed pandas-gbq-0.13.2 pydata-google-auth-1.1.0


In [27]:
table_id = 'Resultados.test_spark_0'+str(porcentaje)+'porciento_17_junio_2020'
table_id

'Resultados.test_spark_01porciento_17_junio_2020'

In [28]:
final_stat.to_gbq(table_id, project_id='rmf2gcp')
t3 = time.time()
total = t3-t0
print(total)

1it [00:31, 31.58s/it]

424.23178482055664





In [29]:
#!mkdir test/

In [30]:
#final_stat.to_csv('test_spark_0'+str(porcentaje)+'porciento_17_junio_2020')

In [31]:
#!gsutil cp test_gcp_cluster_10_junio_2020.csv gs://resultadosrmf2/prueba_gcp_01porciento/test_local_10_junio_2020.csv

In [32]:
#!zip test_gcp_cluster_10_junio_2020.csv.zip test_gcp_cluster_10_junio_2020.csv

In [33]:
#!ls

In [34]:
#!gsutil cp test_gcp_cluster_10_junio_2020.csv.zip gs://resultadosrmf2/prueba_gcp_01porciento/test_local_10_junio_2020.csv.zip

In [35]:
##!rm -r test_modelos
#!mkdir test_modelos_gcp
#!chmod 777 test_modelos_gcp

In [36]:
#from pyspark.ml import Pipeline

In [37]:
#pipeline = Pipeline(stages=[model])

In [38]:
#model_alsWML = pipeline.fit(ratings)

In [39]:
#model_alsWML.save('/test_modelos_gcp/')

In [40]:
#!ls -la

In [41]:
#!gsutil cp test_gcp_cluster_10_junio_2020.csv.zip gs://resultadosrmf2/prueba_gcp_01porciento/test_local_10_junio_2020.csv.zip