# Workflow demonstration with a recommender engine on a sampled dataset from Transactions.csv using ALS Model
### This is the notebook for deployment

### Importing the libraries and starting the Spark Session

In [1]:
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.context import SparkContext 
from pyspark.sql import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd

In [2]:
#!export ARROW_PRE_0_15_IPC_FORMAT=1
!echo $ARROW_PRE_0_15_IPC_FORMAT




In [3]:
sc = SparkContext.getOrCreate()

In [4]:
spark = SparkSession(sc)
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [5]:
#from pyspark.context import 
spark = SparkSession(sc)

### Add asset from remote connection 

In [6]:
final_stat = None

In [7]:
# Get data from GCP- BQ
from google.cloud import bigquery
def get_data_BQ(sql):
    client = bigquery.Client()
    df = client.query(sql).to_dataframe()
    return(df)
sql =  '''SELECT ID_CTE as ID_CTE, ID_FAM as ID_CLAS1, FREQUENCY as FREQUENCY
FROM `rmf2gcp.RawData.Workflow_aggregado`
limit 3105886#310 588 606 ''' # corre en mi local y pesa 56MB %1 del total de la muestra

In [8]:
final_stat = get_data_BQ(sql)

In [9]:
print(final_stat.dtypes)
print(final_stat.shape)

ID_CTE       int64
ID_CLAS1     int64
FREQUENCY    int64
dtype: object
(3105886, 3)


In [10]:
final_stat = spark.createDataFrame(final_stat)
final_stat.show(5)

  An error occurred while calling z:org.apache.spark.sql.api.python.PythonSQLUtils.readArrowStreamFromFile.
: java.lang.IllegalArgumentException
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessage(MessageSerializer.java:543)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.readNextBatch(ArrowConverters.scala:243)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.<init>(ArrowConverters.scala:229)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.getBatchesFromStream(ArrowConverters.scala:228)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.$anonfun$readArrowStreamFromFile$2(ArrowConverters.scala:216)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2543)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.readArrowStreamFromFile(ArrowConverters.scala:214)
	at org.apache.spark.sql.api.python.PythonSQLUtils$.readArrowStreamFromFile(PythonSQLUtils.scala

+--------+--------+---------+
|  ID_CTE|ID_CLAS1|FREQUENCY|
+--------+--------+---------+
| 8913174|  856047|        6|
|12064659|  380284|        6|
|  879771|  319064|        8|
| 7522981|  224025|        6|
|14727490|  224057|        7|
+--------+--------+---------+
only showing top 5 rows



In [11]:
final_stat.count()
print(type(final_stat))

<class 'pyspark.sql.dataframe.DataFrame'>


### Preparing data for the model

In [12]:
ratings = (final_stat
    .select(
        'ID_CTE',
        'ID_CLAS1',
        'FREQUENCY',
    )
).cache()

### Spliting the data set to test and train for measuring the performance of the ALS Model

In [13]:
(training, test) = ratings.randomSplit([0.8, 0.2])

### Build the recommendation model using ALS on the training data


In [14]:
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=2, regParam=0.01, 
          userCol="ID_CTE", itemCol="ID_CLAS1", ratingCol="FREQUENCY",
          coldStartStrategy="drop",
          implicitPrefs=True)

model = als.fit(ratings)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="FREQUENCY",
                                predictionCol="prediction")

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 2.1386147468410406


In [15]:
predictions.show(5)

+--------+--------+---------+------------+
|  ID_CTE|ID_CLAS1|FREQUENCY|  prediction|
+--------+--------+---------+------------+
| 1397905|  212010|        1|2.4343958E-9|
|35269436|  212010|        1|2.4343958E-9|
|40116259|  212010|        1|2.4343958E-9|
|13853352|  212010|        1|2.4343958E-9|
|37916553|  212010|        1|2.4343958E-9|
+--------+--------+---------+------------+
only showing top 5 rows



## Parameters of ALS Model in PySpark realization are following:

##### NumBlocks is the number of blocks the users and items will be partitioned into in order to parallelize computation.
##### rank is the number of latent factors in the model.
##### maxIter is the maximum number of iterations to run.
##### regParam specifies the regularization parameter in ALS.
##### implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data (defaults to false which means using explicit feedback).
##### alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations (defaults to 1.0)

###  Generate top 10 Item recommendations for each user



In [16]:
userRecs = model.recommendForAllUsers(10)
print(userRecs.count())
userRecs.show(5)

2649377
+------+--------------------+
|ID_CTE|     recommendations|
+------+--------------------+
| 13832|[[105001, 0.01867...|
| 18654|[[224025, 4.90615...|
| 22097|[[313096, 0.07093...|
| 36525|[[323073, 0.18571...|
| 41751|[[314063, 0.01039...|
+------+--------------------+
only showing top 5 rows



In [17]:
userRecs.take(2)

[Row(ID_CTE=13832, recommendations=[Row(ID_CLAS1=105001, rating=0.018674740567803383), Row(ID_CLAS1=106061, rating=0.017564326524734497), Row(ID_CLAS1=318009, rating=0.01661280170083046), Row(ID_CLAS1=313155, rating=0.016564249992370605), Row(ID_CLAS1=102164, rating=0.015708401799201965), Row(ID_CLAS1=105071, rating=0.013289724476635456), Row(ID_CLAS1=413214, rating=0.012968757189810276), Row(ID_CLAS1=224017, rating=0.01178194023668766), Row(ID_CLAS1=856230, rating=0.010681500658392906), Row(ID_CLAS1=101027, rating=0.010500011965632439)]),
 Row(ID_CTE=18654, recommendations=[Row(ID_CLAS1=224025, rating=0.0004906158428639174), Row(ID_CLAS1=224065, rating=0.00044970453018322587), Row(ID_CLAS1=105007, rating=0.0003991244302596897), Row(ID_CLAS1=381009, rating=0.0003856293042190373), Row(ID_CLAS1=105074, rating=0.0003786047163885087), Row(ID_CLAS1=701305, rating=0.00036178837763145566), Row(ID_CLAS1=314156, rating=0.0003468487993814051), Row(ID_CLAS1=106055, rating=0.00033687823452055454),

In [18]:
userRecs[['recommendations']].show()

+--------------------+
|     recommendations|
+--------------------+
|[[105001, 0.01867...|
|[[224025, 4.90615...|
|[[313096, 0.07093...|
|[[323073, 0.18571...|
|[[314063, 0.01039...|
|[[701305, 0.00194...|
|[[314063, 0.62439...|
|[[313096, 0.00984...|
|[[290059, 0.03170...|
|[[314063, 0.32990...|
|[[106059, 0.11584...|
|[[319064, 0.30891...|
|[[313155, 0.00674...|
|[[314129, 1.02811...|
|[[106001, 0.08439...|
|[[313155, 0.00290...|
|[[106059, 0.06469...|
|[[314129, 0.00993...|
|[[314129, 0.08076...|
|[[291059, 0.00213...|
+--------------------+
only showing top 20 rows



In [19]:
1

1

### Display the recommendations and get them in the correct format

In [20]:
from pyspark.sql.functions import explode
userRecs1=userRecs.withColumn("recommendations", explode(userRecs.recommendations))
userRecs1.show(4)

+------+--------------------+
|ID_CTE|     recommendations|
+------+--------------------+
| 13832|[105001, 0.01867474]|
| 13832|[106061, 0.017564...|
| 13832|[318009, 0.016612...|
| 13832|[313155, 0.01656425]|
+------+--------------------+
only showing top 4 rows



####  Breaking down reach recommendation to separate columns

In [21]:
userRecs1= userRecs1.select('ID_CTE', 'recommendations.*')       

### Display the results

In [22]:
userRecs1.show(2) 

+------+--------+-----------+
|ID_CTE|ID_CLAS1|     rating|
+------+--------+-----------+
| 13832|  105001| 0.01867474|
| 13832|  106061|0.017564327|
+------+--------+-----------+
only showing top 2 rows



In [None]:
userRecs1.count()

26493770

### Writing the Output back to the Remote Datasource

In [None]:
final_stat = userRecs1.toPandas()
userRecs1.unpersist(True)

DataFrame[ID_CTE: int, ID_CLAS1: int, rating: float]

In [26]:
!pip install pandas_gbq

Collecting pandas_gbq
  Downloading https://files.pythonhosted.org/packages/53/f3/3100eb9332c62c5e5ac486d5421965da23a0b92012825bfbb372b7f8d508/pandas_gbq-0.13.2-py3-none-any.whl
Collecting pydata-google-auth (from pandas_gbq)
  Downloading https://files.pythonhosted.org/packages/0b/dc/be321b769b761ec2640f1e4561c2953dd6a4a3efe6b10b5781774c71177a/pydata_google_auth-1.1.0-py2.py3-none-any.whl
Installing collected packages: pydata-google-auth, pandas-gbq
Successfully installed pandas-gbq-0.13.2 pydata-google-auth-1.1.0


In [32]:
table_id = 'Resultados.test_gcp_cluster_10_junio_2020'

In [33]:
final_stat.to_gbq(table_id, project_id='rmf2gcp')

1it [05:46, 346.94s/it]


In [51]:
!mkdir test/

In [52]:
final_stat.to_csv('test_gcp_cluster_10_junio_2020.csv')

In [55]:
!gsutil cp test_gcp_cluster_10_junio_2020.csv gs://resultadosrmf2/prueba_gcp_01porciento/test_local_10_junio_2020.csv

Copying file://test_gcp_cluster_10_junio_2020.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

- [1 files][919.4 MiB/919.4 MiB]   63.7 MiB/s                                   
Operation completed over 1 objects/919.4 MiB.                                    


In [None]:
!zip test_gcp_cluster_10_junio_2020.csv.zip test_gcp_cluster_10_junio_2020.csv

  adding: test_gcp_cluster_10_junio_2020.csv (deflated 70%)


In [None]:
!ls

bin		root
boot		run
copyright	sbin
dev		snap
etc		sparkmonitor_kernelextension.log
hadoop		srv
home		sys
initrd.img	test
initrd.img.old	test_gcp_cluster_10_junio_2020.csv
lib		test_gcp_cluster_10_junio_2020.csv.zip
lib64		tmp
lost+found	usr
media		var
mnt		vmlinuz
opt		vmlinuz.old
proc


In [58]:
1

1

In [59]:
!gsutil cp test_gcp_cluster_10_junio_2020.csv.zip gs://resultadosrmf2/prueba_gcp_01porciento/test_local_10_junio_2020.csv.zip

Copying file://test_gcp_cluster_10_junio_2020.csv.zip [Content-Type=application/zip]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1 files][273.1 MiB/273.1 MiB]                                                
Operation completed over 1 objects/273.1 MiB.                                    


In [105]:
#!rm -r test_modelos
!mkdir test_modelos_gcp
!chmod 777 test_modelos_gcp

In [106]:
from pyspark.ml import Pipeline

In [107]:
pipeline = Pipeline(stages=[model])

In [108]:
model_alsWML = pipeline.fit(ratings)

In [109]:
model_alsWML.save('/test_modelos_gcp/')

In [110]:
!ls -la

total 1221244
drwxr-xr-x  29 root root        4096 Jun 10 12:03 .
drwxr-xr-x  29 root root        4096 Jun 10 12:03 ..
drwx------   3 root root        4096 Jun 10 09:48 .config
drwxr-xr-x   2 root root        4096 May 28 05:01 bin
drwxr-xr-x   4 root root        4096 May 28 04:59 boot
-rw-r--r--   1 root root         646 Sep 10  2019 copyright
drwxr-xr-x  16 root root        3880 Jun 10 09:22 dev
drwxr-xr-x 120 root root       12288 Jun 10 09:23 etc
drwxr-xr-x   2 root root        4096 Jun 10 11:57 foo
drwxrwxr-x   7 root hadoop      4096 Jun 10 09:22 hadoop
drwxr-xr-x   3 root root        4096 Jun 10 09:22 home
lrwxrwxrwx   1 root root          30 May 21 17:47 initrd.img -> boot/initrd.img-5.3.0-1020-gcp
lrwxrwxrwx   1 root root          30 May 21 17:47 initrd.img.old -> boot/initrd.img-5.3.0-1020-gcp
drwxr-xr-x  22 root root        4096 May 28 05:07 lib
drwxr-xr-x   2 root root        4096 May 21 17:39 lib64
drwx------   2 root root       16384 May 21 17:45 lost+found
drwxr-xr-x   2 

In [None]:
!gsutil cp test_gcp_cluster_10_junio_2020.csv.zip gs://resultadosrmf2/prueba_gcp_01porciento/test_local_10_junio_2020.csv.zip