In [1]:
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator


spark = SparkSession(sc)

In [2]:
#final_stat = spark.read.csv('/user-home/libraries/raw_data/datasets/Base_rec.csv', header=True, sep='|')
final_stat = spark.read.csv('/user-home/libraries/Sampled_data/datasets/dec16-nov17_Jalisco.csv', header=True, sep=',')  #test for Jalisco for comparison (@DA)

final_stat.show(5)

+-------+-------+----+----------+
| ID_CTE|ID_FAM1|FREQ|YEAR-MONTH|
+-------+-------+----+----------+
|1369361|1106234|   1|2017-10-01|
|1421660|1862009|   1|2017-10-01|
|1453179|1102164|   1|2017-10-01|
|1455817|1854022|   1|2017-10-01|
|1459605|1310121|   1|2017-10-01|
+-------+-------+----+----------+
only showing top 5 rows



In [3]:
ratings = (final_stat
    .select(
        'ID_CTE',
        'ID_FAM1',
        'FREQ',
    )
).cache()

In [4]:
from pyspark.sql.functions import expr

ratings= ratings.withColumn("ID_CTE", expr("CAST(ID_CTE AS INTEGER)"))



In [5]:
ratings=ratings.withColumn("ID_FAM1", expr("CAST(ID_FAM1 AS INTEGER)"))

In [6]:
ratings=ratings.withColumn("FREQ", expr("CAST(FREQ AS INTEGER)"))

In [7]:
ratings.dtypes

[('ID_CTE', 'int'), ('ID_FAM1', 'int'), ('FREQ', 'int')]

In [8]:
(training, test) = ratings.randomSplit([0.8, 0.2])



In [9]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=10, regParam=0.01,rank=100, 
          userCol="ID_CTE", itemCol="ID_FAM1", ratingCol="FREQ",
          #coldStartStrategy="drop",
          implicitPrefs=False)

model = als.fit(ratings)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="FREQ",
                                predictionCol="prediction")

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.00728276362509358


In [10]:
%%time
from pyspark.ml.recommendation import ALS, ALSModel



CPU times: user 6 µs, sys: 11 µs, total: 17 µs
Wall time: 21.7 µs


In [11]:
# Generate top 10 Item recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.count()

1902

In [12]:
from pyspark.sql.functions import explode
userRecs1=userRecs.withColumn("recommendations", explode(userRecs.recommendations))
userRecs1.show()

+--------+--------------------+
|  ID_CTE|     recommendations|
+--------+--------------------+
|40354130| [1282002,0.9973844]|
|40354130| [1401002,0.9152835]|
|40354130|[1594184,0.90924096]|
|40354130| [1105007,0.5333132]|
|40354130| [1775262,0.4785588]|
|40354130|[1291062,0.46936402]|
|40354130|[1863047,0.44844317]|
|40354130| [1594185,0.4394519]|
|40354130|[1290059,0.43123487]|
|40354130|[1432043,0.41803825]|
|30396781| [2224033,1.0009017]|
|30396781|[2295019,0.99868584]|
|30396781| [2229035,0.9914556]|
|30396781| [2299099,0.9599417]|
|30396781| [2229032,0.8849272]|
|30396781| [2224041,0.7363132]|
|30396781| [2224073,0.7150184]|
|30396781| [2229003,0.7150184]|
|30396781|[2295027,0.65394366]|
|30396781| [2295026,0.6280386]|
+--------+--------------------+
only showing top 20 rows



In [14]:
import select as s
#userRecs1=
userRecs1= userRecs1 \
  .select('ID_CTE', 'recommendations.*')

In [18]:
userRecs1.coalesce(1).write.format("csv").option("header", "true").save('/user-home/libraries/Sampled_data/datasets/JaliscoResult.csv')

