In [1]:
import pyspark
from pyspark.sql.types import *
import pyspark.ml.recommendation
from pyspark.ml.recommendation import *
import pyspark.ml.evaluation
from pyspark.ml.evaluation import *
import numpy as np

In [2]:
sc, spark

(<pyspark.context.SparkContext at 0x1006768d0>,
 <pyspark.sql.session.SparkSession at 0x1072f6750>)

In [3]:
# If you don't have the tree command, you can install it with:
#   Mac: brew install tree
#   Ubuntu: sudo apt-get install tree

!tree data

data
├── README
├── audioscrobbler
│   ├── README.txt
│   ├── artist_alias.txt
│   ├── artist_data.txt
│   └── user_artist_data.txt
├── meta_Musical_Instruments.json.gz
├── meta_Patio_Lawn_and_Garden.json.gz
├── meta_Pet_Supplies.json.gz
├── ratings_Pet_Supplies.csv
├── reviews_Musical_Instruments_5.json.gz
├── reviews_Patio_Lawn_and_Garden_5.json.gz
└── reviews_Pet_Supplies_5.json.gz

1 directory, 12 files


In [4]:
!cat data/README

ratings_Category_Name.csv
-------------------------
reviewerID,asin,overall,unixReviewTime


In [5]:
# Load data files
raw_reviews_df = spark.read.json("data/reviews_Pet_Supplies_5.json.gz")
raw_ratings_df = spark.read.csv("data/ratings_Pet_Supplies.csv")
raw_meta_df = spark.read.json("data/meta_Pet_Supplies.json.gz")

In [6]:
# Show the schema of raw_reviews_df
raw_reviews_df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [8]:
# Show the schema of raw_ratings_df
raw_ratings_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



In [9]:
# Create a custom schema for the ratings data
ratings_schema = StructType( [
    StructField('reviewerID', StringType(), True),
    StructField('asin', StringType(), True),
    StructField('overall', FloatType(), True),
    StructField('unixReviewTime', IntegerType(), True)
        ])

raw_ratings_df = spark.read.csv("data/ratings_Pet_Supplies.csv",
                               schema=ratings_schema)

raw_ratings_df.printSchema()

root
 |-- reviewerID: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- overall: float (nullable = true)
 |-- unixReviewTime: integer (nullable = true)



In [10]:
raw_ratings_df.rdd.takeSample(withReplacement=False, num=10)

[Row(reviewerID=u'A1FESJ7QIPXPWM', asin=u'B00331A94U', overall=1.0, unixReviewTime=1399161600),
 Row(reviewerID=u'A2TENSV1MP7MFZ', asin=u'B008WN4L8Y', overall=4.0, unixReviewTime=1389657600),
 Row(reviewerID=u'AFX5POZ1MK4MR', asin=u'B0009YWLCM', overall=5.0, unixReviewTime=1294790400),
 Row(reviewerID=u'ADLV04YWJA1BD', asin=u'B0002VAZUM', overall=5.0, unixReviewTime=1377734400),
 Row(reviewerID=u'A2AH0KVNXPTO9E', asin=u'B001UFLP30', overall=5.0, unixReviewTime=1371427200),
 Row(reviewerID=u'A188UOC1VDA3JZ', asin=u'B001SIS14U', overall=5.0, unixReviewTime=1371772800),
 Row(reviewerID=u'A3AQS2SNN2GNMB', asin=u'B0002AT3M4', overall=1.0, unixReviewTime=1358467200),
 Row(reviewerID=u'A29L3MTNZGMCU0', asin=u'B000HHM6FI', overall=2.0, unixReviewTime=1302048000),
 Row(reviewerID=u'AKLRY0PJSUZST', asin=u'B004PWXKJC', overall=5.0, unixReviewTime=1395273600),
 Row(reviewerID=u'A1UPJ6FDCHA0E0', asin=u'B000O5DI3W', overall=5.0, unixReviewTime=1321660800)]

#### We need numeric IDs for users and items



In [13]:
raw_ratings_df.registerTempTable('raw_ratings')

In [14]:
reviewerID_df = spark.sql("""
SELECT reviewerID
FROM raw_ratings
GROUP BY reviewerID
""")

reviewerID_df.show()

+--------------+
|    reviewerID|
+--------------+
|A2I9LHH5LQC4RG|
|A1FC6D8QO02H9U|
| A8YROBZWPMZ8U|
|A3ROGW4X5DFT2Y|
| AXC3YXI1IORXP|
|A1RICCJ679590M|
|A3GGWOBLSL9ZJO|
|A19C16DYVA5KXA|
| AY6SOCFKS98FE|
| A97SZOCQ8XEFL|
| AI8RX43O6P2M9|
|A37ZCYH6TN8A19|
|A1I5QUJRRY971J|
| ABXSEQZLGP8UU|
|A14QH5E4GBNG19|
| A4S5QR5JIVMVR|
|A1SUVGGTO7HI25|
|A324GVROY4NYG7|
| AMXCEKXAOS0IQ|
|A2BGSTMS3OCWTQ|
+--------------+
only showing top 20 rows



In [15]:
asin_df = spark.sql("""
SELECT asin
FROM raw_ratings
GROUP BY asin
""")
asin_df.show()

+----------+
|      asin|
+----------+
|6041027537|
|B00002N8FK|
|B00006OAM1|
|B00008DFTI|
|B00008DFTP|
|B00008Q378|
|B0002563BI|
|B0002563UY|
|B00025K1HU|
|B00026068S|
|B00028ZLNQ|
|B0002APNFA|
|B0002AQBCE|
|B0002AQCXM|
|B0002AQD1I|
|B0002AQO5S|
|B0002AQQVK|
|B0002AR0DS|
|B0002AS5BO|
|B0002ASARI|
+----------+
only showing top 20 rows



In [16]:
asin_item_rdd = asin_df.rdd.zipWithIndex()
asin_item_rdd.takeSample(withReplacement=False, num=5)

[(Row(asin=u'B00BUSKW7C'), 10416),
 (Row(asin=u'B002XEO78M'), 67828),
 (Row(asin=u'B00ECNS9Z4'), 58374),
 (Row(asin=u'B000F9JDAY'), 68698),
 (Row(asin=u'B0002AQDF4'), 10514)]

In [17]:
asin_item_rdd = asin_item_rdd.map(lambda x: (x[0]['asin'], x[1]))
asin_item_rdd.takeSample(withReplacement=False, num=5)

[(u'B00D44ZE2S', 78498),
 (u'B0002DICXM', 44191),
 (u'B00EUV6SCY', 34817),
 (u'B0040DOGOO', 3437),
 (u'B00B200VKG', 74823)]

In [18]:
asin_item_schema = StructType([
    StructField('asin', StringType(), True),
    StructField('item', IntegerType(), True),
    ])
asin_item_df = spark.createDataFrame(asin_item_rdd, asin_item_schema)
asin_item_df
asin_item_df.printSchema()
asin_item_df.show()
asin_item_df.registerTempTable("asin_item")

root
 |-- asin: string (nullable = true)
 |-- item: integer (nullable = true)

+----------+----+
|      asin|item|
+----------+----+
|6041027537|   0|
|B00002N8FK|   1|
|B00006OAM1|   2|
|B00008DFTI|   3|
|B00008DFTP|   4|
|B00008Q378|   5|
|B0002563BI|   6|
|B0002563UY|   7|
|B00025K1HU|   8|
|B00026068S|   9|
|B00028ZLNQ|  10|
|B0002APNFA|  11|
|B0002AQBCE|  12|
|B0002AQCXM|  13|
|B0002AQD1I|  14|
|B0002AQO5S|  15|
|B0002AQQVK|  16|
|B0002AR0DS|  17|
|B0002AS5BO|  18|
|B0002ASARI|  19|
+----------+----+
only showing top 20 rows



In [19]:
reviewerID_df = spark.sql("""
SELECT reviewerID
FROM raw_ratings
GROUP BY reviewerID
""")

In [20]:
reviewerID_user_rdd = reviewerID_df.rdd.zipWithIndex()
reviewerID_user_rdd = reviewerID_user_rdd.map(lambda x: (x[0]['reviewerID'], x[1]))
reviewerID_user_schema = StructType([
    StructField('reviewerID', StringType(), True),
    StructField('user', IntegerType(), True),
    ])
reviewerID_user_df = spark.createDataFrame(reviewerID_user_rdd, reviewerID_user_schema)
reviewerID_user_df.printSchema()
reviewerID_user_df.show()
reviewerID_user_df.registerTempTable("reviewerID_user")

root
 |-- reviewerID: string (nullable = true)
 |-- user: integer (nullable = true)

+--------------+----+
|    reviewerID|user|
+--------------+----+
|A2I9LHH5LQC4RG|   0|
|A1FC6D8QO02H9U|   1|
| A8YROBZWPMZ8U|   2|
|A3ROGW4X5DFT2Y|   3|
| AXC3YXI1IORXP|   4|
|A1RICCJ679590M|   5|
|A3GGWOBLSL9ZJO|   6|
|A19C16DYVA5KXA|   7|
| AY6SOCFKS98FE|   8|
| A97SZOCQ8XEFL|   9|
| AI8RX43O6P2M9|  10|
|A37ZCYH6TN8A19|  11|
|A1I5QUJRRY971J|  12|
| ABXSEQZLGP8UU|  13|
|A14QH5E4GBNG19|  14|
| A4S5QR5JIVMVR|  15|
|A1SUVGGTO7HI25|  16|
|A324GVROY4NYG7|  17|
| AMXCEKXAOS0IQ|  18|
|A2BGSTMS3OCWTQ|  19|
+--------------+----+
only showing top 20 rows



In [None]:
ratings_df = spark.sql("""
SELECT
  raw_ratings.reviewerID
, reviewerID_user.user
, raw_ratings.asin
, asin_item.item
, raw_ratings.overall
, raw_ratings.unixReviewTime
FROM raw_ratings
INNER JOIN reviewerID_user ON
  (reviewerID_user.reviewerID = raw_ratings.reviewerID)
INNER JOIN asin_item ON
  (asin_item.asin = raw_ratings.asin)
""")

ratings_df.printSchema()

In [None]:
ratings_df.show()

In [None]:
# Register the new ratings dataframe as an SQL table
ratings_df.registerTempTable("ratings")

In [None]:
# Train/Test Split
train, test = ratings_df.randomSplit([0.8, 0.2])

In [None]:
als = ALS(rank=10, maxIter=5, seed=0, regParam=0.1, implicitPrefs=False,
          userCol="user", itemCol="item", ratingCol="overall", nonnegative=True)
model = als.fit(train)

In [None]:
predictions = model.transform(test).persist()

In [None]:
predictions.registerTempTable("predictions")

In [None]:
predictions.show()

In [None]:
spark.sql("""
SELECT * FROM predictions
WHERE NOT ISNAN(prediction)
ORDER BY prediction DESC
""").show()

In [None]:
spark.sql("""
SELECT * FROM predictions
WHERE NOT ISNAN(prediction)
ORDER BY prediction
""").show()