In [1]:
from pyspark.sql import SparkSession

In [2]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time
from pyspark.sql import SQLContext
from cassandra.cluster import Cluster


In [3]:
spark = SparkSession.builder.appName('lr_example').config('spark.cassandra.connection.host', 'localhost').getOrCreate()

In [4]:
sqlContext = SQLContext(spark)

In [5]:
# hdfs://localhost:9000/kafka_spark

In [6]:
data = spark.read.csv("hdfs://localhost:9000/kafka_spark", inferSchema=True, header=True)

In [7]:
data.printSchema()

root
 |-- action: string (nullable = true)
 |-- page: string (nullable = true)
 |-- prevPage: string (nullable = true)
 |-- product: string (nullable = true)
 |-- referrer: string (nullable = true)
 |-- timestamp_hour: long (nullable = true)
 |-- visitor: string (nullable = true)



In [8]:
data.columns

['action',
 'page',
 'prevPage',
 'product',
 'referrer',
 'timestamp_hour',
 'visitor']

In [9]:
data.show()

+-----------+-------+--------+--------------------+--------+--------------+--------------+
|     action|   page|prevPage|             product|referrer|timestamp_hour|       visitor|
+-----------+-------+--------+--------------------+--------+--------------+--------------+
|    purchas| Page-0|    null|Kind,Dark Chocola...|    Bing| 1521699747500|Visitor-419410|
|add_to_cart| Page-0|    null|O-Live And Compan...|  Google| 1521699747500|Visitor-981800|
|  page_view| Page-9|    null|Parallax Inc.,Ext...| Twitter| 1521699747980|Visitor-529614|
|  page_view| Page-1|    null|Mrs. Meyer's Clea...|    Bing| 1521699747980|Visitor-337395|
|  page_view|Page-13|    null|Scrabble,Crosswor...|   Other| 1521699747980|Visitor-374554|
|  page_view|Page-14|    null|Mrs. Meyer's Clea...|    Bing| 1521699747980|Visitor-929627|
|  page_view|Page-10|    null|Simply Organic,It...| Twitter| 1521699747980|Visitor-611058|
|  page_view| Page-8|    null|Ancient Harvest,Q...|    Bing| 1521699756140|  Visitor-3760|

In [10]:
data.show(7, truncate=False)

+-----------+-------+--------+----------------------------------------------------------------+--------+--------------+--------------+
|action     |page   |prevPage|product                                                         |referrer|timestamp_hour|visitor       |
+-----------+-------+--------+----------------------------------------------------------------+--------+--------------+--------------+
|purchas    |Page-0 |null    |Kind,Dark Chocolate Nuts & Sea Salt Protein Bar                 |Bing    |1521699747500 |Visitor-419410|
|add_to_cart|Page-0 |null    |O-Live And Company,Extra Virgin Olive Oil                       |Google  |1521699747500 |Visitor-981800|
|page_view  |Page-9 |null    |Parallax Inc.,Extension Cables & 3-Pin Headers                  |Twitter |1521699747980 |Visitor-529614|
|page_view  |Page-1 |null    |Mrs. Meyer's Clean Day,"Liquid Hand Soap Refill, Lavender Scent"|Bing    |1521699747980 |Visitor-337395|
|page_view  |Page-13|null    |Scrabble,Crossword Game  

In [11]:
def timestamp_conversion(timestamp):
        if timestamp is not None:
            return time.strftime('%Y-%m-%d', time.gmtime(timestamp/1000))
        else:
            return "null null"

In [12]:
time_udf = udf(timestamp_conversion,StringType())

In [13]:
inputDF = data.withColumn("timestamp_hour",time_udf(data["timestamp_hour"]))
inputDF.createOrReplaceTempView("activity")

In [14]:
visitorsByProduct = sqlContext.sql("SELECT product, timestamp_hour, COUNT(DISTINCT visitor) as unique_visitors FROM Activity GROUP BY product, timestamp_hour")

activityByProduct = sqlContext.sql("SELECT product, timestamp_hour, sum(case when action = 'purchase' then 1 else 0 end) as purchase_count, sum(case when action = 'add_to_cart' then 1 else 0 end) as add_to_cart_count, sum(case when action = 'page_view' then 1 else 0 end) as page_view_count from Activity group by product, timestamp_hour").cache()

In [15]:
visitorsByProduct.show(5)

+--------------------+--------------+---------------+
|             product|timestamp_hour|unique_visitors|
+--------------------+--------------+---------------+
|Gatorade,Fierce G...|    2018-03-22|             95|
|All,"Mighty Pacs,...|    2018-03-22|             81|
| Q-tips,Cotton Swabs|    2018-03-22|             99|
|Scott,One Ply Bat...|    2018-03-22|             85|
|Viano Vineyards,C...|    2018-03-22|             89|
+--------------------+--------------+---------------+
only showing top 5 rows



In [16]:
activityByProduct.show(5)

+--------------------+--------------+--------------+-----------------+---------------+
|             product|timestamp_hour|purchase_count|add_to_cart_count|page_view_count|
+--------------------+--------------+--------------+-----------------+---------------+
|Gatorade,Fierce G...|    2018-03-22|             0|                4|             84|
| Q-tips,Cotton Swabs|    2018-03-22|             0|                5|             92|
|    Benadryl,Allergy|    2018-03-22|             0|                5|             79|
|Viano Vineyards,C...|    2018-03-22|             0|                2|             84|
|Scott,One Ply Bat...|    2018-03-22|             0|                3|             80|
+--------------------+--------------+--------------+-----------------+---------------+
only showing top 5 rows



In [17]:
cluster = Cluster()

In [18]:
session = cluster.connect()

In [19]:
session.set_keyspace('lambda')

In [20]:
visitorProduct = visitorsByProduct.rdd.map(lambda x: (x.product, x.timestamp_hour, x.unique_visitors))

In [21]:
import re
from cassandra.query import BatchStatement

In [22]:
# inserting into batch_visitors_by_product
query = session.prepare("INSERT INTO batch_visitors_by_product (product, timestamp_hour, unique_visitors) VALUES (?,?,?)")
for row in visitorsByProduct.rdd.collect():
    session.execute(query, [row['product'], row['timestamp_hour'], row['unique_visitors']])

In [24]:
# inserting into batch_activity_by_product
query = session.prepare("INSERT INTO batch_activity_by_product (product, timestamp_hour, add_to_cart_count, page_view_count, purchase_count) VALUES (?, ?, ?, ?, ?)")
for row in activityByProduct.rdd.collect():
    session.execute(query, [row['product'], row['timestamp_hour'], row['purchase_count'], row['add_to_cart_count'],row['page_view_count']])