In [0]:
from pyspark.sql import SparkSession 

spark = SparkSession \
    .builder \
    .appName("RFM Customer Segmentation with PySpark") \
    .getOrCreate()

In [0]:
df_raw = spark.read.format('delta').\
    options(header = 'true', inferschema = 'true').\
    load("/user/hive/warehouse/online_retail2", header = True)

In [0]:
df_raw.show(5)
df_raw.printSchema()

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows

root
 |-- InvoiceNo: string (nullable =

In [0]:
from pyspark.sql.functions import count

def my_count(df_in):
    df_in.agg(*[count(c).alias(c) for c in df_in.columns]).show()

my_count(df_raw)

+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
|   541909|   541909|     541909|  541909|     541909|   541909|    541909| 541909|
+---------+---------+-----------+--------+-----------+---------+----------+-------+



In [0]:
df = df_raw.dropna(how = "any")
my_count(df)

+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
|   541909|   541909|     541909|  541909|     541909|   541909|    541909| 541909|
+---------+---------+-----------+--------+-----------+---------+----------+-------+



In [0]:
from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, lit, datediff, col, when

timeFmt = "MM/dd/yy HH:mm"

df = df.withColumn('NewInvoiceDate', when(col('InvoiceDate').isNotNull(), to_utc_timestamp(unix_timestamp(col('InvoiceDate'), timeFmt).cast('timestamp'), 'UTC')).otherwise(col('InvoiceDate')))

df.show(5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|NewInvoiceDate|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|          null|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|          null|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|          null|
|   536365|   84029G|KNITTED UNION FLA...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|          null|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|          null|
+---------+---------+--------------------+------

In [0]:
from pyspark.sql.functions import round

df = df.withColumn('TotalPrice', round(df.Quantity * df.UnitPrice, 2) )

from pyspark.sql.functions import mean, min, max, sum, datediff, to_date

date_max = df.select(max('NewInvoiceDate')).toPandas()

current = to_utc_timestamp(unix_timestamp(lit(str(date_max.iloc[0][0])), 'yy-MM-dd HH:mm').cast('timestamp'), 'UTC')

df = df.withColumn('Duration', datediff(lit(current), 'NewInvoiceDate'))

#Recency, Frequency, Monetary

recency = df.groupBy('CustomerID').agg(min('Duration').alias('Recency'))

frequency = df.groupBy('CustomerID', 'InvoiceNo').count()\
    .groupBy('CustomerID')\
    .agg(count('*').alias("Frequency"))

monetary = df.groupBy('CustomerID').agg(round(sum('TotalPrice'), 2).alias('Monetary'))

rfm = recency.join(frequency, 'CustomerID', how = 'inner')\
    .join(monetary, 'CustomerID', how = 'inner')

rfm.show()

+----------+-------+---------+--------+
|CustomerID|Recency|Frequency|Monetary|
+----------+-------+---------+--------+
|     15194|   null|       22|    null|
|     17703|   null|        3|    null|
|     13452|   null|        2|   590.0|
|     13098|   null|       41|    null|
|     17048|   null|        6|    null|
|     13638|   null|        1|    null|
|     15322|   null|        2|    null|
|     13723|   null|        1|    null|
|     16597|   null|        1|    null|
|     15237|   null|        4|    null|
|     13248|   null|        2|    null|
|     16742|   null|        2|    null|
|     14719|   null|        6|    null|
|     17043|   null|        4|    null|
|     14117|   null|        1|    null|
|     15057|   null|        2|    null|
|     17979|   null|        5|    null|
|     13460|   null|        2|    null|
|     13518|   null|        1|    null|
|     15432|   null|        1|    null|
+----------+-------+---------+--------+
only showing top 20 rows

