# Feature Engineering - Purchase events data

In [1]:
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import IntegerType
from datetime import datetime

In [2]:
# Obtain the Spark entry points used by the rest of the notebook.
# getOrCreate() is used for both, so re-running this cell does not try to
# build a second context/session on top of an existing one.
sc = SparkContext.getOrCreate()
# `ss` is the handle used below to read the events CSV into a Spark DataFrame.
ss = SparkSession.builder.getOrCreate()

In [14]:
# Load the user_id_hash -> user_id lookup table.
# The CSV has no header row: column 0 is used as the key (the hash that appears
# in the events data) and column 1 as the integer user id it maps to.
users = pd.read_csv('data/user_dict.csv', header=None)

# Build the dict from whole columns instead of DataFrame.iterrows(), which
# creates one Series object per row. `.tolist()` plus `int(...)` keeps the
# values as plain Python ints (not numpy scalars), matching what the original
# per-row `int(...)` produced and what the IntegerType UDF below returns.
user_dict = dict(zip(users[0].tolist(), map(int, users[1].tolist())))

# Not referenced by any cell visible in this notebook; kept in case it is used elsewhere.
name = 'user_id_hash'

# Spark UDF translating a hash into its integer user id.
# The lookup is deliberately strict: a hash missing from user_dict raises
# KeyError inside the Spark job rather than silently producing a null id.
fn = UserDefinedFunction(lambda x: user_dict[x], IntegerType())

In [13]:
# Read the raw events CSV into a Spark SQL DataFrame.
# The first line of the file is the header and column types are inferred
# from the data rather than declared up front.
events_reader = ss.read.options(header=True, inferSchema=True)
df = events_reader.csv('data/events.csv')

# Peek at the first five rows.
df.show(5)

+----------------+-------------------+------------------+---------------+-----------+--------------------+
|          app_id|         session_id|             event|event_timestamp|event_value|        user_id_hash|
+----------------+-------------------+------------------+---------------+-----------+--------------------+
|4724682771660800|5558845121177764917|                45|  1542215397132|        0.0|9943447915df3a45f...|
|4724682771660800|5558845121177764917|                45|  1542215484895|        0.0|9943447915df3a45f...|
|4724682771660800|7689508378645584666|.m5100869650219008|  1541124410372|        0.0|9943447915df3a45f...|
|4724682771660800|2201961907282901522|                 4|  1543713091129|        0.0|9943447915df3a45f...|
|4724682771660800|2201961907282901522|                 6|  1543713093116|        0.0|9943447915df3a45f...|
+----------------+-------------------+------------------+---------------+-----------+--------------------+
only showing top 5 rows



In [16]:
# Build the pandas `purchases` frame from the Spark events data:
#   1. keep only rows whose event code is 8 (presumably the purchase event,
#      given the name of the resulting frame -- confirm against the data spec),
#   2. translate user_id_hash into the integer user_id via the `fn` UDF,
#   3. drop the columns not needed for the purchase features,
#   4. collect the result to the driver as a pandas DataFrame.
# The filter is stated first so the row-at-a-time Python UDF is only ever
# applied to the rows that survive it.
purchases = (
    df.filter("event == 8")
    .withColumn('user_id', fn(df.user_id_hash))
    .drop('user_id_hash', 'app_id', 'session_id')
    .toPandas()
)

# Show the first rows so the cell's displayed output comes from its own code.
purchases.head()

Unnamed: 0,event,event_timestamp,event_value,user_id
0,8,1541912600211,3.493,554721
1,8,1543357433771,3.493,424370
2,8,1543023539172,3.493,424370
3,8,1543531139724,1.393,424370
4,8,1541437057644,1.393,171957


In [19]:
# Create a wall-clock `datetime` column from event_timestamp (epoch milliseconds)
# and keep only non-zero-value purchases before the cut-off date.
#
# The conversion uses an explicit timezone instead of datetime.fromtimestamp(),
# whose result depends on the timezone of whichever machine runs the notebook
# and therefore shifts the date windows used for the features.
# NOTE(review): the outputs recorded in this notebook correspond to UTC-8;
# 'America/Los_Angeles' is assumed here -- confirm the intended timezone.
LOCAL_TZ = 'America/Los_Angeles'
# Purchases at or after this date are excluded.
CUTOFF_DATE = '2018-12-01'

purchases['datetime'] = (
    pd.to_datetime(purchases['event_timestamp'], unit='ms', utc=True)
    .dt.tz_convert(LOCAL_TZ)
    # drop the tz info so the column stays naive, as it was before
    .dt.tz_localize(None)
)

is_before_cutoff = purchases['datetime'] < CUTOFF_DATE
has_value = purchases['event_value'] != 0
purchases = purchases[is_before_cutoff & has_value]
purchases.head()

Unnamed: 0,event,event_timestamp,event_value,user_id,datetime
0,8,1541912600211,3.493,554721,2018-11-10 21:03:20.211
1,8,1543357433771,3.493,424370,2018-11-27 14:23:53.771
2,8,1543023539172,3.493,424370,2018-11-23 17:38:59.172
3,8,1543531139724,1.393,424370,2018-11-29 14:38:59.724
4,8,1541437057644,1.393,171957,2018-11-05 08:57:37.644


In [61]:
# features for purchase count for last week, two weeks and total
def _purchase_count(events, column_name, since=None):
    """Count purchase rows per user.

    Parameters
    ----------
    events : pandas.DataFrame
        Purchase rows with at least `user_id` and `datetime` columns.
    column_name : str
        Name of the single count column in the returned frame.
    since : str or datetime-like, optional
        When given, only rows with ``datetime >= since`` are counted.

    Returns
    -------
    pandas.DataFrame
        Indexed by `user_id`, with one column `column_name`. Users with no
        rows in the window are absent (not zero-filled).
    """
    if since is not None:
        events = events[events.datetime >= since]
    return events.groupby('user_id').size().to_frame(column_name)


# Windows are inclusive lower bounds on `datetime`.
f1 = _purchase_count(purchases, 'purchase_count_total')
f2 = _purchase_count(purchases, 'purchase_count_last_week', since='2018-11-24')
f3 = _purchase_count(purchases, 'purchase_count_2_weeks', since='2018-11-17')

In [62]:
# features for purchase sum for last week, two weeks and total
def _purchase_sum(events, column_name, since=None):
    """Sum `event_value` per user.

    Parameters
    ----------
    events : pandas.DataFrame
        Purchase rows with at least `user_id`, `datetime` and `event_value`
        columns.
    column_name : str
        Name of the single sum column in the returned frame.
    since : str or datetime-like, optional
        When given, only rows with ``datetime >= since`` are summed.

    Returns
    -------
    pandas.DataFrame
        Indexed by `user_id`, with one column `column_name`. Users with no
        rows in the window are absent (not zero-filled).
    """
    if since is not None:
        events = events[events.datetime >= since]
    return events.groupby('user_id')['event_value'].sum().to_frame(column_name)


# Windows are inclusive lower bounds on `datetime`.
f4 = _purchase_sum(purchases, 'purchase_sum_total')
f5 = _purchase_sum(purchases, 'purchase_sum_last_week', since='2018-11-24')
f6 = _purchase_sum(purchases, 'purchase_sum_2_weeks', since='2018-11-17')

In [73]:
# Build one row per known user and left-join every purchase feature onto it,
# so users without purchases still appear (with zeros after fillna).
#
# The all-users frame is indexed by user_id because DataFrame.join with a
# list of frames always aligns on the index. Joining a frame that only has a
# default 0..n-1 index would match features by row position instead, which is
# correct only if the ids happen to be exactly 0..n-1 in that order.
features_lst = [f1, f2, f3, f4, f5, f6]
all_users = pd.DataFrame(
    index=pd.Index(list(user_dict.values()), name='user_id')
)
features = (
    all_users
    .join(features_lst, how='left')
    # users missing from a feature frame made no purchase in that window
    .fillna(value=0)
    # make sure the index is labelled before turning it back into a column
    .rename_axis('user_id')
    .reset_index()
)
features.head()

Unnamed: 0,user_id,purchase_count_total,purchase_count_last_week,purchase_count_2_weeks,purchase_sum_total,purchase_sum_last_week,purchase_sum_2_weeks
0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# check how many users in the dataset have made a purchase
# Counts are computed from `features` rather than typed in from an earlier
# output, so the ratio below cannot go stale when the data changes.
has_purchased = features.purchase_count_total != 0
n_purchasers = int(has_purchased.sum())
n_non_purchasers = int((~has_purchased).sum())
print(n_purchasers)
print(n_non_purchasers)
# Ratio of purchasers to non-purchasers (note: not the share of all users).
n_purchasers / n_non_purchasers

32752
588748


0.055629912967857215

In [76]:
# Persist the per-user purchase features; the row index is not written,
# so user_id is the first column of the file.
features.to_csv('features_events.csv', index=False)