## Compute Binary Target Labels
- Has a user purchased within the last 7 or 14 days?

In [16]:
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import IntegerType

In [17]:
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

In [35]:
# Load user dictionary which maps user_id_hashing to unique integers
users = pd.read_csv('data/user_dict.csv', header=None)
user_dict = {row[1][0]:int(row[1][1]) for row in users.iterrows()}

In [4]:
name = 'user_id_hash'
fn = UserDefinedFunction(lambda x: user_dict[x], IntegerType())

In [5]:
df = ss.read.csv('data/events.csv', header=True, inferSchema=True)
df.show(5)

+----------------+-------------------+------------------+---------------+-----------+--------------------+
|          app_id|         session_id|             event|event_timestamp|event_value|        user_id_hash|
+----------------+-------------------+------------------+---------------+-----------+--------------------+
|4724682771660800|5558845121177764917|                45|  1542215397132|        0.0|9943447915df3a45f...|
|4724682771660800|5558845121177764917|                45|  1542215484895|        0.0|9943447915df3a45f...|
|4724682771660800|7689508378645584666|.m5100869650219008|  1541124410372|        0.0|9943447915df3a45f...|
|4724682771660800|2201961907282901522|                 4|  1543713091129|        0.0|9943447915df3a45f...|
|4724682771660800|2201961907282901522|                 6|  1543713093116|        0.0|9943447915df3a45f...|
+----------------+-------------------+------------------+---------------+-----------+--------------------+
only showing top 5 rows



In [8]:
# Convert user_id_hash to user_id
purchases = df.withColumn('user_id', fn(df.user_id_hash))\
            .drop('user_id_hash', 'app_id', 'session_id')\
            .filter("event == 8")\
            .toPandas()
purchases.head()

Unnamed: 0,event,event_timestamp,event_value,user_id
0,8,1541912600211,3.493,554721
1,8,1543357433771,3.493,424370
2,8,1543023539172,3.493,424370
3,8,1543531139724,1.393,424370
4,8,1541437057644,1.393,171957


In [9]:
from datetime import datetime
purchases['datetime'] = purchases['event_timestamp'].apply(lambda x:datetime.fromtimestamp(x/1000))

In [11]:
# Get purchases for 7 day and 14 day time periods 
purchases14 = purchases[(purchases['datetime'] >= '2018-12-01')
                        & (purchases['datetime'] <= '2018-12-14')
                        & (purchases['event_value'] != 0)]
purchases7 = purchases[(purchases['datetime'] >= '2018-12-01')
                        & (purchases['datetime'] <= '2018-12-07')
                        & (purchases['event_value'] != 0)]

In [12]:
# Get unique users for purchases
labels14 = purchases14.user_id.unique()
labels7 = purchases7.user_id.unique()

array([390699, 151528, 142719, 540810, 147408, 377990, 462529, 203937,
       471260, 422223])

In [49]:
labels14_dict = {user:1 if user in labels14 else 0 for user in user_dict.values()}
labels7_dict = {user:1 if user in labels7 else 0 for user in user_dict.values()}

In [50]:
# Create df of labels and print counts
labels14_df = pd.DataFrame(labels14_dict.items(), columns=['user_id', 'label'])
labels14_df.groupby('label').count()

Unnamed: 0_level_0,user_id
label,Unnamed: 1_level_1
0,615693
1,5807


In [51]:
labels7_df = pd.DataFrame(labels7_dict.items(), columns=['user_id', 'label'])
labels7_df.groupby('label').count()

Unnamed: 0_level_0,user_id
label,Unnamed: 1_level_1
0,617271
1,4229


In [52]:
# Save labels to csv
labels14_df.to_csv(path_or_buf='labels14.csv', index=False)
labels7_df.to_csv(path_or_buf='labels7.csv', index=False)