In [1]:
# GCS folder prefixes used throughout this notebook.
# NOTE(review): "<GCS_BUCKET_NAME>" looks like a placeholder -- substitute a real bucket name before running.
# Prefix under which this notebook writes its output (the validation set parquet).
OUTPUT_BUCKET_FOLDER = "gs://<GCS_BUCKET_NAME>/outbrain-click-prediction/output/"
# Prefix from which the input CSV files (events, promoted_content, clicks_train) are read.
DATA_BUCKET_FOLDER = "gs://<GCS_BUCKET_NAME>/outbrain-click-prediction/data/"

In [3]:
from pyspark.sql.types import *
import pyspark.sql.functions as F

## Loading data

In [4]:
# UDF that converts a millisecond timestamp into a whole-day index
# (ms -> s -> min -> h -> days, truncated to an integer).
# A null timestamp is passed through as null: without the guard, `None / 1000`
# raises a TypeError inside the Python worker and fails the whole Spark job.
truncate_day_from_timestamp_udf = F.udf(
    lambda ts: int(ts / 1000 / 60 / 60 / 24) if ts is not None else None,
    IntegerType())

In [5]:
# Explicit schema for events.csv; every column is declared nullable.
events_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("display_id", IntegerType()),
        ("uuid_event", StringType()),
        ("document_id_event", IntegerType()),
        ("timestamp_event", IntegerType()),
        ("platform_event", IntegerType()),
        ("geo_location_event", StringType()),
    ]
])

# Read the events CSV with the schema above (no inference, '\N' is read as null),
# then derive the day index of each event from its timestamp.
events_df = (
    spark.read
    .schema(events_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "events.csv")
    .withColumn('day_event', truncate_day_from_timestamp_udf('timestamp_event'))
    .alias('events')
)

In [6]:
# Explicit schema for promoted_content.csv; all four columns are nullable integers.
promoted_content_schema = StructType([
    StructField(field_name, IntegerType(), True)
    for field_name in [
        "ad_id",
        "document_id_promo",
        "campaign_id",
        "advertiser_id",
    ]
])

# Read the promoted-content CSV with the schema above (no inference, '\N' is read as null).
promoted_content_df = (
    spark.read
    .schema(promoted_content_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "promoted_content.csv")
    .alias('promoted_content')
)

In [7]:
# Explicit schema for clicks_train.csv; all three columns are nullable integers.
clicks_train_schema = StructType([
    StructField(field_name, IntegerType(), True)
    for field_name in [
        "display_id",
        "ad_id",
        "clicked",
    ]
])

# Read the training clicks CSV with the schema above (no inference, '\N' is read as null).
clicks_train_df = (
    spark.read
    .schema(clicks_train_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "clicks_train.csv")
    .alias('clicks_train')
)

In [9]:
# Enrich each training click row with its ad's promoted-content attributes (by ad_id)
# and with the event that produced the display (by display_id). Both joins are left
# joins, so click rows without a match are kept with nulls in the joined columns.
clicks_train_joined_df = (
    clicks_train_df
    .join(promoted_content_df, on='ad_id', how='left')
    .join(events_df, on='display_id', how='left')
)

# Expose the joined frame to Spark SQL under the name 'clicks_train_joined'.
clicks_train_joined_df.createOrReplaceTempView('clicks_train_joined')

In [10]:
# Pick the display_ids that form the validation set, stratified by event day:
# 20% of the distinct displays from days 0-10 and every display from days 11-12.
# NOTE(review): the sample is seeded, but reproducibility presumably also depends on
# the row order after distinct() being stable across recomputations -- confirm.
validation_display_ids_df = (
    clicks_train_joined_df
    .select('display_id', 'day_event')
    .distinct()
    .sampleBy(
        "day_event",
        fractions={day: (0.2 if day < 11 else 1.0) for day in range(13)},
        seed=0
    )
)

# Expose the sampled ids to Spark SQL under the name 'validation_display_ids'.
validation_display_ids_df.createOrReplaceTempView("validation_display_ids")

In [11]:
# Build the validation set: keep the rows of the 'clicks_train_joined' view whose
# display_id appears in the 'validation_display_ids' view (an EXISTS filter, so the
# joined rows are not duplicated by the lookup). Only the listed columns are kept;
# the 'clicked' label is not among them.
# NOTE(review): the unqualified display_id inside the subquery presumably resolves to
# validation_display_ids.display_id (inner scope) -- confirm.
validation_set_df = spark.sql('''SELECT display_id, ad_id, uuid_event, day_event, timestamp_event,
                                        document_id_promo, platform_event, geo_location_event FROM clicks_train_joined t 
             WHERE EXISTS (SELECT display_id FROM validation_display_ids 
                           WHERE display_id = t.display_id)''')

In [12]:
# Persist the validation set as Parquet under the output folder, overwriting
# whatever a previous run left at the same path.
validation_set_gcs_output = "validation_set.parquet"
validation_set_df.write.mode('overwrite').parquet(OUTPUT_BUCKET_FOLDER + validation_set_gcs_output)

In [13]:
# Preview five rows (returned as a list of Row objects) to sanity-check the columns.
validation_set_df.head(5)

[Row(display_id=2122, ad_id=36619, uuid_event='7ceed8e24a87d7', day_event=0, timestamp_event=148795, document_id_promo=899906, platform_event=3, geo_location_event='SG>00'),
 Row(display_id=2122, ad_id=81643, uuid_event='7ceed8e24a87d7', day_event=0, timestamp_event=148795, document_id_promo=1094108, platform_event=3, geo_location_event='SG>00'),
 Row(display_id=2122, ad_id=216100, uuid_event='7ceed8e24a87d7', day_event=0, timestamp_event=148795, document_id_promo=1548042, platform_event=3, geo_location_event='SG>00'),
 Row(display_id=2659, ad_id=55819, uuid_event='964e40766c3f39', day_event=0, timestamp_event=185389, document_id_promo=986576, platform_event=3, geo_location_event='CA>BC'),
 Row(display_id=2659, ad_id=76816, uuid_event='964e40766c3f39', day_event=0, timestamp_event=185389, document_id_promo=824972, platform_event=3, geo_location_event='CA>BC')]