## set up

In [6]:
evaluation = True
evaluation_verbose = False

OUTPUT_BUCKET_FOLDER = "gs://capstone-01/output/"
DATA_BUCKET_FOLDER = "gs://capstone-01/data/"

In [1]:
from IPython.display import display

In [2]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT

In [3]:
import numpy as np
import scipy.sparse

In [4]:
import math
import datetime
import time
import itertools

In [5]:
import pickle

In [7]:
import random
random.seed(42)

In [8]:
import pandas as pd
%matplotlib inline

In [9]:
start_time = time.time()

In [10]:
# 해시 함수를 가진 모듈 hashlib
import hashlib
def hashstr(s, nr_bins):
    return int(hashlib.md5(s.encode('utf8')).hexdigest(), 16)%(nr_bins-1)+1

## UDFs

In [11]:
# date_time_to_unix_epoch 함수 생성
def date_time_to_unix_epoch(date_time):
    return int(time.mktime(date_time.timetuple()))

# timetuple() : date 객체의 값을 time.struct_time 시퀀스 객체에 할당. 해당되는 정보가 없는 시,분,초는 '0'으로 초기화
# d = datetime.date.today()
# d.timetuple() -> time.struct_time(tm_year=2011, tm_mon=11...)
# time.mktime() : 인자로 받아서 time()과 같은 누적된 초를 반환  
# t = (2009, 2, 17, 17, 3, 38, 1, 48, 0)
# time.mktime(t) : 1234915418.000000
# asctime(localtime(secs)): Tue Feb 17 17:03:38 2009
  
# date_tiem_to_unix_epoch_treated 함수 생성
# try 블록 수행 중 오류가 발생하면 except 블록이 수행된다
# 입력 변수에 대해 date_time_to_unix_epoch 함수값이 나오지 않으면 오류 발생시킨다
def date_time_to_unix_epoch_treated(dt):
    if dt != None:
        try:
            epoch = date_time_to_unix_epoch(dt)
            return epoch
        except Exception as e:
            print("Error processing dt={}".format(dt), e)
            return 0
    else:
        return 0

In [12]:
# epoch를 integer로 반환
timestamp_null_to_zero_int_udf = F.udf(lambda x: date_time_to_unix_epoch_treated(x), IntegerType())

In [13]:
# int_null_to_minus_one_udf -> int가 null값이면 -1을 반환하는 함수
# 아래 3개 함수 -> 각각 int, float, str 타입인 리스트 형태 데이터가 null 값이면 빈 리스트 반환하는 함수

INT_DEFAULT_NULL_VALUE = -1
int_null_to_minus_one_udf = F.udf(lambda x: x if x != None else INT_DEFAULT_NULL_VALUE, IntegerType())
int_list_null_to_empty_list_udf = F.udf(lambda x: x if x != None else [], ArrayType(IntegerType()))
float_list_null_to_empty_list_udf = F.udf(lambda x: x if x != None else [], ArrayType(FloatType()))
str_list_null_to_empty_list_udf = F.udf(lambda x: x if x != None else [], ArrayType(StringType()))

In [14]:
def truncate_day_from_timestamp(ts):
    return int(ts / 1000 / 60 / 60 / 24)

In [15]:
truncate_day_from_timestamp_udf = F.udf(lambda ts: truncate_day_from_timestamp(ts), IntegerType())

In [16]:
# extract_country_udf는 geo_location에서 앞 두 글자(국가명)만 가져오는 함수

extract_country_udf = F.udf(lambda geo: geo.strip()[:2] if geo != None else '', StringType())

In [17]:
# extract_country_udf는 geo_location에서 앞 다섯 글자(국가명+주명)만 가져오는 함수

extract_country_state_udf = F.udf(lambda geo: geo.strip()[:5] if geo != None else '', StringType())

In [18]:
# list_len_udf는 input값의 길이를 반환하는 함수

list_len_udf = F.udf(lambda x: len(x) if x != None else 0, IntegerType())

In [19]:
# convert_odd_timestamp는 timestamp 원데이터 값을 년월일 형식으로 변환해주는 함수
# kaggle data explanation : If you wish to recover the actual epoch time of the visit, add 1465876799998 to the timestamp.

def convert_odd_timestamp(timestamp_ms_relative):
    TIMESTAMP_DELTA=1465876799998
    return datetime.datetime.fromtimestamp((int(timestamp_ms_relative)+TIMESTAMP_DELTA)//1000)

In [20]:
convert_odd_timestamp(61)

datetime.datetime(2016, 6, 14, 4, 0)

## load files : Loading UTC/BST for each country and US / CA states (local time)

In [21]:
# country_codes_utc_dst_tz_delta.csv는 다음 2개 컬럼을 가짐 -> country_code, utc_dst_time_offset_cleaned

country_utc_dst_df = pd.read_csv('country_codes_utc_dst_tz_delta.csv', keep_default_na=False)

In [22]:
countries_utc_dst_dict = dict(zip(country_utc_dst_df['country_code'].tolist(), country_utc_dst_df['utc_dst_time_offset_cleaned'].tolist()))
countries_utc_dst_broad = sc.broadcast(countries_utc_dst_dict)

In [23]:
# us_states_abbrev_bst.csv는 다음 2개 컬럼을 가짐 -> state_abb, utc_dst_time_offset_cleaned

us_states_utc_dst_df = pd.read_csv('us_states_abbrev_bst.csv', keep_default_na=False)

In [24]:
us_states_utc_dst_dict = dict(zip(us_states_utc_dst_df['state_abb'].tolist(), us_states_utc_dst_df['utc_dst_time_offset_cleaned'].tolist()))
us_states_utc_dst_broad = sc.broadcast(us_states_utc_dst_dict)

In [25]:
# ca_states_abbrev_bst.csv는 다음 2개 컬럼을 가짐 -> state_abb, utc_dst_time_offset_cleaned

ca_states_utc_dst_df = pd.read_csv('ca_states_abbrev_bst.csv', keep_default_na=False)

In [26]:
ca_countries_utc_dst_dict = dict(zip(ca_states_utc_dst_df['state_abb'].tolist(), ca_states_utc_dst_df['utc_dst_time_offset_cleaned'].tolist()))
ca_countries_utc_dst_broad = sc.broadcast(ca_countries_utc_dst_dict)

## load data

In [27]:
events_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("uuid_event", StringType(), True),                    
                    StructField("document_id_event", IntegerType(), True),
                    StructField("timestamp_event", IntegerType(), True),
                    StructField("platform_event", IntegerType(), True),
                    StructField("geo_location_event", StringType(), True)]
                    )

events_df = spark.read.schema(events_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER + "events.csv") \
                .withColumn('dummyEvents', F.lit(1)) \
                .withColumn('day_event', truncate_day_from_timestamp_udf('timestamp_event')) \
                .withColumn('event_country', extract_country_udf('geo_location_event')) \
                .withColumn('event_country_state', extract_country_state_udf('geo_location_event')) \
                .alias('events')

In [50]:
events_df.show(3)

+----------+--------------+-----------------+---------------+--------------+------------------+-----------+---------+-------------+-------------------+
|display_id|    uuid_event|document_id_event|timestamp_event|platform_event|geo_location_event|dummyEvents|day_event|event_country|event_country_state|
+----------+--------------+-----------------+---------------+--------------+------------------+-----------+---------+-------------+-------------------+
|         1|cb8c55702adb93|           379743|             61|             3|         US>SC>519|          1|        0|           US|              US>SC|
|         2|79a85fa78311b9|          1794259|             81|             2|         US>CA>807|          1|        0|           US|              US>CA|
|         3|822932ce3d8757|          1179111|            182|             2|         US>MI>505|          1|        0|           US|              US>MI|
+----------+--------------+-----------------+---------------+--------------+------------

In [28]:
page_views_schema = StructType(
                    [StructField("uuid_pv", StringType(), True),
                    StructField("document_id_pv", IntegerType(), True),
                    StructField("timestamp_pv", IntegerType(), True),
                    StructField("platform_pv", IntegerType(), True),
                    StructField("geo_location_pv", StringType(), True),
                    StructField("traffic_source_pv", IntegerType(), True)]
                    )

# Google Storage에 저장된 page_views 파일 로드
page_views_df = spark.read.schema(page_views_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv("gs://upload-bigquery180927/page_views.csv") \
                .withColumn('day_pv', truncate_day_from_timestamp_udf('timestamp_pv')) \
                .alias('page_views')             
            
page_views_df.createOrReplaceTempView('page_views')

In [29]:
page_views_users_df  = spark.sql('''
                    SELECT uuid_pv, document_id_pv, max(timestamp_pv) as max_timestamp_pv, 1 as dummyPageView
                    FROM page_views p 
                    GROUP BY uuid_pv, document_id_pv
                    ''').alias('page_views_users')

In [30]:
promoted_content_schema = StructType(
                    [StructField("ad_id", IntegerType(), True),
                    StructField("document_id_promo", IntegerType(), True),                    
                    StructField("campaign_id", IntegerType(), True),
                    StructField("advertiser_id", IntegerType(), True)]
                    )

promoted_content_df = spark.read.schema(promoted_content_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"promoted_content.csv") \
                .withColumn('dummyPromotedContent', F.lit(1)).alias('promoted_content').cache()

In [31]:
documents_meta_schema = StructType(
                    [StructField("document_id_doc", IntegerType(), True),
                    StructField("source_id", IntegerType(), True),                    
                    StructField("publisher_id", IntegerType(), True),
                    StructField("publish_time", TimestampType(), True)]
                    )

documents_meta_df = spark.read.schema(documents_meta_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_meta.csv") \
                .withColumn('dummyDocumentsMeta', F.lit(1)).alias('documents_meta').cache()

In [32]:
# Joining with Page Views to get traffic_source_pv
# events, documents_meta를 조인 (key='document_id')
# 위 데이터프레임과 page_views를 조인 (key=5개 컬럼의 데이터가 다 일치하는 데이터만 가져옴)
events_joined_df = events_df.join(documents_meta_df \
                                  .withColumnRenamed('source_id', 'source_id_doc_event') \
                                  .withColumnRenamed('publisher_id', 'publisher_doc_event') \
                                  .withColumnRenamed('publish_time', 'publish_time_doc_event')
                                  , on=F.col("document_id_event") == F.col("document_id_doc"), how='left') \
                            .join(page_views_df, 
                                           on=[F.col('uuid_event') == F.col('uuid_pv'),
                                               F.col('document_id_event') == F.col('document_id_pv'),
                                               F.col('platform_event') == F.col('platform_pv'),
                                               F.col('geo_location_event') == F.col('geo_location_pv'),
                                               F.col('day_event') == F.col('day_pv')],
                                           how='left') \
                                    .alias('events').cache()

In [54]:
# events_joined_df.show(10)

In [33]:
documents_categories_schema = StructType(
                    [StructField("document_id_cat", IntegerType(), True),
                    StructField("category_id", IntegerType(), True),                    
                    StructField("confidence_level_cat", FloatType(), True)]
                    )

documents_categories_df = spark.read.schema(documents_categories_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_categories.csv") \
                .alias('documents_categories').cache()
    
documents_categories_grouped_df = documents_categories_df.groupBy('document_id_cat') \
                                            .agg(F.collect_list('category_id').alias('category_id_list'),
                                                 F.collect_list('confidence_level_cat').alias('confidence_level_cat_list')) \
                                            .withColumn('dummyDocumentsCategory', F.lit(1)) \
                                            .alias('documents_categories_grouped')

In [34]:
documents_topics_schema = StructType(
                    [StructField("document_id_top", IntegerType(), True),
                    StructField("topic_id", IntegerType(), True),                    
                    StructField("confidence_level_top", FloatType(), True)]
                    )

documents_topics_df = spark.read.schema(documents_topics_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_topics.csv")  \
                .alias('documents_topics').cache()
    
documents_topics_grouped_df = documents_topics_df.groupBy('document_id_top') \
                                            .agg(F.collect_list('topic_id').alias('topic_id_list'),
                                                 F.collect_list('confidence_level_top').alias('confidence_level_top_list')) \
                                            .withColumn('dummyDocumentsTopics', F.lit(1)) \
                                            .alias('documents_topics_grouped')

In [35]:
documents_entities_schema = StructType(
                    [StructField("document_id_ent", IntegerType(), True),
                    StructField("entity_id", StringType(), True),                    
                    StructField("confidence_level_ent", FloatType(), True)]
                    )

documents_entities_df = spark.read.schema(documents_entities_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_entities.csv")  \
                .alias('documents_entities').cache()
    
documents_entities_grouped_df = documents_entities_df.groupBy('document_id_ent') \
                                            .agg(F.collect_list('entity_id').alias('entity_id_list'),
                                                 F.collect_list('confidence_level_ent').alias('confidence_level_ent_list')) \
                                            .withColumn('dummyDocumentsEntities', F.lit(1)) \
                                            .alias('documents_entities_grouped')

In [36]:
clicks_train_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("ad_id", IntegerType(), True),                    
                    StructField("clicked", IntegerType(), True)]
                    )

clicks_train_df = spark.read.schema(clicks_train_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"clicks_train.csv") \
                .withColumn('dummyClicksTrain', F.lit(1)).alias('clicks_train')

In [37]:
clicks_train_joined_df = clicks_train_df \
                         .join(promoted_content_df, on='ad_id', how='left') \
                         .join(documents_meta_df, on=F.col("promoted_content.document_id_promo") == F.col("documents_meta.document_id_doc"), how='left') \
                         .join(events_joined_df, on='display_id', how='left')                         
clicks_train_joined_df.createOrReplaceTempView('clicks_train_joined')

In [69]:
clicks_train_df.printSchema()

root
 |-- display_id: integer (nullable = true)
 |-- ad_id: integer (nullable = true)
 |-- clicked: integer (nullable = true)
 |-- dummyClicksTrain: integer (nullable = false)



In [66]:
promoted_content_df.printSchema()

root
 |-- ad_id: integer (nullable = true)
 |-- document_id_promo: integer (nullable = true)
 |-- campaign_id: integer (nullable = true)
 |-- advertiser_id: integer (nullable = true)
 |-- dummyPromotedContent: integer (nullable = false)



In [67]:
documents_meta_df.printSchema()

root
 |-- document_id_doc: integer (nullable = true)
 |-- source_id: integer (nullable = true)
 |-- publisher_id: integer (nullable = true)
 |-- publish_time: timestamp (nullable = true)
 |-- dummyDocumentsMeta: integer (nullable = false)



In [68]:
events_joined_df.printSchema()

root
 |-- display_id: integer (nullable = true)
 |-- uuid_event: string (nullable = true)
 |-- document_id_event: integer (nullable = true)
 |-- timestamp_event: integer (nullable = true)
 |-- platform_event: integer (nullable = true)
 |-- geo_location_event: string (nullable = true)
 |-- dummyEvents: integer (nullable = false)
 |-- day_event: integer (nullable = true)
 |-- event_country: string (nullable = true)
 |-- event_country_state: string (nullable = true)
 |-- document_id_doc: integer (nullable = true)
 |-- source_id_doc_event: integer (nullable = true)
 |-- publisher_doc_event: integer (nullable = true)
 |-- publish_time_doc_event: timestamp (nullable = true)
 |-- dummyDocumentsMeta: integer (nullable = true)
 |-- uuid_pv: string (nullable = true)
 |-- document_id_pv: integer (nullable = true)
 |-- timestamp_pv: integer (nullable = true)
 |-- platform_pv: integer (nullable = true)
 |-- geo_location_pv: string (nullable = true)
 |-- traffic_source_pv: integer (nullable = true)


In [65]:
clicks_train_joined_df.printSchema()

root
 |-- display_id: integer (nullable = true)
 |-- ad_id: integer (nullable = true)
 |-- clicked: integer (nullable = true)
 |-- dummyClicksTrain: integer (nullable = false)
 |-- document_id_promo: integer (nullable = true)
 |-- campaign_id: integer (nullable = true)
 |-- advertiser_id: integer (nullable = true)
 |-- dummyPromotedContent: integer (nullable = true)
 |-- document_id_doc: integer (nullable = true)
 |-- source_id: integer (nullable = true)
 |-- publisher_id: integer (nullable = true)
 |-- publish_time: timestamp (nullable = true)
 |-- dummyDocumentsMeta: integer (nullable = true)
 |-- uuid_event: string (nullable = true)
 |-- document_id_event: integer (nullable = true)
 |-- timestamp_event: integer (nullable = true)
 |-- platform_event: integer (nullable = true)
 |-- geo_location_event: string (nullable = true)
 |-- dummyEvents: integer (nullable = true)
 |-- day_event: integer (nullable = true)
 |-- event_country: string (nullable = true)
 |-- event_country_state: stri

In [42]:
%time clicks_train_joined_df.show(3)

+----------+------+-------+----------------+-----------------+-----------+-------------+--------------------+---------------+---------+------------+-------------------+------------------+--------------+-----------------+---------------+--------------+------------------+-----------+---------+-------------+-------------------+---------------+-------------------+-------------------+----------------------+------------------+--------------+--------------+------------+-----------+---------------+-----------------+------+
|display_id| ad_id|clicked|dummyClicksTrain|document_id_promo|campaign_id|advertiser_id|dummyPromotedContent|document_id_doc|source_id|publisher_id|       publish_time|dummyDocumentsMeta|    uuid_event|document_id_event|timestamp_event|platform_event|geo_location_event|dummyEvents|day_event|event_country|event_country_state|document_id_doc|source_id_doc_event|publisher_doc_event|publish_time_doc_event|dummyDocumentsMeta|       uuid_pv|document_id_pv|timestamp_pv|platform_pv|

In [38]:
if evaluation:
    table_name = 'user_profiles_eval'
else:
    table_name = 'user_profiles'

user_profiles_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+table_name) \
                    .withColumn('dummyUserProfiles', F.lit(1)).alias('user_profiles')

In [39]:
user_profiles_df.count()

4961756

## Spliting Train/validation set | Test set

In [40]:
if evaluation:       
    validation_set_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+"validation_set.parquet") \
                    .alias('validation_set') 
            
    validation_set_exported_df.select('display_id').distinct().createOrReplaceTempView("validation_display_ids")
    
    
    validation_set_df = spark.sql('''SELECT * FROM clicks_train_joined t 
             WHERE EXISTS (SELECT display_id FROM validation_display_ids 
                           WHERE display_id = t.display_id)''').alias('clicks') \
                         .join(documents_categories_grouped_df, on=F.col("document_id_promo") == F.col("documents_categories_grouped.document_id_cat"), how='left') \
                         .join(documents_topics_grouped_df, on=F.col("document_id_promo") == F.col("documents_topics_grouped.document_id_top"), how='left') \
                         .join(documents_entities_grouped_df, on=F.col("document_id_promo") == F.col("documents_entities_grouped.document_id_ent"), how='left') \
                         .join(documents_categories_grouped_df \
                                   .withColumnRenamed('category_id_list', 'doc_event_category_id_list')
                                   .withColumnRenamed('confidence_level_cat_list', 'doc_event_confidence_level_cat_list') \
                                   .alias('documents_event_categories_grouped'), 
                               on=F.col("document_id_event") == F.col("documents_event_categories_grouped.document_id_cat"), 
                               how='left') \
                         .join(documents_topics_grouped_df \
                                   .withColumnRenamed('topic_id_list', 'doc_event_topic_id_list')
                                   .withColumnRenamed('confidence_level_top_list', 'doc_event_confidence_level_top_list') \
                                   .alias('documents_event_topics_grouped'), 
                               on=F.col("document_id_event") == F.col("documents_event_topics_grouped.document_id_top"), 
                               how='left') \
                         .join(documents_entities_grouped_df \
                                   .withColumnRenamed('entity_id_list', 'doc_event_entity_id_list')
                                   .withColumnRenamed('confidence_level_ent_list', 'doc_event_confidence_level_ent_list') \
                                   .alias('documents_event_entities_grouped'), 
                               on=F.col("document_id_event") == F.col("documents_event_entities_grouped.document_id_ent"), 
                               how='left') \
                         .join(page_views_users_df, on=[F.col("clicks.uuid_event") == F.col("page_views_users.uuid_pv"),
                                                        F.col("clicks.document_id_promo") == F.col("page_views_users.document_id_pv")], 
                                                  how='left')
    
    #print("validation_set_df.count() =", validation_set_df.count())
        
    #Added to validation set information about the event and the user for statistics of the error (avg ctr)
    validation_set_ground_truth_df = validation_set_df.filter('clicked = 1') \
                                .join(user_profiles_df, on=[F.col("user_profiles.uuid") == F.col("uuid_event")], how='left') \
                                .withColumn('user_categories_count', list_len_udf('category_id_list')) \
                                .withColumn('user_topics_count', list_len_udf('topic_id_list')) \
                                .withColumn('user_entities_count', list_len_udf('entity_id_list')) \
                                .select('display_id','ad_id','platform_event', 'day_event', 'timestamp_event', 
                                        'geo_location_event', 'event_country', 'event_country_state', 'views',
                                        'user_categories_count', 'user_topics_count', 'user_entities_count') \
                                .withColumnRenamed('ad_id','ad_id_gt') \
                                .withColumnRenamed('views','user_views_count') \
                                .cache()
    #print("validation_set_ground_truth_df.count() =", validation_set_ground_truth_df.count())
    
    train_set_df = spark.sql('''SELECT * FROM clicks_train_joined t 
                                 WHERE NOT EXISTS (SELECT display_id FROM validation_display_ids 
                                               WHERE display_id = t.display_id)''').cache()
    print("train_set_df.count() =", train_set_df.count())
    
    #validation_display_ids_df.groupBy("day_event").count().show()
    
else:
    
    clicks_test_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("ad_id", IntegerType(), True)]
                    )

    clicks_test_df = spark.read.schema(clicks_test_schema).options(header='true', inferschema='false', nullValue='\\N') \
                    .csv(DATA_BUCKET_FOLDER + "clicks_test.csv") \
                    .withColumn('dummyClicksTest', F.lit(1)) \
                    .withColumn('clicked', F.lit(-999)) \
                    .alias('clicks_test')
        
        
    test_set_df = clicks_test_df \
                         .join(promoted_content_df, on='ad_id', how='left') \
                         .join(documents_meta_df, on=F.col("promoted_content.document_id_promo") == F.col("documents_meta.document_id_doc"), how='left') \
                         .join(documents_categories_grouped_df, on=F.col("document_id_promo") == F.col("documents_categories_grouped.document_id_cat"), how='left') \
                         .join(documents_topics_grouped_df, on=F.col("document_id_promo") == F.col("documents_topics_grouped.document_id_top"), how='left') \
                         .join(documents_entities_grouped_df, on=F.col("document_id_promo") == F.col("documents_entities_grouped.document_id_ent"), how='left') \
                         .join(events_joined_df, on='display_id', how='left') \
                         .join(documents_categories_grouped_df \
                                   .withColumnRenamed('category_id_list', 'doc_event_category_id_list')
                                   .withColumnRenamed('confidence_level_cat_list', 'doc_event_confidence_level_cat_list') \
                                   .alias('documents_event_categories_grouped'), 
                               on=F.col("document_id_event") == F.col("documents_event_categories_grouped.document_id_cat"), 
                               how='left') \
                         .join(documents_topics_grouped_df \
                                   .withColumnRenamed('topic_id_list', 'doc_event_topic_id_list')
                                   .withColumnRenamed('confidence_level_top_list', 'doc_event_confidence_level_top_list') \
                                   .alias('documents_event_topics_grouped'), 
                               on=F.col("document_id_event") == F.col("documents_event_topics_grouped.document_id_top"), 
                               how='left') \
                         .join(documents_entities_grouped_df \
                                   .withColumnRenamed('entity_id_list', 'doc_event_entity_id_list')
                                   .withColumnRenamed('confidence_level_ent_list', 'doc_event_confidence_level_ent_list') \
                                   .alias('documents_event_entities_grouped'), 
                               on=F.col("document_id_event") == F.col("documents_event_entities_grouped.document_id_ent"), 
                               how='left') \
                         .join(page_views_users_df, on=[F.col("events.uuid_event") == F.col("page_views_users.uuid_pv"),
                                                        F.col("promoted_content.document_id_promo") == F.col("page_views_users.document_id_pv")], 
                                                  how='left')

    #print("test_set_df.count() =",test_set_df.count())
   
    
    train_set_df = clicks_train_joined_df.cache() 
    print("train_set_df.count() =", train_set_df.count())


('train_set_df.count() =', 59761474)


In [43]:
%time train_set_df.show(3)

+----------+------+-------+----------------+-----------------+-----------+-------------+--------------------+---------------+---------+------------+-------------------+------------------+--------------+-----------------+---------------+--------------+------------------+-----------+---------+-------------+-------------------+---------------+-------------------+-------------------+----------------------+------------------+--------------+--------------+------------+-----------+---------------+-----------------+------+
|display_id| ad_id|clicked|dummyClicksTrain|document_id_promo|campaign_id|advertiser_id|dummyPromotedContent|document_id_doc|source_id|publisher_id|       publish_time|dummyDocumentsMeta|    uuid_event|document_id_event|timestamp_event|platform_event|geo_location_event|dummyEvents|day_event|event_country|event_country_state|document_id_doc|source_id_doc_event|publisher_doc_event|publish_time_doc_event|dummyDocumentsMeta|       uuid_pv|document_id_pv|timestamp_pv|platform_pv|

## training models

In [41]:
# 결측값 확인하는 함수 (value가 없거나, value의 str 길이가 0일 때)
def is_null(value):
    return value == None or len(str(value).strip()) == 0

In [42]:
LESS_SPECIAL_CAT_VALUE = 'less'
def get_category_field_values_counts(field, df, min_threshold=10):
    category_counts = dict(list(filter(lambda x: not is_null(x[0]) and x[1] >= min_threshold, 
                                       df.select(field).groupBy(field).count().rdd.map(lambda x: (x[0], x[1])).collect())))
    #Adding a special value to create a feature for values in this category that are less than min_threshold 
    category_counts[LESS_SPECIAL_CAT_VALUE] = -1
    return category_counts
  
# get_category_field_values_counts() -> value가 10개 이상인 카테고리 개수 출력
# spark 문법 : filter, rdd, rdd.map, collect
# filter : filter()로 전달된 함수의 조건에 통과한 값만 리턴
# rdd : 스파크의 기본적인 데이터 단위
# rdd.map : rdd의 각 요소에 함수를 적용하고 결과 rdd를 리턴
# collect : rdd의 모든 데이터 리턴
# min_threshold : 10개 이상인 카테고리만 인정

## Building category values counters and indexers

In [43]:
# events의 geo_location에 기재된 국가 카테고리 개수 (value가 10개 이상인)
event_country_values_counts = get_category_field_values_counts('event_country', events_df, min_threshold=10)
len(event_country_values_counts)
#All non-null categories: 230

222

In [44]:
# events의 geo_location에 기재된 국가-주 카테고리 개수 (value가 10개 이상인)
event_country_state_values_counts = get_category_field_values_counts('event_country_state', events_df, min_threshold=10)
len(event_country_state_values_counts)

1892

In [45]:
# events의 geo_location 전체 개수 (value가 10개 이상인)
event_geo_location_values_counts = get_category_field_values_counts('geo_location_event', events_df, min_threshold=10)
len(event_geo_location_values_counts)
#All non-null categories: 2988

2273

In [46]:
# documents_entities의 entity_id 전체 개수 (value가 10개 이상인)
doc_entity_id_values_counts = get_category_field_values_counts('entity_id', documents_entities_df, min_threshold=10)
len(doc_entity_id_values_counts)
#All non-null categories: 1326009

52439

## Processing average CTR by categories

In [47]:
def get_percentiles(df, field, quantiles_levels=None, max_error_rate=0.0):
    if quantiles_levels == None:
        quantiles_levels = np.arange(0.0, 1.1, 0.1).tolist() # 0.0, 0.1,..., 1.0
    quantiles = df.approxQuantile(field, quantiles_levels, max_error_rate)
    return dict(zip(quantiles_levels, quantiles))
  
# approxQuantile() : Calculates the approximate quantiles of a numerical column of a DataFrame
# dict(zip()) : {'food': 'spam', 'age': 42, 'name': 'Monty'}
# quantile: value 형태로 반환
# udf는 우리가 필요한 새로운 컬럼 기반의 함수를 만들어준다. spark에서 사용자 정의 함수는 functions 객체의 udf 함수로 생성한다

In [48]:
# REG = 10
REG = 0
ctr_udf = F.udf(lambda clicks, views: clicks / float(views + REG), FloatType())

# ctr_udf : ctr을 구하는 사용자 정의 함수
# REG : Regularized Click Probability, 아래 링크 참고
# penalizes ads with small amounts of data, making it prefer an ad with large amounts of training data and a reliable probability
# http://cs229.stanford.edu/proj2016/report/Guo-PredictCommercialPromotedContentsWillBeClickedByUser-report.pdf

## Average CTR by ad_id

In [49]:
ad_id_popularity_df = train_set_df.groupby('ad_id').agg(F.sum('clicked').alias('clicks'), 
                                                               F.count('*').alias('views')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))
# F.count('*') : 그룹에 포함된 항목 개수를 반환
# withColumn(x, y) : x는 컬럼명, y는 컬럼(형태)
# ad_id_popularity_df는, ad_id로 묶고 clicks와 views 계산 -> ad_id 별로 ctr 구함

In [72]:
ad_id_popularity_df.show(3)

+------+------+-----+----------+
| ad_id|clicks|views|       ctr|
+------+------+-----+----------+
|235447|   923|13842|0.06668112|
|134205|    78|  244|0.31967214|
|324400|     9|   87|0.10344828|
+------+------+-----+----------+
only showing top 3 rows

+------+------+-----+----------+
| ad_id|clicks|views|       ctr|
+------+------+-----+----------+
|235447|   923|13842|0.06668112|
|134205|    78|  244|0.31967214|
|324400|     9|   87|0.10344828|
+------+------+-----+----------+
only showing top 3 rows



In [50]:
# ad_id_popularity_df 요소 개수 반환
ad_id_popularity_df.count()

418295

In [348]:
#get_percentiles(ad_id_popularity_df, 'clicks')

In [349]:
#get_percentiles(ad_id_popularity_df, 'views')

In [51]:
# ad_id_popularity는 ad_id별 ctr을 가지고 있는 ad_id_popularity_df에서 views 수가 5 이상인 것 추출
# ad_id를 key로 (ctr, views, 1, 1) 반환

ad_id_popularity = ad_id_popularity_df.filter('views > 5').select('ad_id', 'ctr', 'views') \
                    .rdd.map(lambda x: (x['ad_id'], (x['ctr'], x['views'], 1, 1))).collectAsMap()

In [52]:
# sc.broadcast : 스파크는 효과적인 브로드캐스트 알고리즘을 이용하여 커뮤니케이션의 비용을 줄이는 방향으로 분배를 시도한다

ad_id_popularity_broad = sc.broadcast(ad_id_popularity)

In [241]:
list(ad_id_popularity.keys())[:3]

[2, 3, 4]

In [236]:
# ad_id_popularity의 value들을 리스트로 담고 최초 3개 출력

list(ad_id_popularity.values())[:3]

[(0.07692307978868484, 13, 1, 1),
 (0.08661417663097382, 127, 1, 1),
 (0.11999999731779099, 25, 1, 1)]

In [242]:
# ad_id_popularity의 길이 반환

len(ad_id_popularity)

192107

In [243]:
#get_ad_id_ctr_udf = F.udf(lambda ad_id: ad_id_popularity[ad_id] if ad_id in ad_id_popularity else -1, FloatType())

In [53]:
# ad_id 전체의 avg_ctr, ad_id별 ctr을 다 더해서 전체 개수로 나눔
# map(lambda()) 형태의 이해 ↓
# Input : li = [1, 2, 3]
# Output : result = [1, 4, 9]
# 풀이 : result = list(map(lambda i: i ** 2 , li))

ad_id_avg_ctr = sum(map(lambda x: x[0], ad_id_popularity.values())) / float(len(ad_id_popularity))
ad_id_avg_ctr

0.1552830593979102

In [54]:
# ad_id의 weighted_avg_ctr

ad_id_weighted_avg_ctr = sum(map(lambda x: x[0]*x[1], ad_id_popularity.values())) / float(sum(map(lambda x: x[1], ad_id_popularity.values())))
ad_id_weighted_avg_ctr

0.19405337738275

In [55]:
ad_id_views_median = np.median(np.array(list(map(lambda x: x[1], ad_id_popularity.values()))))
ad_id_views_median

18.0

In [56]:
ad_id_views_mean = sum(map(lambda x: x[1], ad_id_popularity.values())) / float(len(ad_id_popularity))
ad_id_views_mean

308.59291436543174

## Average CTR by document_id (promoted_content)

In [57]:
document_id_popularity_df = train_set_df.groupby('document_id_promo').agg(F.sum('clicked').alias('clicks'), 
                                                               F.count('*').alias('views'),
                                                               F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))
    
document_id_popularity = document_id_popularity_df.filter('views > 5').select('document_id_promo', 'ctr', 'views', 'distinct_ad_ids') \
                                                .rdd.map(lambda x: (x['document_id_promo'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()
len(document_id_popularity)

74766

In [63]:
document_id_popularity_df.show(3)

+-----------------+------+-----+---------------+----------+
|document_id_promo|clicks|views|distinct_ad_ids|       ctr|
+-----------------+------+-----+---------------+----------+
|          1071762|   133|  329|             19|0.40425533|
|           959407|    10|  121|             14|0.08264463|
|           971908|  1472| 8640|             37|0.17037037|
+-----------------+------+-----+---------------+----------+
only showing top 3 rows



In [58]:
document_id_popularity_broad = sc.broadcast(document_id_popularity)

In [360]:
#document_id_popularity_df.count()

In [361]:
#get_percentiles(document_id_popularity_df, 'clicks')

In [362]:
#get_percentiles(document_id_popularity_df, 'views')

In [59]:
document_id_avg_ctr = sum(map(lambda x: x[0], document_id_popularity.values())) / float(len(document_id_popularity))
document_id_avg_ctr

0.15048812200823822

In [60]:
document_id_weighted_avg_ctr = sum(list(map(lambda x: x[0]*x[1], document_id_popularity.values()))) / float(sum(list(map(lambda x: x[1], document_id_popularity.values()))))
document_id_weighted_avg_ctr

0.19380676920446974

In [61]:
document_id_views_median = np.median(np.array(list(map(lambda x: x[1], document_id_popularity.values()))))
document_id_views_median

28.0

In [62]:
document_id_views_mean = sum(map(lambda x: x[1], document_id_popularity.values())) / float(len(document_id_popularity))
document_id_views_mean

797.3970253858706

## Average CTR by (doc_event, doc_ad)

In [70]:
doc_event_doc_ad_avg_ctr_df = train_set_df.groupBy('document_id_event', 'document_id_promo') \
                                    .agg(F.sum('clicked').alias('clicks'), 
                                         F.count('*').alias('views'),
                                         F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                    .withColumn('ctr', ctr_udf('clicks','views'))
        
doc_event_doc_ad_avg_ctr = doc_event_doc_ad_avg_ctr_df.filter('views > 5') \
                    .select('document_id_event', 'document_id_promo','ctr', 'views', 'distinct_ad_ids') \
                    .rdd.map(lambda x: ((x['document_id_event'], x['document_id_promo']), (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()        

len(doc_event_doc_ad_avg_ctr)

1302421

1302421

In [73]:
doc_event_doc_ad_avg_ctr_df.show(3)

+-----------------+-----------------+------+-----+---------------+----------+
|document_id_event|document_id_promo|clicks|views|distinct_ad_ids|       ctr|
+-----------------+-----------------+------+-----+---------------+----------+
|           829678|          1060089|     6|   25|              2|      0.24|
|          1586626|          1705546|    45|  210|              1|0.21428572|
|          2054862|          1409794|    16|  676|              1|0.02366864|
+-----------------+-----------------+------+-----+---------------+----------+
only showing top 3 rows

+-----------------+-----------------+------+-----+---------------+----------+
|document_id_event|document_id_promo|clicks|views|distinct_ad_ids|       ctr|
+-----------------+-----------------+------+-----+---------------+----------+
|           829678|          1060089|     6|   25|              2|      0.24|
|          1586626|          1705546|    45|  210|              1|0.21428572|
|          2054862|          1409794|  

In [79]:
doc_event_doc_ad_avg_ctr_broad = sc.broadcast(doc_event_doc_ad_avg_ctr)

## Average CTR by country, source_id

In [80]:
source_id_by_country_popularity_df = train_set_df.select('clicked', 'source_id', 'event_country', 'ad_id') \
                                            .groupby('event_country', 'source_id').agg(F.sum('clicked').alias('clicks'), 
                                                                             F.count('*').alias('views'),
                                                                             F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))
        
#source_id_popularity = source_id_popularity_df.filter('views > 100 and source_id is not null').select('source_id', 'ctr').rdd.collectAsMap()
source_id_by_country_popularity = source_id_by_country_popularity_df.filter('views > 5 and source_id is not null and event_country <> ""').select('event_country', 'source_id', 'ctr', 'views', 'distinct_ad_ids') \
        .rdd.map(lambda x: ((x['event_country'], x['source_id']), (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()
len(source_id_by_country_popularity)

29856

In [98]:
list(source_id_by_country_popularity.keys())[:10]

[(u'CO', 13256),
 (u'EG', 9414),
 (u'MX', 6560),
 (u'FI', 11808),
 (u'BD', 144),
 (u'KR', 2413),
 (u'PH', 10029),
 (u'GB', 11739),
 (u'US', 10127),
 (u'CN', 3421)]

In [99]:
list(source_id_by_country_popularity.values())[:10]

[(0.28018224239349365, 439, 3, 1),
 (0.2083333283662796, 24, 1, 1),
 (0.24137930572032928, 29, 1, 1),
 (0.08588957041501999, 163, 9, 1),
 (0.056338027119636536, 71, 2, 1),
 (0.23880596458911896, 67, 12, 1),
 (0.12181111425161362, 5919, 34, 1),
 (0.10822510719299316, 231, 3, 1),
 (0.2380952388048172, 21, 3, 1),
 (0.0, 12, 1, 1)]

In [81]:
source_id_by_country_popularity_broad = sc.broadcast(source_id_by_country_popularity)

In [82]:
source_id_by_country_avg_ctr = sum(map(lambda x: x[0], source_id_by_country_popularity.values())) / float(len(source_id_by_country_popularity))
source_id_by_country_avg_ctr

0.18602978924640837

In [83]:
source_id_by_country_weighted_avg_ctr = sum(map(lambda x: x[0]*x[1], source_id_by_country_popularity.values())) / float(sum(map(lambda x: x[1], source_id_by_country_popularity.values())))
source_id_by_country_weighted_avg_ctr

0.19364919749745055

In [84]:
source_id_by_country_views_median = np.median(np.array(list(map(lambda x: x[1], source_id_by_country_popularity.values()))))
source_id_by_country_views_median

38.0

In [85]:
source_id_by_country_views_mean = sum(map(lambda x: x[1], source_id_by_country_popularity.values())) / float(len(source_id_by_country_popularity))
source_id_by_country_views_mean

1999.1523311897106

## Average CTR by source_id

In [86]:
source_id_popularity_df = train_set_df.select('clicked', 'source_id', 'ad_id') \
                                            .groupby('source_id').agg(F.sum('clicked').alias('clicks'), 
                                                                     F.count('*').alias('views'),
                                                                     F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))
        
source_id_popularity = source_id_popularity_df.filter('views > 10 and source_id is not null').select('source_id', 'ctr', 'views', 'distinct_ad_ids') \
                            .rdd.map(lambda x: (x['source_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()
len(source_id_popularity)

5628

In [87]:
source_id_popularity_broad = sc.broadcast(source_id_popularity)

In [100]:
list(source_id_popularity.keys())[:10]

[2, 3, 4, 5, 6, 7, 9, 10, 13, 14]

In [101]:
list(source_id_popularity.values())[:10]

[(0.185546875, 512, 12, 1),
 (0.2708333432674408, 48, 7, 1),
 (0.1348569095134735, 9714, 94, 1),
 (0.10801393538713455, 1435, 23, 1),
 (0.2247454673051834, 54119, 53, 1),
 (0.20213228464126587, 13788, 39, 1),
 (0.115292027592659, 19108, 129, 1),
 (0.1080050840973854, 787, 24, 1),
 (0.18909889459609985, 11375, 6, 1),
 (0.07083333283662796, 480, 3, 1)]

In [223]:
#source_id_popularity_df.count()

In [None]:
#get_percentiles(source_id_popularity_df, 'clicks')

In [None]:
#get_percentiles(source_id_popularity_df, 'views')

In [None]:
#source_id_popularity = source_id_popularity_df.filter('views > 100 and source_id is not null').select('source_id', 'ctr').rdd.collectAsMap()

## Average CTR by publisher_id

In [88]:
publisher_popularity_df = train_set_df.select('clicked', 'publisher_id', 'ad_id') \
                                            .groupby('publisher_id').agg(F.sum('clicked').alias('clicks'), 
                                                                             F.count('*').alias('views'),
                                                                              F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))
        
publisher_popularity = publisher_popularity_df.filter('views > 10 and publisher_id is not null').select('publisher_id', 'ctr', 'views', 'distinct_ad_ids') \
                            .rdd.map(lambda x: (x['publisher_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()
len(publisher_popularity)

723

In [89]:
publisher_popularity_broad = sc.broadcast(publisher_popularity)

In [104]:
list(publisher_popularity.keys())[:10]

[2, 3, 4, 5, 6, 9, 11, 12, 13, 14]

In [103]:
list(publisher_popularity.values())[:10]

[(0.20435592532157898, 20386, 58, 1),
 (0.2170737385749817, 12733, 9, 1),
 (0.20462098717689514, 6319, 10, 1),
 (0.46600332856178284, 11457, 59, 1),
 (0.13055281341075897, 8104, 32, 1),
 (0.2990143597126007, 172373, 5895, 1),
 (0.0833333358168602, 12, 1, 1),
 (0.1080050840973854, 787, 24, 1),
 (0.28918322920799255, 453, 3, 1),
 (0.24315619468688965, 1242, 1, 1)]

In [None]:
#publisher_popularity_df.count()
##863

In [None]:
#get_percentiles(publisher_popularity_df, 'clicks')

In [None]:
#get_percentiles(publisher_popularity_df, 'views')

In [None]:
#publisher_id_popularity = publisher_popularity_df.filter('views > 100 and publisher_id is not null').select('publisher_id', 'ctr').rdd.collectAsMap()
#len(publisher_id_popularity)
##639

## Average CTR by advertiser_id

In [90]:
advertiser_id_popularity_df = train_set_df.select('clicked', 'advertiser_id', 'ad_id') \
                                            .groupby('advertiser_id').agg(F.sum('clicked').alias('clicks'), 
                                                                          F.count('*').alias('views'),
                                                                          F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))
        
advertiser_id_popularity = advertiser_id_popularity_df.filter('views > 10 and advertiser_id is not null').select('advertiser_id', 'ctr', 'views', 'distinct_ad_ids') \
                            .rdd.map(lambda x: (x['advertiser_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()
len(advertiser_id_popularity)

3620

In [91]:
advertiser_id_popularity_broad = sc.broadcast(advertiser_id_popularity)

In [106]:
list(advertiser_id_popularity.keys())[:10]

[2, 3, 4, 5, 6, 7, 8, 9, 10, 12]

In [107]:
list(advertiser_id_popularity.values())[:10]

[(0.06451612710952759, 31, 2, 1),
 (0.17577283084392548, 5564, 12, 1),
 (0.2574625015258789, 21005, 153, 1),
 (0.19177444279193878, 143820, 41, 1),
 (0.1421319842338562, 197, 14, 1),
 (0.20100592076778412, 5567, 87, 1),
 (0.2995377779006958, 32884, 440, 1),
 (0.3259281814098358, 26423, 224, 1),
 (0.2201203852891922, 1163, 6, 1),
 (0.31559163331985474, 63053, 18, 1)]

In [None]:
#advertiser_id_popularity_df.count()
##4063

In [None]:
#get_percentiles(advertiser_id_popularity_df, 'clicks')

In [None]:
#get_percentiles(advertiser_id_popularity_df, 'views')

In [None]:
#advertiser_id_popularity = advertiser_id_popularity_df.filter('views > 100 and advertiser_id is not null').select('advertiser_id', 'ctr').rdd.collectAsMap()
#len(advertiser_id_popularity)
##3129

## Average CTR by campaign_id

In [92]:
campaign_id_popularity_df = train_set_df.select('clicked', 'campaign_id', 'ad_id') \
                                            .groupby('campaign_id').agg(F.sum('clicked').alias('clicks'), 
                                                                        F.count('*').alias('views'),
                                                                        F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))
        
campaign_id_popularity = campaign_id_popularity_df.filter('views > 10 and campaign_id is not null').select('campaign_id', 'ctr', 'views', 'distinct_ad_ids') \
                            .rdd.map(lambda x: (x['campaign_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()
len(campaign_id_popularity)

25270

In [93]:
campaign_id_popularity_broad = sc.broadcast(campaign_id_popularity)

In [108]:
list(campaign_id_popularity.keys())[:10]

[1, 2, 3, 4, 5, 6, 7, 8, 10, 11]

In [109]:
list(campaign_id_popularity.values())[:10]

[(0.23687681555747986, 3791, 28, 1),
 (0.11906449496746063, 1411, 54, 1),
 (0.12451361864805222, 257, 2, 1),
 (0.2747933864593506, 6292, 201, 1),
 (0.02298850566148758, 87, 12, 1),
 (0.3658314347267151, 2195, 4, 1),
 (0.03232758492231369, 464, 8, 1),
 (0.09447415173053741, 561, 23, 1),
 (0.05645161122083664, 124, 19, 1),
 (0.2094017118215561, 234, 15, 1)]

In [None]:
#campaign_id_popularity_df.count()
##31390

In [None]:
#get_percentiles(campaign_id_popularity_df, 'clicks')

In [None]:
#get_percentiles(campaign_id_popularity_df, 'views')

In [None]:
#campaign_id_popularity = campaign_id_popularity_df.filter('views > 100 and campaign_id is not null').select('campaign_id', 'ctr').rdd.collectAsMap()
#len(campaign_id_popularity)
##16097

## Average CTR by category

In [94]:
category_id_popularity_df = train_set_df.join(documents_categories_df.alias('cat_local'), on=F.col("document_id_promo") == F.col("cat_local.document_id_cat"), how='inner') \
                                        .select('clicked', 'category_id', 'confidence_level_cat', 'ad_id') \
                                        .groupby('category_id').agg(F.sum('clicked').alias('clicks'), 
                                                                    F.count('*').alias('views'),
                                                                    F.mean('confidence_level_cat').alias('avg_confidence_level_cat'),
                                                                    F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))
            
category_id_popularity = category_id_popularity_df.filter('views > 10').select('category_id', 'ctr', 'views', 'avg_confidence_level_cat', 'distinct_ad_ids') \
                            .rdd.map(lambda x: (x['category_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_cat']))).collectAsMap()
len(category_id_popularity)

95

In [95]:
category_id_popularity_broad = sc.broadcast(category_id_popularity)

In [272]:
list(category_id_popularity.keys())[:10]

[2100, 1600, 1602, 1603, 1604, 1605, 1606, 1607, 1608, 1609]

In [271]:
list(category_id_popularity.values())[:10]

[(0.20317520201206207, 2629381, 25586, 0.4870458696523287),
 (0.2013949453830719, 10323, 44, 0.5),
 (0.2029319703578949, 1008811, 9187, 0.3881761040784007),
 (0.1313016414642334, 727310, 5295, 0.2439268331086716),
 (0.17418651282787323, 920519, 6850, 0.5333781843415426),
 (0.25920549035072327, 78513, 610, 0.5965200782357787),
 (0.2357751578092575, 692503, 4638, 0.5506155645951468),
 (0.24124911427497864, 178782, 2234, 0.5543352849187084),
 (0.21674630045890808, 1220676, 6280, 0.4995168007977859),
 (0.19554254412651062, 2172586, 13408, 0.4344011978646628)]

In [110]:
# 전체 views의 중앙값
np.median(np.array(list(map(lambda x: x[1], category_id_popularity.values()))))

692503.0

In [111]:
# 전체 views의 평균, 모든 views를 다 더해서 개수로 나눔
sum(map(lambda x: x[1], category_id_popularity.values())) / float(len(category_id_popularity))

1246354.6736842105

In [278]:
1246354.6736842105/692503.0

1.7997823456132471

In [92]:
#Parece haver uma hierarquia nas categorias pelo padrão dos códigos...
#기본적으로 카테고리에 계층 구조가 있는 것 같습니다
#category_id_popularity

## Average CTR by (country, category)

In [112]:
category_id_by_country_popularity_df = train_set_df.join(documents_categories_df.alias('cat_local'), on=F.col("document_id_promo") == F.col("cat_local.document_id_cat"), how='inner') \
                                        .select('clicked', 'category_id', 'confidence_level_cat', 'event_country', 'ad_id') \
                                        .groupby('event_country','category_id').agg(F.sum('clicked').alias('clicks'), 
                                                                                    F.count('*').alias('views'),
                                                                                    F.mean('confidence_level_cat').alias('avg_confidence_level_cat'),
                                                                                    F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

category_id_by_country_popularity = category_id_by_country_popularity_df.filter('views > 10 and event_country <> ""').select('event_country', 'category_id', 'ctr', 'views', 'avg_confidence_level_cat', 'distinct_ad_ids') \
                                     .rdd.map(lambda x: ((x['event_country'], x['category_id']), (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_cat']))).collectAsMap()
len(category_id_by_country_popularity)

10987

In [113]:
list(category_id_by_country_popularity.keys())[:10]

[(u'CN', 1406),
 (u'SZ', 1205),
 (u'VI', 1806),
 (u'LB', 1512),
 (u'NZ', 1208),
 (u'GY', 1209),
 (u'JO', 1902),
 (u'BD', 1612),
 (u'VE', 1710),
 (u'AP', 1912)]

In [114]:
list(category_id_by_country_popularity.values())[:10]

[(0.2261904776096344, 84, 35, 0.3214457171658675),
 (0.0972222238779068, 144, 65, 0.6175835872482922),
 (0.44999998807907104, 20, 12, 0.3357881270349026),
 (0.30000001192092896, 30, 6, 0.1443000006179015),
 (0.25438597798347473, 684, 9, 0.08283828371013814),
 (0.12121212482452393, 66, 26, 0.7296096152541313),
 (0.19780220091342926, 91, 3, 0.5724362015396685),
 (0.31991294026374817, 919, 38, 0.1952407232630629),
 (0.5, 12, 1, 0.07000000029802322),
 (0.1538461595773697, 13, 8, 0.3252715221964396)]

In [115]:
category_id_by_country_popularity_broad = sc.broadcast(category_id_by_country_popularity)

## Average CTR by Topic

In [116]:
topic_id_popularity_df = train_set_df.join(documents_topics_df.alias('top_local'), on=F.col("document_id_promo") == F.col("top_local.document_id_top"), how='inner') \
                                        .select('clicked', 'topic_id', 'confidence_level_top', 'ad_id') \
                                        .groupby('topic_id').agg(F.sum('clicked').alias('clicks'), 
                                                                 F.count('*').alias('views'),
                                                                 F.mean('confidence_level_top').alias('avg_confidence_level_top'),
                                                                 F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))
topic_id_popularity = topic_id_popularity_df.filter('views > 10').select('topic_id', 'ctr', 'views', 'avg_confidence_level_top', 'distinct_ad_ids') \
                            .rdd.map(lambda x: (x['topic_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_top']))).collectAsMap()
len(topic_id_popularity)

300

In [117]:
topic_id_popularity_broad = sc.broadcast(topic_id_popularity)

In [118]:
list(topic_id_popularity.keys())[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [119]:
list(topic_id_popularity.values())[:10]

[(0.22373904287815094, 453287, 2647, 0.12233456528386759),
 (0.18213681876659393, 504714, 3267, 0.07418431459465535),
 (0.16457995772361755, 556939, 5764, 0.09277217948046136),
 (0.2484738975763321, 109593, 1110, 0.1088232609985956),
 (0.25570839643478394, 98933, 951, 0.05725601736188657),
 (0.12132325023412704, 289046, 2309, 0.013733238268172656),
 (0.2379174381494522, 181046, 1129, 0.16137225879193712),
 (0.15632249414920807, 62934, 523, 0.12299769651588695),
 (0.2085820436477661, 1491159, 19499, 0.05025825825301043),
 (0.1834765374660492, 241148, 2707, 0.05404990259875808)]

In [120]:
# 전체 토픽 평균 views : 전체 토픽 views의 합 / 전체 토픽 개수
sum(map(lambda x: x[1], topic_id_popularity.values())) / float(len(topic_id_popularity))

526640.4966666667

In [121]:
# views * distinct_ad_ids의 합 / 전체 토픽 개수
sum(map(lambda x: x[2]*x[1], topic_id_popularity.values())) / float(len(topic_id_popularity))

6998657300.896667

## Average CTR by (country, topic)

In [123]:
topic_id_by_country_popularity_df = train_set_df.join(documents_topics_df.alias('top_local'), on=F.col("document_id_promo") == F.col("top_local.document_id_top"), how='inner') \
                                        .select('clicked', 'topic_id', 'confidence_level_top','event_country', 'ad_id') \
                                        .groupby('event_country','topic_id').agg(F.sum('clicked').alias('clicks'), 
                                                                             F.count('*').alias('views'),
                                                                             F.mean('confidence_level_top').alias('avg_confidence_level_top'),
                                                                             F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))
            
topic_id_id_by_country_popularity = topic_id_by_country_popularity_df.filter('views > 10 and event_country <> ""').select('event_country', 'topic_id', 'ctr', 'views', 'avg_confidence_level_top', 'distinct_ad_ids') \
                            .rdd.map(lambda x: ((x['event_country'], x['topic_id']), (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_top']))).collectAsMap()
len(topic_id_id_by_country_popularity)

33071

In [125]:
list(topic_id_id_by_country_popularity.keys())[:10]

[(u'GA', 249),
 (u'GH', 101),
 (u'DE', 291),
 (u'CL', 133),
 (u'PA', 140),
 (u'UA', 3),
 (u'LR', 82),
 (u'BM', 198),
 (u'ZM', 150),
 (u'BY', 140)]

In [126]:
list(topic_id_id_by_country_popularity.values())[:10]

[(0.0833333358168602, 12, 9, 0.0907062344873945),
 (0.08361203968524933, 299, 10, 0.21701358435644355),
 (0.1539115607738495, 1176, 62, 0.053913121271663075),
 (0.4545454680919647, 11, 5, 0.01054142720320008),
 (0.19164618849754333, 407, 89, 0.18767972210655562),
 (0.38461539149284363, 13, 4, 0.06967360698259793),
 (0.2666666805744171, 15, 6, 0.014811059397955736),
 (0.14601770043373108, 226, 52, 0.13805182779669367),
 (0.4516128897666931, 31, 11, 0.013377583525594204),
 (0.2586206793785095, 58, 35, 0.17077118922667256)]

In [127]:
topic_id_id_by_country_popularity_broad = sc.broadcast(topic_id_id_by_country_popularity)

## Average CTR by Entity

In [128]:
entity_id_popularity_df = train_set_df.join(documents_entities_df.alias('ent_local'), on=F.col("document_id_promo") == F.col("ent_local.document_id_ent"), how='inner') \
                                        .select('clicked', 'entity_id', 'confidence_level_ent', 'ad_id') \
                                        .groupby('entity_id').agg(F.sum('clicked').alias('clicks'), 
                                                                  F.count('*').alias('views'),
                                                                  F.mean('confidence_level_ent').alias('avg_confidence_level_ent'),
                                                                  F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

entity_id_popularity = entity_id_popularity_df.filter('views > 5').select('entity_id', 'ctr', 'views', 'avg_confidence_level_ent', 'distinct_ad_ids') \
                                     .rdd.map(lambda x: (x['entity_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_ent']))).collectAsMap()
len(entity_id_popularity)

78120

In [129]:
entity_id_popularity_broad = sc.broadcast(entity_id_popularity)

In [130]:
list(entity_id_popularity.keys())[:10]

[u'3e8ea3b1759314b629ccd4cbd059a06f',
 u'b4dc3ba36d4252202188f7c03cc69f64',
 u'a6fbaeedbce3404c1c54e9f55608fe7e',
 u'945cfbbb25dd0d3d9ffa685272bd10cd',
 u'd7da908ad6f7afb1f783429230b0cf7e',
 u'b1139aedfdb8e2226e5a6c89253883f4',
 u'7c006c07568c0c2f6e47a3df621f137a',
 u'8c8deecf1708520f50cee5311b3c4ff5',
 u'1fe460f6b59600823c9c9b2938d073bb',
 u'169ce6cd2c77c3b017c1495f3fb4b00a']

In [131]:
list(entity_id_popularity.values())[:10]

[(0.13333334028720856, 15, 1, 0.7421413660049438),
 (0.16304348409175873, 92, 6, 0.24286991357803345),
 (0.0, 9, 1, 0.9145960211753845),
 (0.2222222238779068, 45, 4, 0.6937966810332404),
 (0.23999999463558197, 25, 6, 0.25364906430244444),
 (0.13786764442920685, 544, 2, 0.38677939772605896),
 (0.0476190485060215, 21, 4, 0.5017787218093872),
 (0.125, 8, 1, 0.3097444176673889),
 (0.125, 8, 3, 0.5733535885810852),
 (0.17910447716712952, 67, 5, 0.2566896065402387)]

In [132]:
# entity별 전체 views의 중앙값
np.median(np.array(list(map(lambda x: x[1], entity_id_popularity.values()))))

48.0

In [291]:
# entity 별 views의 합 / entity 개수, 즉 평균
sum(map(lambda x: x[1], entity_id_popularity.values())) / float(len(entity_id_popularity))

1915.9761776753712

## Average CTR by (country, entity)

In [133]:
entity_id_by_country_popularity_df = train_set_df.join(documents_entities_df.alias('ent_local'), on=F.col("document_id_promo") == F.col("ent_local.document_id_ent"), how='inner') \
                                        .select('clicked', 'entity_id', 'event_country', 'confidence_level_ent','ad_id') \
                                        .groupby('event_country','entity_id').agg(F.sum('clicked').alias('clicks'), 
                                                                             F.count('*').alias('views'),
                                                                             F.mean('confidence_level_ent').alias('avg_confidence_level_ent'),
                                                                             F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))
            
entity_id_by_country_popularity = entity_id_by_country_popularity_df.filter('views > 5 and event_country <> ""').select('event_country', 'entity_id', 'ctr', 'views', 'avg_confidence_level_ent', 'distinct_ad_ids') \
                .rdd.map(lambda x: ((x['event_country'], x['entity_id']), (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_ent']))).collectAsMap()
len(entity_id_by_country_popularity)

217703

In [134]:
list(entity_id_by_country_popularity.keys())[:10]

[(u'SN', u'b55d56364a5c5b0b268321ac5f30552c'),
 (u'LT', u'c0eed48c55b69811f18ac582895cec93'),
 (u'AU', u'2b2470e629e3633deaf5179e8f499e80'),
 (u'GB', u'25f15293eacd8a96042c80e1171210c4'),
 (u'SG', u'966dbe1dca1afe49d5a7a1795213415b'),
 (u'US', u'b61639c0c2a77e87802861097a9a79d2'),
 (u'GB', u'65a87782839eb973ac0cf3dc159c0cd0'),
 (u'LK', u'786f9062879038e9984db1858fc5c8dd'),
 (u'SN', u'fa69c242afc1cf0fe4edb74bae206283'),
 (u'US', u'e4b591efca5fdff326410e9a2a0b0794')]

In [135]:
list(entity_id_by_country_popularity.values())[:10]

[(0.0, 6, 1, 0.3427562117576599),
 (0.1666666716337204, 6, 3, 0.47326989471912384),
 (0.2705882489681244, 85, 8, 0.33241735100746156),
 (0.1304347813129425, 46, 1, 0.31109827756881714),
 (0.380952388048172, 21, 3, 0.825154877844311),
 (0.09090909361839294, 11, 1, 0.833453893661499),
 (0.12666666507720947, 150, 1, 0.2343740165233612),
 (0.12935324013233185, 201, 11, 0.019996220245957375),
 (0.1111111119389534, 9, 1, 0.8727741241455078),
 (0.0, 8, 2, 0.6611085534095764)]

In [136]:
entity_id_by_country_popularity_broad = sc.broadcast(entity_id_by_country_popularity)

## Loading # docs by categories, topics, entities

In [137]:
import cPickle
# import _pickle as cPickle

In [138]:
df_filenames_suffix = ''
if evaluation:
    df_filenames_suffix = '_eval'

In [139]:
# with open('aux_data/categories_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file:
with open('categories_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file:
    categories_docs_counts = cPickle.load(input_file)    
len(categories_docs_counts)

97

In [140]:
# with open('aux_data/topics_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file:
with open('topics_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file:
    topics_docs_counts = cPickle.load(input_file)
len(topics_docs_counts)

300

In [141]:
# with open('aux_data/entities_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file:
with open('entities_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file:
    entities_docs_counts = cPickle.load(input_file)
len(entities_docs_counts)

1326009

In [143]:
list(entities_docs_counts.keys())[:10]

[u'72f364c8af13913d19bd803a3584228c',
 u'87839ac20cc1d1d9f212ba6bcabab3d2',
 u'c5b15375624f83017b24f53a09883a3b',
 u'cd9a81360ff3e85545806c1f4b61932a',
 u'0e61af952934cb5fdcf2772950bc90d5',
 u'61e6c101e169f084ff8badb2846856ee',
 u'1e35fc7b6ee074f40fd49367744b00d6',
 u'e2b787342f945807c058e86616c1a176',
 u'0ff307d67fda165a70301531dbf1bfa3',
 u'e758b2e0d801c21374fa51fff05f71f2']

In [144]:
# 각 entity에 해당하는 document_id의 개수
list(entities_docs_counts.values())[:10]

[3, 1, 32, 1, 2, 1, 1, 3, 1, 1]

In [146]:
documents_total = documents_meta_df.count()
documents_total

2999334

## Exploring Publish Time

In [147]:
publish_times_df = train_set_df.filter('publish_time is not null').select('document_id_promo','publish_time').distinct().select(F.col('publish_time').cast(IntegerType()))
publish_time_percentiles = get_percentiles(publish_times_df, 'publish_time', quantiles_levels=[0.5], max_error_rate=0.001)
publish_time_percentiles

{0.5: 1464109200.0}

In [148]:
publish_time_median = int(publish_time_percentiles[0.5])
datetime.datetime.utcfromtimestamp(publish_time_median)

datetime.datetime(2016, 5, 24, 17, 0)

In [149]:
def get_days_diff(newer_timestamp, older_timestamp):
    sec_diff = newer_timestamp - older_timestamp
    days_diff = sec_diff / 60 / 60 / 24
    return days_diff

def get_time_decay_factor(timestamp, timestamp_ref=None, alpha=0.001):
    if timestamp_ref == None:
        timestamp_ref = time.time()
        
    days_diff = get_days_diff(timestamp_ref, timestamp)
    denominator = math.pow(1+alpha, days_diff)
    if denominator != 0:
        return 1.0 / denominator
    else:
        return 0.0

In [150]:
def convert_odd_timestamp(timestamp_ms_relative):
    TIMESTAMP_DELTA=1465876799998
    return datetime.datetime.fromtimestamp((int(timestamp_ms_relative)+TIMESTAMP_DELTA)//1000)

In [151]:
TIME_DECAY_ALPHA = 0.0005

In [152]:
ref_dates = [
                1476714880, # 7 days
                1474727680, # 30 days
                1469370880, # 90 days
                1461508480,  # 180 days
                1445697280, # 1 year
                1414161280 # 2 years
]

for d in ref_dates:
    print(datetime.datetime.utcfromtimestamp(d), get_time_decay_factor(d, alpha=TIME_DECAY_ALPHA))

(datetime.datetime(2016, 10, 17, 14, 34, 40), 0.6819865993811997)
(datetime.datetime(2016, 9, 24, 14, 34, 40), 0.6741906151274285)
(datetime.datetime(2016, 7, 24, 14, 34, 40), 0.653616396779389)
(datetime.datetime(2016, 4, 24, 14, 34, 40), 0.6245503815378628)
(datetime.datetime(2015, 10, 24, 14, 34, 40), 0.5699535513734337)
(datetime.datetime(2014, 10, 24, 14, 34, 40), 0.474898206273678)


## Get local time

In [153]:
DEFAULT_TZ_EST = -4.0

In [154]:
def get_local_utc_bst_tz(event_country, event_country_state):
    local_tz = DEFAULT_TZ_EST
    if len(event_country) > 0:
        if event_country in countries_utc_dst_broad.value:
            local_tz = countries_utc_dst_broad.value[event_country]
            if len(event_country_state)>2:
                state = event_country_state[3:5]
                if event_country == 'US':  
                    if state in us_states_utc_dst_broad.value:
                        local_tz = us_states_utc_dst_broad.value[state]                
                elif event_country == 'CA':
                    if state in ca_countries_utc_dst_broad.value:
                        local_tz = ca_countries_utc_dst_broad.value[state] 
    return float(local_tz)

In [155]:
hour_bins_dict = {'EARLY_MORNING': 1,
             'MORNING': 2,
             'MIDDAY': 3,
             'AFTERNOON': 4,
             'EVENING': 5,
             'NIGHT': 6}

hour_bins_values = sorted(hour_bins_dict.values())

In [156]:
def get_hour_bin(hour):
    if hour >= 5 and hour < 8:
        hour_bin = hour_bins_dict['EARLY_MORNING']
    elif hour >= 8 and hour < 11:
        hour_bin = hour_bins_dict['MORNING']
    elif hour >= 11 and hour < 14:
        hour_bin = hour_bins_dict['MIDDAY']
    elif hour >= 14 and hour < 19:
        hour_bin = hour_bins_dict['AFTERNOON']
    elif hour >= 19 and hour < 22:
        hour_bin = hour_bins_dict['EVENING']
    else:
        hour_bin = hour_bins_dict['NIGHT']
    return hour_bin

In [157]:
def get_local_datetime(dt, event_country, event_country_state):
    local_tz = get_local_utc_bst_tz(event_country, event_country_state)  
    tz_delta = local_tz - DEFAULT_TZ_EST
    local_time = dt +  datetime.timedelta(hours=tz_delta)
    return local_time

In [158]:
get_local_datetime(datetime.datetime.now(), 'US', 'US>CA')

datetime.datetime(2018, 11, 22, 3, 56, 53, 619146)

In [159]:
def is_weekend(dt):
    return dt.weekday() >= 5

In [160]:
is_weekend(datetime.datetime(2016, 6, 14))

False

## Average CTR functions

In [161]:
timestamp_ref = date_time_to_unix_epoch(datetime.datetime(2016, 6, 29, 3, 59, 59))
decay_factor_default = get_time_decay_factor(publish_time_median, timestamp_ref, alpha=TIME_DECAY_ALPHA)
print("decay_factor_default", decay_factor_default)

('decay_factor_default', 0.9826565333455068)


In [162]:
def get_confidence_sample_size(sample, max_for_reference=100000):
    #Avoiding overflow for large sample size
    if sample >= max_for_reference:
        return 1.0

    ref_log = math.log(1+max_for_reference, 2) # Curiosly reference in log with base 2(밑이 2인 로그) gives a slightly higher score, so I will keep
    
    return math.log(1+sample) / float(ref_log)
    
for i in [0,0.5,1,2,3,4,5,10,20,30,100,200,300,1000,2000,3000,10000,20000,30000, 50000, 90000, 100000, 500000, 900000, 1000000, 2171607]:
    print(i, get_confidence_sample_size(i))

(0, 0.0)
(0.5, 0.024411410743763327)
(1, 0.041731582304281624)
(2, 0.06614299304804495)
(3, 0.08346316460856325)
(4, 0.09689773339641579)
(5, 0.10787457535232657)
(10, 0.14436755531919657)
(20, 0.183298356035222)
(30, 0.20674645107847822)
(100, 0.2778577004917695)
(200, 0.3192904933647466)
(300, 0.34360197720285013)
(1000, 0.41594812296601125)
(2000, 0.4576496248565576)
(3000, 0.48205100545505175)
(10000, 0.5545232830964639)
(20000, 0.5962518553291584)
(30000, 0.6206622626822822)
(50000, 0.6514162003061013)
(90000, 0.6868039178501281)
(100000, 1.0)
(500000, 1.0)
(900000, 1.0)
(1000000, 1.0)
(2171607, 1.0)


In [163]:
def get_popularity(an_id, a_dict):
    return (a_dict[an_id][0], get_confidence_sample_size(a_dict[an_id][1] / float(a_dict[an_id][2])) * a_dict[an_id][3]) if an_id in a_dict else (None, None)

# return (ctr, get_confidence_sample_size(views / distinct_ad_ids) * avg_confidence_level)
# get_confidence_sample_size(views / distinct_ad_ids) * avg_confidence_level 이게 뭘까?

In [168]:
get_confidence_sample_size(504714/3267)

0.30364418447489405

In [167]:
get_popularity(1, topic_id_popularity)

(0.18213681876659393, 0.022539690355972142)

In [164]:
def get_weighted_avg_popularity_from_list(ids_list, confidence_ids_list, pop_dict):
    pops = list(filter(lambda x: x[0][0]!=None, [(get_popularity(an_id, pop_dict), confidence) for an_id, confidence in zip(ids_list, confidence_ids_list)]))
    #print("pops",pops)
    if len(pops) > 0:
        weighted_avg = sum(map(lambda x: x[0][0]*x[0][1]*x[1], pops)) / float(sum(map(lambda x: x[0][1]*x[1], pops)))
        confidence = max(map(lambda x: x[0][1]*x[1], pops))
        return weighted_avg, confidence
    else:
        return None, None

In [165]:
def get_weighted_avg_country_popularity_from_list(event_country, ids_list, confidence_ids_list, pop_dict):
    pops = list(filter(lambda x: x[0][0]!=None, [(get_popularity((event_country, an_id), pop_dict), confidence) for an_id, confidence in zip(ids_list, confidence_ids_list)]))
    
    if len(pops) > 0:
        weighted_avg = sum(map(lambda x: x[0][0]*x[0][1]*x[1], pops)) / float(sum(map(lambda x: x[0][1]*x[1], pops)))
        confidence = max(map(lambda x: x[0][1]*x[1], pops))
        return weighted_avg, confidence
    else:
        return None, None

In [166]:
def get_popularity_score(event_country, ad_id, document_id, source_id, 
                         publisher_id, advertiser_id, campaign_id, document_id_event,
                            category_ids_by_doc, cat_confidence_level_by_doc, 
                            topic_ids_by_doc, top_confidence_level_by_doc,
                            entity_ids_by_doc, ent_confidence_level_by_doc,
                            output_detailed_list=False):
    probs = []
    
    avg_ctr, confidence = get_popularity(ad_id, ad_id_popularity_broad.value)    
    if avg_ctr != None:
        probs.append(('pop_ad_id', avg_ctr, confidence))
        
    avg_ctr, confidence = get_popularity(document_id, document_id_popularity_broad.value)
    if avg_ctr != None:
        probs.append(('pop_document_id', avg_ctr, confidence))  
        
    avg_ctr, confidence = get_popularity((document_id_event, document_id), doc_event_doc_ad_avg_ctr_broad.value)
    if avg_ctr != None:
        probs.append(('pop_doc_event_doc_ad', avg_ctr, confidence))
        
        
    if source_id != -1:
        avg_ctr = None
        if event_country != '':
            avg_ctr, confidence = get_popularity((event_country, source_id), source_id_by_country_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_source_id_country', avg_ctr, confidence))
            
        avg_ctr, confidence = get_popularity(source_id, source_id_popularity_broad.value)        
        if avg_ctr != None:
            probs.append(('pop_source_id', avg_ctr, confidence))
            
            
    if publisher_id != None:
        avg_ctr, confidence = get_popularity(publisher_id, publisher_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_publisher_id', avg_ctr, confidence)) 
            
    if advertiser_id != None:
        avg_ctr, confidence = get_popularity(advertiser_id, advertiser_id_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_advertiser_id', avg_ctr, confidence)) 
    
    if campaign_id != None:
        avg_ctr, confidence = get_popularity(campaign_id, campaign_id_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_campain_id', avg_ctr, confidence))  

    if len(entity_ids_by_doc) > 0: 
        avg_ctr = None
        if event_country != '':
            avg_ctr, confidence = get_weighted_avg_country_popularity_from_list(event_country, entity_ids_by_doc, ent_confidence_level_by_doc, 
                                        entity_id_by_country_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_entity_id_country', avg_ctr, confidence))
            
        avg_ctr, confidence = get_weighted_avg_popularity_from_list(entity_ids_by_doc, ent_confidence_level_by_doc, 
                                                                    entity_id_popularity_broad.value) 
        if avg_ctr != None:
            probs.append(('pop_entity_id', avg_ctr, confidence))
            
    
    
    if len(topic_ids_by_doc) > 0:  
        avg_ctr = None
        if event_country != '':
            avg_ctr, confidence = get_weighted_avg_country_popularity_from_list(event_country, topic_ids_by_doc, top_confidence_level_by_doc, 
                                        topic_id_id_by_country_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_topic_id_country', avg_ctr, confidence))
            
        avg_ctr, confidence = get_weighted_avg_popularity_from_list(topic_ids_by_doc, top_confidence_level_by_doc, 
                                                                    topic_id_popularity_broad.value)            
        if avg_ctr != None:
            probs.append(('pop_topic_id', avg_ctr, confidence))
    
    
    if len(category_ids_by_doc) > 0:  
        avg_ctr = None
        if event_country != '':
            avg_ctr, confidence = get_weighted_avg_country_popularity_from_list(event_country, category_ids_by_doc, cat_confidence_level_by_doc, 
                                        category_id_by_country_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_category_id_country', avg_ctr, confidence))
        
        avg_ctr, confidence = get_weighted_avg_popularity_from_list(category_ids_by_doc, cat_confidence_level_by_doc, 
                                                                    category_id_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_category_id', avg_ctr, confidence))
    
    #print("[get_popularity_score] probs", probs)
    if output_detailed_list:
        return probs
    
    else:    
        if len(probs) > 0:
            #weighted_avg_probs_by_confidence = sum(map(lambda x: x[1] *  math.log(1+x[2],2), probs)) / float(sum(map(lambda x: math.log(1+x[2],2), probs)))        
            weighted_avg_probs_by_confidence = sum(map(lambda x: x[1] * x[2], probs)) / float(sum(map(lambda x: x[2], probs)))                
            confidence = max(map(lambda x: x[2], probs))
            return weighted_avg_probs_by_confidence, confidence
        else:
            return None, None

## Content-Based similarity functions

In [169]:
def cosine_similarity_dicts(dict1, dict2):
    dict1_norm = math.sqrt(sum([v**2 for v in dict1.values()]))
    dict2_norm = math.sqrt(sum([v**2 for v in dict2.values()]))
    
    sum_common_aspects = 0.0
    intersections = 0
    for key in dict1:
        if key in dict2:
            sum_common_aspects += dict1[key] * dict2[key] 
            intersections += 1
        
    return sum_common_aspects / (dict1_norm * dict2_norm), intersections

In [170]:
def cosine_similarity_user_docs_aspects(user_aspect_profile, doc_aspect_ids, doc_aspects_confidence, aspect_docs_counts):
    if user_aspect_profile==None or len(user_aspect_profile) == 0 or doc_aspect_ids == None or len(doc_aspect_ids) == 0:
        return None, None
        
    doc_aspects = dict(zip(doc_aspect_ids, doc_aspects_confidence))
    doc_aspects_tfidf_confid = {}
    for key in doc_aspects:
        tf = 1.0
        idf = math.log(math.log(documents_total / float(aspect_docs_counts[key])))
        confidence = doc_aspects[key]
        doc_aspects_tfidf_confid[key] = tf*idf * confidence
        
    user_aspects_tfidf_confid = {}    
    for key in user_aspect_profile:
        tfidf = user_aspect_profile[key][0]
        confidence = user_aspect_profile[key][1]
        user_aspects_tfidf_confid[key] = tfidf * confidence
        
    similarity, intersections = cosine_similarity_dicts(doc_aspects_tfidf_confid, user_aspects_tfidf_confid)
    
    if intersections > 0:
        #P(A intersect B)_intersections = P(A)^intersections * P(B)^intersections
        random_error = math.pow(len(doc_aspects)         / float(len(aspect_docs_counts)), intersections) * \
                       math.pow(len(user_aspect_profile) / float(len(aspect_docs_counts)), intersections)
        confidence = 1.0 - random_error
    else:
        #P(A not intersect B) = 1 - P(A intersect B)
        random_error = 1 - ((len(doc_aspects) / float(len(aspect_docs_counts))) * \
                            (len(user_aspect_profile) / float(len(aspect_docs_counts))))
    
    confidence = 1.0 - random_error    
    
    return similarity, confidence

In [171]:
def cosine_similarity_doc_event_doc_ad_aspects(doc_event_aspect_ids, doc_event_aspects_confidence, 
                                               doc_ad_aspect_ids, doc_ad_aspects_confidence, 
                                               aspect_docs_counts):
    if doc_event_aspect_ids == None or len(doc_event_aspect_ids) == 0 or \
       doc_ad_aspect_ids == None or len(doc_ad_aspect_ids) == 0:
        return None, None
        
    doc_event_aspects = dict(zip(doc_event_aspect_ids, doc_event_aspects_confidence))
    doc_event_aspects_tfidf_confid = {}
    for key in doc_event_aspect_ids:
        tf = 1.0
        idf = math.log(math.log(documents_total / float(aspect_docs_counts[key])))
        confidence = doc_event_aspects[key]
        doc_event_aspects_tfidf_confid[key] = tf*idf * confidence
        
    doc_ad_aspects = dict(zip(doc_ad_aspect_ids, doc_ad_aspects_confidence))
    doc_ad_aspects_tfidf_confid = {}
    for key in doc_ad_aspect_ids:
        tf = 1.0
        idf = math.log(math.log(documents_total / float(aspect_docs_counts[key])))
        confidence = doc_ad_aspects[key]
        doc_ad_aspects_tfidf_confid[key] = tf*idf * confidence
        
    similarity, intersections = cosine_similarity_dicts(doc_event_aspects_tfidf_confid, doc_ad_aspects_tfidf_confid)
    
    if intersections > 0:
        #P(A intersect B)_intersections = P(A)^intersections * P(B)^intersections
        random_error = math.pow(len(doc_event_aspect_ids) / float(len(aspect_docs_counts)), intersections) * \
                       math.pow(len(doc_ad_aspect_ids) / float(len(aspect_docs_counts)), intersections)
        confidence = 1.0 - random_error
    else:
        #P(A not intersect B) = 1 - P(A intersect B)
        random_error = 1 - ((len(doc_event_aspect_ids) / float(len(aspect_docs_counts))) * \
                            (len(doc_ad_aspect_ids) / float(len(aspect_docs_counts))))
    
    confidence = 1.0 - random_error    
    
    return similarity, confidence

In [172]:
def get_user_cb_interest_score(user_views_count, user_categories, user_topics, user_entities, 
                            timestamp_event, category_ids_by_doc, cat_confidence_level_by_doc, 
                            topic_ids_by_doc, top_confidence_level_by_doc,
                            entity_ids_by_doc, ent_confidence_level_by_doc, 
                            output_detailed_list=False):

    #Content-Based
    
    sims = []
    
    categories_similarity, cat_sim_confidence = cosine_similarity_user_docs_aspects(user_categories, category_ids_by_doc, cat_confidence_level_by_doc, categories_docs_counts)
    if categories_similarity != None:
        sims.append(('user_doc_ad_sim_categories', categories_similarity, cat_sim_confidence))
    
    topics_similarity, top_sim_confidence = cosine_similarity_user_docs_aspects(user_topics, topic_ids_by_doc, top_confidence_level_by_doc, topics_docs_counts)
    if topics_similarity != None:
        sims.append(('user_doc_ad_sim_topics', topics_similarity, top_sim_confidence))
    
    entities_similarity, entity_sim_confid = cosine_similarity_user_docs_aspects(user_entities, entity_ids_by_doc, ent_confidence_level_by_doc, entities_docs_counts)
    if entities_similarity != None:
        sims.append(('user_doc_ad_sim_entities', entities_similarity, entity_sim_confid))
    
    if output_detailed_list:
        return sims
    else:
        if len(sims) > 0:
            weighted_avg_sim_by_confidence = sum(map(lambda x: x[1]*x[2], sims)) / float(sum(map(lambda x: x[2], sims)))
            confidence = sum(map(lambda x: x[2], sims)) / float(len(sims))

            #print("[get_user_cb_interest_score] sims: {} | Avg: {} - Confid: {}".format(sims, weighted_avg_sim_by_confidence, confidence))
            return weighted_avg_sim_by_confidence, confidence
        else:
            return None, None

In [173]:
def get_doc_event_doc_ad_cb_similarity_score(doc_event_category_ids, doc_event_cat_confidence_levels, 
                                             doc_event_topic_ids, doc_event_top_confidence_levels,
                                             doc_event_entity_ids, doc_event_ent_confidence_levels, 
                                             doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                             doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                             doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                            output_detailed_list=False):

    #Content-Based
    sims = []
    
    
    
    categories_similarity, cat_sim_confidence = cosine_similarity_doc_event_doc_ad_aspects(
                                                    doc_event_category_ids, doc_event_cat_confidence_levels, 
                                                    doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                                    categories_docs_counts)
    if categories_similarity != None:
        sims.append(('doc_event_doc_ad_sim_categories', categories_similarity, cat_sim_confidence))
    
    topics_similarity, top_sim_confidence = cosine_similarity_doc_event_doc_ad_aspects(
                                                    doc_event_topic_ids, doc_event_top_confidence_levels, 
                                                    doc_ad_topic_ids, doc_ad_top_confidence_levels, 
                                                    topics_docs_counts)
    
    if topics_similarity != None:
        sims.append(('doc_event_doc_ad_sim_topics', topics_similarity, top_sim_confidence))
        
    entities_similarity, entity_sim_confid = cosine_similarity_doc_event_doc_ad_aspects(
                                                    doc_event_entity_ids, doc_event_ent_confidence_levels, 
                                                    doc_ad_entity_ids, doc_ad_ent_confidence_levels, 
                                                    entities_docs_counts)
    
    if entities_similarity != None:
        sims.append(('doc_event_doc_ad_sim_entities', entities_similarity, entity_sim_confid))
    
    if output_detailed_list:
        return sims
    else:
        if len(sims) > 0:
            weighted_avg_sim_by_confidence = sum(map(lambda x: x[1]*x[2], sims)) / float(sum(map(lambda x: x[2], sims)))
            confidence = sum(map(lambda x: x[2], sims)) / float(len(sims))

            #print("[get_user_cb_interest_score] sims: {} | Avg: {} - Confid: {}".format(sims, weighted_avg_sim_by_confidence, confidence))
            return weighted_avg_sim_by_confidence, confidence
        else:
            return None, None

## Feature Vector export

In [174]:
bool_feature_names = ['event_weekend',
                      'user_has_already_viewed_doc']

In [175]:
int_feature_names = ['user_views',
                    'ad_views',
                    'doc_views',
                    'doc_event_days_since_published',
                    'doc_event_hour',
                    'doc_ad_days_since_published', 
                    ]

In [176]:
float_feature_names = [                                
                'pop_ad_id',       
                'pop_ad_id_conf',   
                'pop_ad_id_conf_multipl', 
                'pop_document_id',                
                'pop_document_id_conf',
                'pop_document_id_conf_multipl',
                'pop_publisher_id',
                'pop_publisher_id_conf',
                'pop_publisher_id_conf_multipl',
                'pop_advertiser_id',
                'pop_advertiser_id_conf',
                'pop_advertiser_id_conf_multipl',
                'pop_campain_id',
                'pop_campain_id_conf',
                'pop_campain_id_conf_multipl',
                'pop_doc_event_doc_ad',
                'pop_doc_event_doc_ad_conf',
                'pop_doc_event_doc_ad_conf_multipl',
                'pop_source_id',  
                'pop_source_id_conf',
                'pop_source_id_conf_multipl',
                'pop_source_id_country',
                'pop_source_id_country_conf',
                'pop_source_id_country_conf_multipl',
                'pop_entity_id',    
                'pop_entity_id_conf',
                'pop_entity_id_conf_multipl',
                'pop_entity_id_country',
                'pop_entity_id_country_conf',
                'pop_entity_id_country_conf_multipl',
                'pop_topic_id', 
                'pop_topic_id_conf',
                'pop_topic_id_conf_multipl',
                'pop_topic_id_country',
                'pop_topic_id_country_conf',
                'pop_topic_id_country_conf_multipl',
                'pop_category_id', 
                'pop_category_id_conf',
                'pop_category_id_conf_multipl',
                'pop_category_id_country',
                'pop_category_id_country_conf',
                'pop_category_id_country_conf_multipl',
                'user_doc_ad_sim_categories',    
                'user_doc_ad_sim_categories_conf',
                'user_doc_ad_sim_categories_conf_multipl',
                'user_doc_ad_sim_topics',    
                'user_doc_ad_sim_topics_conf',
                'user_doc_ad_sim_topics_conf_multipl',
                'user_doc_ad_sim_entities',                    
                'user_doc_ad_sim_entities_conf',
                'user_doc_ad_sim_entities_conf_multipl',
                'doc_event_doc_ad_sim_categories',    
                'doc_event_doc_ad_sim_categories_conf',
                'doc_event_doc_ad_sim_categories_conf_multipl',
                'doc_event_doc_ad_sim_topics',    
                'doc_event_doc_ad_sim_topics_conf',
                'doc_event_doc_ad_sim_topics_conf_multipl',
                'doc_event_doc_ad_sim_entities',                    
                'doc_event_doc_ad_sim_entities_conf',
                'doc_event_doc_ad_sim_entities_conf_multipl'
               ]

In [177]:
TRAFFIC_SOURCE_FV='traffic_source'
EVENT_HOUR_FV='event_hour'
EVENT_COUNTRY_FV = 'event_country'
EVENT_COUNTRY_STATE_FV = 'event_country_state'
EVENT_GEO_LOCATION_FV = 'event_geo_location'
EVENT_PLATFORM_FV = 'event_platform'
AD_ADVERTISER_FV = 'ad_advertiser'
DOC_AD_SOURCE_ID_FV='doc_ad_source_id'
DOC_AD_PUBLISHER_ID_FV='doc_ad_publisher_id'
DOC_EVENT_SOURCE_ID_FV='doc_event_source_id'
DOC_EVENT_PUBLISHER_ID_FV='doc_event_publisher_id'
DOC_AD_CATEGORY_ID_FV='doc_ad_category_id'
DOC_AD_TOPIC_ID_FV='doc_ad_topic_id'
DOC_AD_ENTITY_ID_FV='doc_ad_entity_id'
DOC_EVENT_CATEGORY_ID_FV='doc_event_category_id'
DOC_EVENT_TOPIC_ID_FV='doc_event_topic_id'
DOC_EVENT_ENTITY_ID_FV='doc_event_entity_id'

## Configuring feature vector

In [178]:
category_feature_names_integral = ['ad_advertiser',
 'doc_ad_category_id_1',
 'doc_ad_category_id_2',
 'doc_ad_category_id_3',
 'doc_ad_topic_id_1',
 'doc_ad_topic_id_2',
 'doc_ad_topic_id_3',
 'doc_ad_entity_id_1', 
 'doc_ad_entity_id_2', 
 'doc_ad_entity_id_3', 
 'doc_ad_entity_id_4', 
 'doc_ad_entity_id_5', 
 'doc_ad_entity_id_6', 
 'doc_ad_publisher_id',
 'doc_ad_source_id', 
 'doc_event_category_id_1',
 'doc_event_category_id_2',
 'doc_event_category_id_3',
 'doc_event_topic_id_1',
 'doc_event_topic_id_2',
 'doc_event_topic_id_3',
 'doc_event_entity_id_1',
 'doc_event_entity_id_2',
 'doc_event_entity_id_3',
 'doc_event_entity_id_4',
 'doc_event_entity_id_5',
 'doc_event_entity_id_6',
 'doc_event_publisher_id',
 'doc_event_source_id', 
 'event_country',
 'event_country_state',
 'event_geo_location',
 'event_hour',
 'event_platform',
 'traffic_source']


feature_vector_labels_integral = bool_feature_names + int_feature_names + float_feature_names + \
                                 category_feature_names_integral

In [179]:
feature_vector_labels_integral_dict = dict([(key, idx) for idx, key in enumerate(feature_vector_labels_integral)])

In [180]:
with open('feature_vector_labels_integral.txt', 'w') as output:
    output.writelines('\n'.join(feature_vector_labels_integral))

In [181]:
def set_feature_vector_cat_value(field_name, field_value, feature_vector):
    if not is_null(field_value) and str(field_value) != '-1':
        feature_name = get_ohe_feature_name(field_name, field_value)
        if feature_name in feature_vector_labels_dict:
            feature_idx = feature_vector_labels_dict[feature_name]
        else:
            #Unpopular category value
            feature_idx = feature_vector_labels_dict[get_ohe_feature_name(field_name, LESS_SPECIAL_CAT_VALUE)]
            
        feature_vector[feature_idx] = float(1)
        
def set_feature_vector_cat_values(field_name, field_values, feature_vector):
    for field_value in field_values:
        set_feature_vector_cat_value(field_name, field_value, feature_vector)

In [182]:
def get_ad_feature_vector(user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                            event_country, event_country_state,
                            ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                            geo_location_event, 
                            doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                            traffic_source_pv, advertiser_id, publisher_id,
                            campaign_id, document_id_event,
                            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                            doc_ad_topic_ids, doc_ad_top_confidence_levels,
                            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                            doc_event_category_ids, doc_event_cat_confidence_levels,
                            doc_event_topic_ids, doc_event_top_confidence_levels,
                            doc_event_entity_ids, doc_event_ent_confidence_levels):
             
    try:

        feature_vector = {}
        
        if user_views_count != None:
            feature_vector[feature_vector_labels_dict['user_views']] = float(user_views_count)
         
        if user_doc_ids_viewed != None:
            feature_vector[feature_vector_labels_dict['user_has_already_viewed_doc']] = float(document_id in user_doc_ids_viewed)               
          
        if ad_id in ad_id_popularity_broad.value:            
            feature_vector[feature_vector_labels_dict['ad_views']] = float(ad_id_popularity_broad.value[ad_id][1])
        
        if document_id in document_id_popularity_broad.value:
            feature_vector[feature_vector_labels_dict['doc_views']] = float(document_id_popularity_broad.value[document_id][1])            
            
        if timestamp_event > -1:
            dt_timestamp_event = convert_odd_timestamp(timestamp_event)
            if doc_ad_publish_time != None:
                delta_days = (dt_timestamp_event - doc_ad_publish_time).days
                if delta_days >= 0 and delta_days <= 365*10: #10 years
                    feature_vector[feature_vector_labels_dict['doc_ad_days_since_published']] = float(delta_days)
                        
            if doc_event_publish_time != None:
                delta_days = (dt_timestamp_event - doc_event_publish_time).days
                if delta_days >= 0 and delta_days <= 365*10: #10 years
                    feature_vector[feature_vector_labels_dict['doc_event_days_since_published']] = float(delta_days)
                    
            
            #Local period of the day (hours)
            dt_local_timestamp_event = get_local_datetime(dt_timestamp_event, event_country, event_country_state)    
            local_hour_bin = get_hour_bin(dt_local_timestamp_event.hour)            
            feature_vector[feature_vector_labels_dict['doc_event_hour']] = float(local_hour_bin) #Hour for Decision Trees
            set_feature_vector_cat_value(EVENT_HOUR_FV, local_hour_bin, feature_vector) #Period of day for FFM
            
            #Weekend
            weekend = int(is_weekend(dt_local_timestamp_event))
            feature_vector[feature_vector_labels_dict['event_weekend']] = float(weekend)                                                      
        
        conf_field_suffix = '_conf'
        conf_multiplied_field_suffix = '_conf_multipl'
        
        #Setting Popularity fields
        pop_scores = get_popularity_score(event_country, ad_id, document_id, source_id, 
                                publisher_id, advertiser_id, campaign_id, document_id_event,
                                doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                                output_detailed_list=True)
        
                                

        for score in pop_scores:
            feature_vector[feature_vector_labels_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]

        #Setting User-Doc_ad CB Similarity fields
        user_doc_ad_cb_sim_scores = get_user_cb_interest_score(user_views_count, user_categories, user_topics, user_entities, 
                                timestamp_event, 
                                 doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                 doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                 doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                                output_detailed_list=True)

        for score in user_doc_ad_cb_sim_scores:
            feature_vector[feature_vector_labels_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
            
        #Setting Doc_event-doc_ad CB Similarity fields
        doc_event_doc_ad_cb_sim_scores = get_doc_event_doc_ad_cb_similarity_score(
                                            doc_event_category_ids, doc_event_cat_confidence_levels,
                                            doc_event_topic_ids, doc_event_top_confidence_levels,
                                            doc_event_entity_ids, doc_event_ent_confidence_levels,
                                            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                            doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                                        output_detailed_list=True)
        
        for score in doc_event_doc_ad_cb_sim_scores:
            feature_vector[feature_vector_labels_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
            
        set_feature_vector_cat_value(TRAFFIC_SOURCE_FV, traffic_source_pv, feature_vector)
        set_feature_vector_cat_value(EVENT_COUNTRY_FV, event_country, feature_vector)
        set_feature_vector_cat_value(EVENT_COUNTRY_STATE_FV, event_country_state, feature_vector)         
        set_feature_vector_cat_value(EVENT_GEO_LOCATION_FV, geo_location_event, feature_vector)
        set_feature_vector_cat_value(EVENT_PLATFORM_FV, platform_event, feature_vector)
        set_feature_vector_cat_value(AD_ADVERTISER_FV, advertiser_id, feature_vector)
        set_feature_vector_cat_value(DOC_AD_SOURCE_ID_FV, source_id, feature_vector)
        set_feature_vector_cat_value(DOC_AD_PUBLISHER_ID_FV, publisher_id, feature_vector)
        set_feature_vector_cat_value(DOC_EVENT_SOURCE_ID_FV, doc_event_source_id, feature_vector)
        set_feature_vector_cat_value(DOC_EVENT_PUBLISHER_ID_FV, doc_event_publisher_id, feature_vector)
        set_feature_vector_cat_values(DOC_AD_CATEGORY_ID_FV, doc_ad_category_ids, feature_vector)
        set_feature_vector_cat_values(DOC_AD_TOPIC_ID_FV, doc_ad_topic_ids, feature_vector)
        set_feature_vector_cat_values(DOC_AD_ENTITY_ID_FV, doc_ad_entity_ids, feature_vector)
        set_feature_vector_cat_values(DOC_EVENT_CATEGORY_ID_FV, doc_event_category_ids, feature_vector)
        set_feature_vector_cat_values(DOC_EVENT_TOPIC_ID_FV, doc_event_topic_ids, feature_vector)
        set_feature_vector_cat_values(DOC_EVENT_ENTITY_ID_FV, doc_event_entity_ids, feature_vector)
        
        #Creating dummy column as the last column because xgboost have a problem if the last column is undefined for all rows, 
        #saying that dimentions of data and feature_names do not match
        #feature_vector[feature_vector_labels_dict[DUMMY_FEATURE_COLUMN]] = float(0)
            
        #Ensuring that all elements are floats for compatibility with UDF output (ArrayType(FloatType()))
        #feature_vector = list([float(x) for x in feature_vector])
        
    except Exception as e:
        raise Exception("[get_ad_feature_vector] ERROR PROCESSING FEATURE VECTOR! Params: {}" \
                        .format([user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                                 event_country, event_country_state,
                            ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                            geo_location_event, 
                            doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                            traffic_source_pv, advertiser_id, publisher_id,
                            campaign_id, document_id_event,
                            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                            doc_ad_topic_ids, doc_ad_top_confidence_levels,
                            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                            doc_event_category_ids, doc_event_cat_confidence_levels,
                            doc_event_topic_ids, doc_event_top_confidence_levels,
                            doc_event_entity_ids, doc_event_ent_confidence_levels]),
                        e)
    
    return SparseVector(len(feature_vector_labels_dict), feature_vector)

In [183]:
get_ad_feature_vector_udf = F.udf(lambda user_doc_ids_viewed, user_views_count, user_categories, user_topics, 
                                        user_entities, event_country, event_country_state, ad_id, document_id, source_id, 
                                        doc_ad_publish_time, timestamp_event, platform_event,
                                        geo_location_event, 
                                        doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                                        traffic_source_pv, advertiser_id, publisher_id,
                                        campaign_id, document_id_event,
                                        category_ids_by_doc, cat_confidence_level_by_doc, 
                                        topic_ids_by_doc, top_confidence_level_by_doc,
                                        entity_ids_by_doc, ent_confidence_level_by_doc,
                                        doc_event_category_id_list, doc_event_confidence_level_cat_list,
                                        doc_event_topic_id_list, doc_event_confidence_level_top,
                                        doc_event_entity_id_list, doc_event_confidence_level_ent: \
                                         get_ad_feature_vector(user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                                                            event_country, event_country_state, 
                                                            ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                                                            geo_location_event, 
                                                            doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,   
                                                            traffic_source_pv, advertiser_id, publisher_id,
                                                            campaign_id, document_id_event,
                                                            category_ids_by_doc, cat_confidence_level_by_doc, 
                                                            topic_ids_by_doc, top_confidence_level_by_doc,
                                                            entity_ids_by_doc, ent_confidence_level_by_doc,
                                                            doc_event_category_id_list, doc_event_confidence_level_cat_list,
                                                            doc_event_topic_id_list, doc_event_confidence_level_top,
                                                            doc_event_entity_id_list, doc_event_confidence_level_ent),    
                            VectorUDT())

## Building feature vectors

In [184]:
def set_feature_vector_cat_value_integral(field_name, field_value, feature_vector):
    if not is_null(field_value): #and str(field_value) != '-1':
        feature_vector[feature_vector_labels_integral_dict[field_name]] = float(field_value)
        
def set_feature_vector_cat_top_multi_values_integral(field_name, values, confidences, feature_vector, top=5):
    top_values = list(filter(lambda z: z != -1, map(lambda y: y[0], sorted(zip(values, confidences), key=lambda x: -x[1]))))[:top]
    for idx, field_value in list(enumerate(top_values)):
        set_feature_vector_cat_value_integral('{}_{}'.format(field_name, idx+1), field_value, feature_vector)

In [185]:
def get_ad_feature_vector_integral(user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                            event_country, event_country_state,
                            ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                            geo_location_event, 
                            doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                            traffic_source_pv, advertiser_id, publisher_id,
                            campaign_id, document_id_event,
                            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                            doc_ad_topic_ids, doc_ad_top_confidence_levels,
                            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                            doc_event_category_ids, doc_event_cat_confidence_levels,
                            doc_event_topic_ids, doc_event_top_confidence_levels,
                            doc_event_entity_ids, doc_event_ent_confidence_levels):
             
    try:

        feature_vector = {}
        
        if user_views_count != None:
            feature_vector[feature_vector_labels_integral_dict['user_views']] = float(user_views_count)
         
        if user_doc_ids_viewed != None:
            feature_vector[feature_vector_labels_integral_dict['user_has_already_viewed_doc']] = float(document_id in user_doc_ids_viewed)               
          
        if ad_id in ad_id_popularity_broad.value:            
            feature_vector[feature_vector_labels_integral_dict['ad_views']] = float(ad_id_popularity_broad.value[ad_id][1])
        
        if document_id in document_id_popularity_broad.value:
            feature_vector[feature_vector_labels_integral_dict['doc_views']] = float(document_id_popularity_broad.value[document_id][1])            
            
        if timestamp_event > -1:
            dt_timestamp_event = convert_odd_timestamp(timestamp_event)
            if doc_ad_publish_time != None:
                delta_days = (dt_timestamp_event - doc_ad_publish_time).days
                if delta_days >= 0 and delta_days <= 365*10: #10 years
                    feature_vector[feature_vector_labels_integral_dict['doc_ad_days_since_published']] = float(delta_days)
                        
            if doc_event_publish_time != None:
                delta_days = (dt_timestamp_event - doc_event_publish_time).days
                if delta_days >= 0 and delta_days <= 365*10: #10 years
                    feature_vector[feature_vector_labels_integral_dict['doc_event_days_since_published']] = float(delta_days)
                    
            
            #Local period of the day (hours)
            dt_local_timestamp_event = get_local_datetime(dt_timestamp_event, event_country, event_country_state)    
            local_hour_bin = get_hour_bin(dt_local_timestamp_event.hour)            
            feature_vector[feature_vector_labels_integral_dict['doc_event_hour']] = float(local_hour_bin) #Hour for Decision Trees
            set_feature_vector_cat_value_integral(EVENT_HOUR_FV, local_hour_bin, feature_vector) #Period of day for FFM
            
            #Weekend
            weekend = int(is_weekend(dt_local_timestamp_event))
            feature_vector[feature_vector_labels_integral_dict['event_weekend']] = float(weekend)               
                                        
        
        conf_field_suffix = '_conf'
        conf_multiplied_field_suffix = '_conf_multipl'
        
        #Setting Popularity fields
        pop_scores = get_popularity_score(event_country, ad_id, document_id, source_id, 
                                publisher_id, advertiser_id, campaign_id, document_id_event,
                                doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                                output_detailed_list=True)
        
                                

        for score in pop_scores:
            feature_vector[feature_vector_labels_integral_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]

        #Setting User-Doc_ad CB Similarity fields
        user_doc_ad_cb_sim_scores = get_user_cb_interest_score(user_views_count, user_categories, user_topics, user_entities, 
                                timestamp_event, 
                                 doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                 doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                 doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                                output_detailed_list=True)

        for score in user_doc_ad_cb_sim_scores:
            feature_vector[feature_vector_labels_integral_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
            
        #Setting Doc_event-doc_ad CB Similarity fields
        doc_event_doc_ad_cb_sim_scores = get_doc_event_doc_ad_cb_similarity_score(
                                            doc_event_category_ids, doc_event_cat_confidence_levels,
                                            doc_event_topic_ids, doc_event_top_confidence_levels,
                                            doc_event_entity_ids, doc_event_ent_confidence_levels,
                                            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                            doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                                        output_detailed_list=True)
        
        for score in doc_event_doc_ad_cb_sim_scores:
            feature_vector[feature_vector_labels_integral_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
        
        #Process code for event_country
        if event_country in event_country_values_counts:
            event_country_code = event_country_values_counts[event_country]
        else:
            event_country_code = event_country_values_counts[LESS_SPECIAL_CAT_VALUE]                        
        set_feature_vector_cat_value_integral(EVENT_COUNTRY_FV, event_country_code, feature_vector)
        
        #Process code for event_country_state
        if event_country_state in event_country_state_values_counts:
            event_country_state_code = event_country_state_values_counts[event_country_state]
        else:
            event_country_state_code = event_country_state_values_counts[LESS_SPECIAL_CAT_VALUE]         
        set_feature_vector_cat_value_integral(EVENT_COUNTRY_STATE_FV, event_country_state_code, feature_vector)
                
        #Process code for geo_location_event
        if geo_location_event in event_geo_location_values_counts:
            geo_location_event_code = event_geo_location_values_counts[geo_location_event]
        else:
            geo_location_event_code = event_geo_location_values_counts[LESS_SPECIAL_CAT_VALUE]
        set_feature_vector_cat_value_integral(EVENT_GEO_LOCATION_FV, geo_location_event_code, feature_vector)   
         
        set_feature_vector_cat_value_integral(TRAFFIC_SOURCE_FV, traffic_source_pv, feature_vector)        
        set_feature_vector_cat_value_integral(EVENT_PLATFORM_FV, platform_event, feature_vector)
        set_feature_vector_cat_value_integral(AD_ADVERTISER_FV, advertiser_id, feature_vector)
        set_feature_vector_cat_value_integral(DOC_AD_SOURCE_ID_FV, source_id, feature_vector)
        set_feature_vector_cat_value_integral(DOC_AD_PUBLISHER_ID_FV, publisher_id, feature_vector)
        set_feature_vector_cat_value_integral(DOC_EVENT_SOURCE_ID_FV, doc_event_source_id, feature_vector)
        set_feature_vector_cat_value_integral(DOC_EVENT_PUBLISHER_ID_FV, doc_event_publisher_id, feature_vector)
                
        set_feature_vector_cat_top_multi_values_integral(DOC_AD_CATEGORY_ID_FV, doc_ad_category_ids, doc_ad_cat_confidence_levels, feature_vector, top=3)
        set_feature_vector_cat_top_multi_values_integral(DOC_AD_TOPIC_ID_FV, doc_ad_topic_ids, doc_ad_top_confidence_levels, feature_vector, top=3)
        
        set_feature_vector_cat_top_multi_values_integral(DOC_EVENT_CATEGORY_ID_FV, doc_event_category_ids, doc_event_cat_confidence_levels, feature_vector, top=3)
        set_feature_vector_cat_top_multi_values_integral(DOC_EVENT_TOPIC_ID_FV, doc_event_topic_ids, doc_event_top_confidence_levels, feature_vector, top=3)                           
        
        #Process codes for doc_ad_entity_ids
        doc_ad_entity_ids_codes = [doc_entity_id_values_counts[x] if x in doc_entity_id_values_counts 
                                   else doc_entity_id_values_counts[LESS_SPECIAL_CAT_VALUE] 
                                   for x in doc_ad_entity_ids]
        set_feature_vector_cat_top_multi_values_integral(DOC_AD_ENTITY_ID_FV, doc_ad_entity_ids_codes, doc_ad_ent_confidence_levels, feature_vector, top=6)
        
        
        #Process codes for doc_event_entity_ids
        doc_event_entity_ids_codes = [doc_entity_id_values_counts[x] if x in doc_entity_id_values_counts 
                                   else doc_entity_id_values_counts[LESS_SPECIAL_CAT_VALUE] 
                                   for x in doc_event_entity_ids]
        set_feature_vector_cat_top_multi_values_integral(DOC_EVENT_ENTITY_ID_FV, doc_event_entity_ids_codes, doc_event_ent_confidence_levels, feature_vector, top=6)
        
        #Creating dummy column as the last column because xgboost have a problem if the last column is undefined for all rows, 
        #saying that dimentions of data and feature_names do not match
        #feature_vector[feature_vector_labels_dict[DUMMY_FEATURE_COLUMN]] = float(0)
            
        #Ensuring that all elements are floats for compatibility with UDF output (ArrayType(FloatType()))
        #feature_vector = list([float(x) for x in feature_vector])
        
    except Exception as e:
        raise Exception("[get_ad_feature_vector_integral] ERROR PROCESSING FEATURE VECTOR! Params: {}" \
                        .format([user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                                 event_country, event_country_state,
                            ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                            geo_location_event, 
                            doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                            traffic_source_pv, advertiser_id, publisher_id,
                            campaign_id, document_id_event,
                            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                            doc_ad_topic_ids, doc_ad_top_confidence_levels,
                            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                            doc_event_category_ids, doc_event_cat_confidence_levels,
                            doc_event_topic_ids, doc_event_top_confidence_levels,
                            doc_event_entity_ids, doc_event_ent_confidence_levels]),
                        e)
    
    return SparseVector(len(feature_vector_labels_integral_dict), feature_vector)

In [186]:
get_ad_feature_vector_integral_udf = F.udf(lambda user_doc_ids_viewed, user_views_count, user_categories, user_topics, 
                                        user_entities, event_country, event_country_state, ad_id, document_id, source_id, 
                                        doc_ad_publish_time, timestamp_event, platform_event,
                                        geo_location_event, 
                                        doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                                        traffic_source_pv, advertiser_id, publisher_id,
                                        campaign_id, document_id_event,
                                        category_ids_by_doc, cat_confidence_level_by_doc, 
                                        topic_ids_by_doc, top_confidence_level_by_doc,
                                        entity_ids_by_doc, ent_confidence_level_by_doc,
                                        doc_event_category_id_list, doc_event_confidence_level_cat_list,
                                        doc_event_topic_id_list, doc_event_confidence_level_top,
                                        doc_event_entity_id_list, doc_event_confidence_level_ent: \
                                         get_ad_feature_vector_integral(user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                                                            event_country, event_country_state, 
                                                            ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                                                            geo_location_event, 
                                                            doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,   
                                                            traffic_source_pv, advertiser_id, publisher_id,
                                                            campaign_id, document_id_event,
                                                            category_ids_by_doc, cat_confidence_level_by_doc, 
                                                            topic_ids_by_doc, top_confidence_level_by_doc,
                                                            entity_ids_by_doc, ent_confidence_level_by_doc,
                                                            doc_event_category_id_list, doc_event_confidence_level_cat_list,
                                                            doc_event_topic_id_list, doc_event_confidence_level_top,
                                                            doc_event_entity_id_list, doc_event_confidence_level_ent),    
                            VectorUDT())
                             #StructField("features", VectorUDT()))
                             #MapType(IntegerType(), FloatType()))

## Export Train set feature vectors

In [187]:
train_set_enriched_df = train_set_df \
                             .join(documents_categories_grouped_df, on=F.col("document_id_promo") == F.col("documents_categories_grouped.document_id_cat"), how='left') \
                             .join(documents_topics_grouped_df, on=F.col("document_id_promo") == F.col("documents_topics_grouped.document_id_top"), how='left') \
                             .join(documents_entities_grouped_df, on=F.col("document_id_promo") == F.col("documents_entities_grouped.document_id_ent"), how='left') \
                             .join(documents_categories_grouped_df \
                                       .withColumnRenamed('category_id_list', 'doc_event_category_id_list')
                                       .withColumnRenamed('confidence_level_cat_list', 'doc_event_confidence_level_cat_list') \
                                       .alias('documents_event_categories_grouped'), 
                                   on=F.col("document_id_event") == F.col("documents_event_categories_grouped.document_id_cat"), 
                                   how='left') \
                             .join(documents_topics_grouped_df \
                                       .withColumnRenamed('topic_id_list', 'doc_event_topic_id_list')
                                       .withColumnRenamed('confidence_level_top_list', 'doc_event_confidence_level_top_list') \
                                       .alias('documents_event_topics_grouped'), 
                                   on=F.col("document_id_event") == F.col("documents_event_topics_grouped.document_id_top"), 
                                   how='left') \
                             .join(documents_entities_grouped_df \
                                       .withColumnRenamed('entity_id_list', 'doc_event_entity_id_list')
                                       .withColumnRenamed('confidence_level_ent_list', 'doc_event_confidence_level_ent_list') \
                                       .alias('documents_event_entities_grouped'), 
                                   on=F.col("document_id_event") == F.col("documents_event_entities_grouped.document_id_ent"), 
                                   how='left') \
                            .select('display_id','uuid_event','event_country','event_country_state','platform_event',
                                    'source_id_doc_event', 'publisher_doc_event','publish_time_doc_event',
                                            'publish_time', 'ad_id','document_id_promo','clicked',   
                                           'geo_location_event', 'advertiser_id', 'publisher_id',
                                            'campaign_id', 'document_id_event',
                                            'traffic_source_pv',                                          
                                        int_list_null_to_empty_list_udf('doc_event_category_id_list').alias('doc_event_category_id_list'),
                                        float_list_null_to_empty_list_udf('doc_event_confidence_level_cat_list').alias('doc_event_confidence_level_cat_list'),
                                        int_list_null_to_empty_list_udf('doc_event_topic_id_list').alias('doc_event_topic_id_list'),
                                        float_list_null_to_empty_list_udf('doc_event_confidence_level_top_list').alias('doc_event_confidence_level_top_list'),
                                        str_list_null_to_empty_list_udf('doc_event_entity_id_list').alias('doc_event_entity_id_list'),
                                        float_list_null_to_empty_list_udf('doc_event_confidence_level_ent_list').alias('doc_event_confidence_level_ent_list'),
                                       int_null_to_minus_one_udf('source_id').alias('source_id'),                                      
                                       int_null_to_minus_one_udf('timestamp_event').alias('timestamp_event'),
                                       int_list_null_to_empty_list_udf('category_id_list').alias('category_id_list'), 
                                       float_list_null_to_empty_list_udf('confidence_level_cat_list').alias('confidence_level_cat_list'), 
                                       int_list_null_to_empty_list_udf('topic_id_list').alias('topic_id_list'), 
                                       float_list_null_to_empty_list_udf('confidence_level_top_list').alias('confidence_level_top_list'), 
                                       str_list_null_to_empty_list_udf('entity_id_list').alias('entity_id_list'), 
                                       float_list_null_to_empty_list_udf('confidence_level_ent_list').alias('confidence_level_ent_list')                                                       
                                      ) \
                            .join(user_profiles_df, on=[F.col("user_profiles.uuid") == F.col("uuid_event")], how='left') \
                            .withColumnRenamed('categories', 'user_categories') \
                            .withColumnRenamed('topics', 'user_topics') \
                            .withColumnRenamed('entities', 'user_entities') \
                            .withColumnRenamed('doc_ids', 'user_doc_ids_viewed') \
                            .withColumnRenamed('views', 'user_views_count')

In [188]:
train_set_feature_vectors_df = train_set_enriched_df \
                                .withColumn('feature_vector', 
                                            #get_ad_feature_vector_udf(
                                            get_ad_feature_vector_integral_udf(
                                                                'user_doc_ids_viewed',
                                                                'user_views_count',
                                                                'user_categories', 
                                                                'user_topics', 
                                                                'user_entities', 
                                                                'event_country', 
                                                                'event_country_state',
                                                                'ad_id', 
                                                                'document_id_promo', 
                                                                'source_id', 
                                                                'publish_time', 
                                                                'timestamp_event', 
                                                                'platform_event',
                                                                'geo_location_event', 
                                                                'source_id_doc_event', 
                                                                'publisher_doc_event',
                                                                'publish_time_doc_event',
                                                                'traffic_source_pv',
                                                                'advertiser_id', 
                                                                'publisher_id',
                                                                'campaign_id',
                                                                'document_id_event',
                                                                'category_id_list', 
                                                                'confidence_level_cat_list', 
                                                                'topic_id_list', 
                                                                'confidence_level_top_list',
                                                                'entity_id_list', 
                                                                'confidence_level_ent_list',
                                                                'doc_event_category_id_list',
                                                                'doc_event_confidence_level_cat_list',
                                                                'doc_event_topic_id_list',
                                                                'doc_event_confidence_level_top_list',
                                                                'doc_event_entity_id_list',
                                                                'doc_event_confidence_level_ent_list')) \
                            .select(F.col('uuid_event').alias('uuid'),
                                    'display_id',
                                    'ad_id',
                                    'document_id_event',
                                    F.col('document_id_promo').alias('document_id'),
                                    F.col('clicked').alias('label'),
                                    'feature_vector') #\
                            #.orderBy('display_id','ad_id')

In [189]:
if evaluation:
    train_feature_vector_gcs_folder_name = 'train_feature_vectors_integral_eval'
else:
    train_feature_vector_gcs_folder_name = 'train_feature_vectors_integral'

In [148]:
%time train_set_feature_vectors_df.write.parquet(OUTPUT_BUCKET_FOLDER+train_feature_vector_gcs_folder_name, mode='overwrite')

CPU times: user 404 ms, sys: 136 ms, total: 540 ms
Wall time: 47min 54s


## Exporting integral feature vectors to CSV

In [190]:
train_feature_vectors_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+train_feature_vector_gcs_folder_name)
# train_feature_vectors_exported_df.take(3)

In [191]:
if evaluation:
    train_feature_vector_integral_csv_folder_name = 'train_feature_vectors_integral_eval.csv'
else:
    train_feature_vector_integral_csv_folder_name = 'train_feature_vectors_integral.csv'

In [192]:
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral
    
with open(train_feature_vector_integral_csv_folder_name+".header", 'w') as output:
    output.writelines('\n'.join(integral_headers))

In [193]:
def sparse_vector_to_csv_with_nulls_row(additional_column_values, vec, num_columns):    
    return ','.join([str(value) for value in additional_column_values] + 
                     list([ '{:.5}'.format(vec[x]) if x in vec.indices else '' for x in range(vec.size) ])[:num_columns]) \
            .replace('.0,',',')

In [194]:
train_feature_vectors_integral_csv_rdd = train_feature_vectors_exported_df.select(
     'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector').withColumn('is_leak', F.lit(-1)) \
     .rdd.map(lambda x: sparse_vector_to_csv_with_nulls_row([x['label'], x['display_id'], x['ad_id'], x['document_id'], x['document_id_event'], x['is_leak']], 
                                                  x['feature_vector'], len(integral_headers)))

In [55]:
%time train_feature_vectors_integral_csv_rdd.saveAsTextFile(OUTPUT_BUCKET_FOLDER+train_feature_vector_integral_csv_folder_name)

CPU times: user 412 ms, sys: 212 ms, total: 624 ms
Wall time: 57min 38s


## Export Validation/Test set feature vectors

In [195]:
def is_leak(max_timestamp_pv_leak, timestamp_event):
    return max_timestamp_pv_leak >= 0 and max_timestamp_pv_leak >= timestamp_event

In [196]:
is_leak_udf = F.udf(lambda max_timestamp_pv_leak, timestamp_event: int(is_leak(max_timestamp_pv_leak, timestamp_event)), IntegerType())

In [197]:
if evaluation:
    data_df = validation_set_df
else:
    data_df = test_set_df


test_validation_set_enriched_df = data_df.select('display_id','uuid_event','event_country','event_country_state','platform_event',
                                            'source_id_doc_event', 'publisher_doc_event','publish_time_doc_event',     
                                            'publish_time',
                                           'ad_id','document_id_promo','clicked',  
                                           'geo_location_event', 'advertiser_id', 'publisher_id',
                                           'campaign_id', 'document_id_event',
                                           'traffic_source_pv',                                           
                                        int_list_null_to_empty_list_udf('doc_event_category_id_list').alias('doc_event_category_id_list'),
                                        float_list_null_to_empty_list_udf('doc_event_confidence_level_cat_list').alias('doc_event_confidence_level_cat_list'),
                                        int_list_null_to_empty_list_udf('doc_event_topic_id_list').alias('doc_event_topic_id_list'),
                                        float_list_null_to_empty_list_udf('doc_event_confidence_level_top_list').alias('doc_event_confidence_level_top_list'),
                                        str_list_null_to_empty_list_udf('doc_event_entity_id_list').alias('doc_event_entity_id_list'),
                                        float_list_null_to_empty_list_udf('doc_event_confidence_level_ent_list').alias('doc_event_confidence_level_ent_list'),
                                       int_null_to_minus_one_udf('source_id').alias('source_id'),                                   
                                       int_null_to_minus_one_udf('timestamp_event').alias('timestamp_event'),
                                       int_list_null_to_empty_list_udf('category_id_list').alias('category_id_list'), 
                                       float_list_null_to_empty_list_udf('confidence_level_cat_list').alias('confidence_level_cat_list'), 
                                       int_list_null_to_empty_list_udf('topic_id_list').alias('topic_id_list'), 
                                       float_list_null_to_empty_list_udf('confidence_level_top_list').alias('confidence_level_top_list'), 
                                       str_list_null_to_empty_list_udf('entity_id_list').alias('entity_id_list'), 
                                       float_list_null_to_empty_list_udf('confidence_level_ent_list').alias('confidence_level_ent_list'),
                                       int_null_to_minus_one_udf('max_timestamp_pv').alias('max_timestamp_pv_leak')
                                      ) \
                            .join(user_profiles_df, on=[F.col("user_profiles.uuid") == F.col("uuid_event")], how='left') \
                            .withColumnRenamed('categories', 'user_categories') \
                            .withColumnRenamed('topics', 'user_topics') \
                            .withColumnRenamed('entities', 'user_entities') \
                            .withColumnRenamed('doc_ids', 'user_doc_ids_viewed') \
                            .withColumnRenamed('views', 'user_views_count')

In [198]:
test_validation_set_feature_vectors_df = test_validation_set_enriched_df \
                                .withColumn('feature_vector', 
                                            #get_ad_feature_vector_udf(
                                            get_ad_feature_vector_integral_udf(
                                                                'user_doc_ids_viewed', 
                                                                'user_views_count',
                                                                'user_categories', 
                                                                'user_topics', 
                                                                'user_entities', 
                                                                'event_country', 
                                                                'event_country_state',
                                                                'ad_id', 
                                                                'document_id_promo', 
                                                                'source_id', 
                                                                'publish_time', 
                                                                'timestamp_event', 
                                                                'platform_event',
                                                                'geo_location_event', 
                                                                'source_id_doc_event', 
                                                                'publisher_doc_event',
                                                                'publish_time_doc_event',
                                                                'traffic_source_pv',
                                                                'advertiser_id', 
                                                                'publisher_id',
                                                                'campaign_id',
                                                                'document_id_event',
                                                                'category_id_list', 
                                                                'confidence_level_cat_list', 
                                                                'topic_id_list', 
                                                                'confidence_level_top_list',
                                                                'entity_id_list', 
                                                                'confidence_level_ent_list',
                                                                'doc_event_category_id_list',
                                                                'doc_event_confidence_level_cat_list',
                                                                'doc_event_topic_id_list',
                                                                'doc_event_confidence_level_top_list',
                                                                'doc_event_entity_id_list',
                                                                'doc_event_confidence_level_ent_list')) \
                            .select(F.col('uuid').alias('uuid'),                                    
                                    'display_id',
                                    'ad_id',
                                    'document_id_event',
                                    F.col('document_id_promo').alias('document_id'),
                                    F.col('clicked').alias('label'),
                                    is_leak_udf('max_timestamp_pv_leak','timestamp_event').alias('is_leak'),
                                    'feature_vector') #\
                            #.orderBy('display_id','ad_id')

In [199]:
if evaluation:
    test_validation_feature_vector_gcs_folder_name = 'validation_feature_vectors_integral'
else:
    test_validation_feature_vector_gcs_folder_name = 'test_feature_vectors_integral'

In [351]:
%time test_validation_set_feature_vectors_df.write.parquet(OUTPUT_BUCKET_FOLDER+test_validation_feature_vector_gcs_folder_name, mode='overwrite')

CPU times: user 396 ms, sys: 220 ms, total: 616 ms
Wall time: 46min 18s


## Exporting integral feature vectors to CSV

In [200]:
test_validation_feature_vectors_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+test_validation_feature_vector_gcs_folder_name)
# test_validation_feature_vectors_exported_df.take(3)

In [201]:
if evaluation:
    test_validation_feature_vector_integral_csv_folder_name = 'validation_feature_vectors_integral.csv'
else:
    test_validation_feature_vector_integral_csv_folder_name = 'test_feature_vectors_integral.csv'

In [202]:
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral
    
with open(test_validation_feature_vector_integral_csv_folder_name+".header", 'w') as output:
    output.writelines('\n'.join(integral_headers))

In [203]:
test_validation_feature_vectors_integral_csv_rdd = test_validation_feature_vectors_exported_df.select(
     'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'is_leak', 'feature_vector') \
     .rdd.map(lambda x: sparse_vector_to_csv_with_nulls_row([x['label'], x['display_id'], x['ad_id'], x['document_id'], x['document_id_event'], x['is_leak']], 
                                                  x['feature_vector'], len(integral_headers)))

In [205]:
%time test_validation_feature_vectors_integral_csv_rdd.saveAsTextFile(OUTPUT_BUCKET_FOLDER+test_validation_feature_vector_integral_csv_folder_name)

CPU times: user 228 ms, sys: 92 ms, total: 320 ms
Wall time: 29min 11s


In [208]:
print(test_validation_feature_vectors_integral_csv_rdd.take(1))

['1,3110701,149004,1335842,1973614,0,0,0,43,5042,1.5296e+04,0,3,54,0.25823,0.5133,0.13255,0.22882,0.53838,0.12319,0.22882,0.53838,0.12319,0.18547,0.34324,0.063661,0.23782,0.33201,0.078959,,,,0.22882,0.53838,0.12319,0.22882,0.53838,0.12319,0.22881,0.4539,0.10386,0.22881,0.4539,0.10386,0.19476,0.017983,0.0035023,0.2483,0.016695,0.0041453,0.18375,0.14213,0.026118,0.18861,0.14897,0.028097,0.064491,0.99998,0.06449,0,0.00047778,0,0,5.1186e-11,0,0.013609,0.99957,0.013603,0,1.1111e-05,0,0,1.1374e-12,0,709,1100,1408,,277,,,,,,,,,509,3890,1403,1408,,136,,,,,,,,,407,6715,1.8595e+07,1.6453e+06,4.2814e+05,3,2,1.0']


In [211]:
print(test_validation_feature_vectors_integral_csv_rdd.first())

1,3110701,149004,1335842,1973614,0,0,0,43,5042,1.5296e+04,0,3,54,0.25823,0.5133,0.13255,0.22882,0.53838,0.12319,0.22882,0.53838,0.12319,0.18547,0.34324,0.063661,0.23782,0.33201,0.078959,,,,0.22882,0.53838,0.12319,0.22882,0.53838,0.12319,0.22881,0.4539,0.10386,0.22881,0.4539,0.10386,0.19476,0.017983,0.0035023,0.2483,0.016695,0.0041453,0.18375,0.14213,0.026118,0.18861,0.14897,0.028097,0.064491,0.99998,0.06449,0,0.00047778,0,0,5.1186e-11,0,0.013609,0.99957,0.013603,0,1.1111e-05,0,0,1.1374e-12,0,709,1100,1408,,277,,,,,,,,,509,3890,1403,1408,,136,,,,,,,,,407,6715,1.8595e+07,1.6453e+06,4.2814e+05,3,2,1.0
