In [1]:
# Dana Rozenblum & Efrat Magidov

# Checklist:
# AWS emr-5.29.0
# MASTER r5d.8xlarge 1x, no EBS
# CORE r5d.8xlarge 4x, no EBS
# Custom bootstrap action: s3://ydatazian/bootstrap.sh
# Allow ssh in master node security group

In [1]:
import tqdm.notebook as tqdm
import numpy as np
import scipy
import sklearn
import matplotlib.pyplot as plt

# SparkSession

https://spark.apache.org/docs/2.4.4/api/python/pyspark.html

https://spark.apache.org/docs/2.4.4/api/python/pyspark.sql.html

In [2]:
import findspark
findspark.init()

import spark_utils
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sc = SparkContext("yarn", "My App", conf=spark_utils.get_spark_conf())
se = SparkSession(sc)
spark_utils.print_ui_links()

NameNode: http://ec2-3-236-176-16.compute-1.amazonaws.com:50070
YARN: http://ec2-3-236-176-16.compute-1.amazonaws.com:8088
Spark UI: http://ec2-3-236-176-16.compute-1.amazonaws.com:20888/proxy/application_1624340242866_0001


# Register all tables for sql queries

In [3]:
from IPython.display import display
tables = ["clicks_test", "clicks_train", 
          "documents_categories", "documents_entities", "documents_meta", "documents_topics", 
          "events", "page_views", "page_views_sample", "promoted_content"]
for name in tqdm.tqdm(tables):
    df = se.read.parquet("s3://ydatazian/{}.parquet".format(name))
    df.registerTempTable(name)
    print(name)
    display(df.limit(3).toPandas())

  0%|          | 0/10 [00:00<?, ?it/s]

clicks_test


Unnamed: 0,display_id,ad_id
0,16874594,66758
1,16874594,150083
2,16874594,162754


clicks_train


Unnamed: 0,display_id,ad_id,clicked
0,1,42337,0
1,1,139684,0
2,1,144739,1


documents_categories


Unnamed: 0,document_id,category_id,confidence_level
0,1595802,1611,0.92
1,1595802,1610,0.07
2,1524246,1807,0.92


documents_entities


Unnamed: 0,document_id,entity_id,confidence_level
0,1524246,f9eec25663db4cd83183f5c805186f16,0.672865314504701
1,1524246,55ebcfbdaff1d6f60b3907151f38527a,0.399113728441297
2,1524246,839907a972930b17b125eb0247898412,0.392095749652966


documents_meta


Unnamed: 0,document_id,source_id,publisher_id,publish_time
0,1595802,1,603,2016-06-05 00:00:00
1,1524246,1,603,2016-05-26 11:00:00
2,1617787,1,603,2016-05-27 00:00:00


documents_topics


Unnamed: 0,document_id,topic_id,confidence_level
0,1595802,140,0.0731131601068925
1,1595802,16,0.0594164867373976
2,1595802,143,0.0454207537554526


events


Unnamed: 0,display_id,uuid,document_id,timestamp,platform,geo_location
0,1,cb8c55702adb93,379743,61,3,US>SC>519
1,2,79a85fa78311b9,1794259,81,2,US>CA>807
2,3,822932ce3d8757,1179111,182,2,US>MI>505


page_views


Unnamed: 0,uuid,document_id,timestamp,platform,geo_location,traffic_source
0,1fd5f051fba643,120,31905835,1,RS,2
1,8557aa9004be3b,120,32053104,1,VN>44,2
2,c351b277a358f0,120,54013023,1,KR>12,1


page_views_sample


Unnamed: 0,uuid,document_id,timestamp,platform,geo_location,traffic_source
0,1fd5f051fba643,120,31905835,1,RS,2
1,8557aa9004be3b,120,32053104,1,VN>44,2
2,c351b277a358f0,120,54013023,1,KR>12,1


promoted_content


Unnamed: 0,ad_id,document_id,campaign_id,advertiser_id
0,1,6614,1,7
1,2,471467,2,7
2,3,7692,3,7


# Prepare dataset for VW

We will predict a *click* based on:
- ad_id
- document_id
- campaign_id
- advertiser_id

In [4]:
%%time
se.sql("""
select 
    clicks_train.clicked,
    clicks_train.display_id,
    clicks_train.ad_id,
    promoted_content.document_id,
    promoted_content.campaign_id,
    promoted_content.advertiser_id
from clicks_train join promoted_content on clicks_train.ad_id = promoted_content.ad_id
""").write.parquet("/train_features.parquet", mode='overwrite')

CPU times: user 8.34 ms, sys: 1.08 ms, total: 9.42 ms
Wall time: 54.2 s


In [5]:
se.read.parquet("/train_features.parquet").show(5)

+-------+----------+------+-----------+-----------+-------------+
|clicked|display_id| ad_id|document_id|campaign_id|advertiser_id|
+-------+----------+------+-----------+-----------+-------------+
|      0|         1| 42337|     938164|       5969|         1499|
|      0|         1|139684|    1085937|      17527|         2563|
|      1|         1|144739|    1337362|      18488|         2909|
|      0|         1|156824|     992370|       7283|         1919|
|      0|         1|279295|    1670176|      27524|         1820|
+-------+----------+------+-----------+-----------+-------------+
only showing top 5 rows



In [6]:
# Format: [Label] [Importance] [Base] [Tag]|Namespace Features |Namespace Features ... |Namespace Features
# https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Input-format
def vw_row_mapper(row):
    clicked = None
    features = []
    for k, v in row.asDict().items():
        if k == 'clicked':
            clicked = '1' if v == '1' else '-1'
        else:
            features.append(k + "_" + v)
    tag = row.display_id + "_" + row.ad_id
    return "{} {}| {}".format(clicked, tag, " ".join(features))

r = se.read.parquet("/train_features.parquet").take(1)[0]
print(r)
print(vw_row_mapper(r))

Row(clicked='0', display_id='1', ad_id='42337', document_id='938164', campaign_id='5969', advertiser_id='1499')
-1 1_42337| display_id_1 ad_id_42337 document_id_938164 campaign_id_5969 advertiser_id_1499


In [7]:
%%time
! hdfs dfs -rm -r /train_features.txt
(
    se.read.parquet("/train_features.parquet")
    .rdd
    .map(vw_row_mapper)
    .saveAsTextFile("/train_features.txt")
)

rm: `/train_features.txt': No such file or directory
CPU times: user 64.4 ms, sys: 25 ms, total: 89.4 ms
Wall time: 3min 39s


In [8]:
# copy file to local master node
! rm /mnt/train.txt
! hdfs dfs -getmerge /train_features.txt /mnt/train.txt
# preview local file
! head -n 5 /mnt/train.txt

rm: cannot remove ‘/mnt/train.txt’: No such file or directory
-1 1_42337| display_id_1 ad_id_42337 document_id_938164 campaign_id_5969 advertiser_id_1499
-1 1_139684| display_id_1 ad_id_139684 document_id_1085937 campaign_id_17527 advertiser_id_2563
1 1_144739| display_id_1 ad_id_144739 document_id_1337362 campaign_id_18488 advertiser_id_2909
-1 1_156824| display_id_1 ad_id_156824 document_id_992370 campaign_id_7283 advertiser_id_1919
-1 1_279295| display_id_1 ad_id_279295 document_id_1670176 campaign_id_27524 advertiser_id_1820


# Train VW
https://vowpalwabbit.org/tutorials/getting_started.html

https://github.com/JohnLangford/vowpal_wabbit/wiki/Command-line-arguments

In [9]:
! ./vw -d /mnt/train.txt -b 24 -c -k --ftrl --passes 1 -f model --holdout_off --loss_function logistic --random_seed 42 --progress 8000000

final_regressor = model
Enabling FTRL based optimization
Algorithm used: Proximal-FTRL
ftrl_alpha = 0.005
ftrl_beta = 0.1
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
creating cache_file = /mnt/train.txt.cache
Reading datafile = /mnt/train.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.460142 0.460142      8000000      8000000.0  -1.0000  -1.2983        6
0.456159 0.452175     16000000     16000000.0  -1.0000  -1.4733        6
0.453326 0.447661     24000000     24000000.0  -1.0000  -2.3174        6
0.452628 0.450533     32000000     32000000.0   1.0000  -0.5719        6
0.452887 0.453924     40000000     40000000.0  -1.0000  -1.2648        6
0.452010 0.447625     48000000     48000000.0  -1.0000  -2.6227        6
0.451328 0.447232     56000000     56000000.0  -1.0000  -3.4451        6
0.450649 0.445902     64000000     64000000.0  -1.0000  -1.

In [10]:
# make prediction with VW
! echo "? tag1| ad_id_144739 document_id_1337362 campaign_id_18488 advertiser_id_2909" > /mnt/test.txt
! echo "? tag2| ad_id_156824 document_id_992370 campaign_id_7283 advertiser_id_1919" >> /mnt/test.txt
! ./vw -d /mnt/test.txt -i model -t -k -p /mnt/predictions.txt --progress 1000000 --link=logistic
# predicted probabilities of "1" class
! cat /mnt/predictions.txt

only testing
predictions = /mnt/predictions.txt
Enabling FTRL based optimization
Algorithm used: Proximal-FTRL
ftrl_alpha = 0.005
ftrl_beta = 0.1
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = /mnt/test.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features

finished run
number of examples = 2
weighted example sum = 2.000000
weighted label sum = 0.000000
average loss = 5.674418
total feature number = 10
0.319263 tag1
0.036173 tag2


# Homework 2: Baseline model

Train a baseline model (**any model** you can think of, not necessarily VW) using the following features (features will bring you **0.5 points each**, you can think of your own features, **maximum 8 points** for this task):
- **clicked**
- geo_location features (country, state, dma)
- day_of_week (from timestamp, use *date.isoweekday()*)
- ad_id
- campaign_id
- advertiser_id
- ad_document_id
- display_document_id
- platform

Make submission to Kaggle to know your leaderboard score

If you want to create a dev set, make a 90%/10% split of training data by display_id

In [115]:
from pyspark.sql.functions import split, date_format, from_unixtime, unix_timestamp, col
from pyspark.sql.types import StringType

In [109]:
# Compute top category

top_cat = se.sql("""
select 
    documents_categories.document_id,
    category_id
from documents_categories
join (select document_id, 
        max(confidence_level) max_conf
        from documents_categories
        group by document_id) max_cat on documents_categories.document_id = max_cat.document_id
                                        and documents_categories.confidence_level = max_cat.max_conf

""")
top_cat.registerTempTable("top_cat")

top_cat_no_duplicates = se.sql("""
select 
    document_id,
    first(category_id) category_id
from top_cat
group by document_id
""")
top_cat_no_duplicates.registerTempTable("top_cat_final")

In [110]:
# Compute top topic

top_topic = se.sql("""
select 
    documents_topics.document_id,
    topic_id
from documents_topics
join (select document_id, 
        max(confidence_level) max_conf
        from documents_topics
        group by document_id) max_topic on documents_topics.document_id = max_topic.document_id
                                        and documents_topics.confidence_level = max_topic.max_conf

""")
top_topic.registerTempTable("top_topic")

top_topic_no_duplicates = se.sql("""
select 
    document_id,
    first(topic_id) topic_id
from top_topic
group by document_id
""")
top_topic_no_duplicates.registerTempTable("top_topic_final")

In [133]:
# Compute top entity

top_entity = se.sql("""
select 
    documents_entities.document_id,
    entity_id
from documents_entities
join (select document_id, 
        max(confidence_level) max_conf
        from documents_entities
        group by document_id) max_entity on documents_entities.document_id = max_entity.document_id
                                        and documents_entities.confidence_level = max_entity.max_conf

""")
top_entity.registerTempTable("top_entity")

top_entity_no_duplicates = se.sql("""
select 
    document_id,
    first(entity_id) entity_id
from top_entity
group by document_id
""")
top_entity_no_duplicates.registerTempTable("top_entity_final")

In [150]:
# Initial dataset

dataset = se.sql("""
select 
    clicks_train.clicked,
    clicks_train.display_id,
    events.geo_location,
    events.timestamp,
    clicks_train.ad_id,
    promoted_content.campaign_id,
    promoted_content.advertiser_id,
    promoted_content.document_id ad_document_id,
    events.platform,
    events.uuid,
    events.document_id display_document_id,
    documents_meta.publisher_id,
    documents_meta.publish_time,
    top_cat_final.category_id,
    top_topic_final.topic_id,
    top_entity_final.entity_id
from clicks_train
join events on clicks_train.display_id = events.display_id
join promoted_content on clicks_train.ad_id = promoted_content.ad_id
join documents_meta on events.document_id = documents_meta.document_id
left join top_cat_final on events.document_id = top_cat_final.document_id
left join top_topic_final on events.document_id = top_topic_final.document_id
left join top_entity_final on events.document_id = top_entity_final.document_id
""")

In [151]:
# Split geo location to country, state and dma

split_col = split(dataset['geo_location'], '>')
dataset = dataset.withColumn('country',split_col.getItem(0))
dataset = dataset.withColumn('state',split_col.getItem(1))
dataset = dataset.withColumn('dma',split_col.getItem(2))
dataset = dataset.drop('geo_location')

In [152]:
# Add day of week & hour

date_time = from_unixtime(dataset['timestamp'])
dataset = dataset.withColumn('day_of_week',date_format(date_time, 'u'))
dataset = dataset.withColumn('hour',date_format(date_time, 'H'))

In [153]:
# Convert publish time to unixtime

unix_time = unix_timestamp(dataset['publish_time'])
dataset = dataset.withColumn('publish_time',unix_time)
dataset = dataset.withColumn("publish_time",col("publish_time").cast(StringType()))

In [154]:
# Fill nulls

dataset = dataset.fillna('0')

In [155]:
dataset.printSchema()

root
 |-- clicked: string (nullable = false)
 |-- display_id: string (nullable = false)
 |-- timestamp: string (nullable = false)
 |-- ad_id: string (nullable = false)
 |-- campaign_id: string (nullable = false)
 |-- advertiser_id: string (nullable = false)
 |-- ad_document_id: string (nullable = false)
 |-- platform: string (nullable = false)
 |-- uuid: string (nullable = false)
 |-- display_document_id: string (nullable = false)
 |-- publisher_id: string (nullable = false)
 |-- publish_time: string (nullable = false)
 |-- category_id: string (nullable = false)
 |-- topic_id: string (nullable = false)
 |-- entity_id: string (nullable = false)
 |-- country: string (nullable = false)
 |-- state: string (nullable = false)
 |-- dma: string (nullable = false)
 |-- day_of_week: string (nullable = false)
 |-- hour: string (nullable = false)



In [156]:
%%time
dataset.write.parquet("/train_features.parquet", mode='overwrite')

CPU times: user 25 ms, sys: 4.91 ms, total: 29.9 ms
Wall time: 4min 39s


In [157]:
se.read.parquet("/train_features.parquet").show(5)

+-------+----------+---------+------+-----------+-------------+--------------+--------+--------------+-------------------+------------+------------+-----------+--------+---------+-------+-----+---+-----------+----+
|clicked|display_id|timestamp| ad_id|campaign_id|advertiser_id|ad_document_id|platform|          uuid|display_document_id|publisher_id|publish_time|category_id|topic_id|entity_id|country|state|dma|day_of_week|hour|
+-------+----------+---------+------+-----------+-------------+--------------+--------+--------------+-------------------+------------+------------+-----------+--------+---------+-------+-----+---+-----------+----+
|      0|  12873428|848711482|131051|      16261|         2594|       1260918|       3|53b567c20a8eb2|            1000240|         522|  1450771200|       1505|     138|        0|     GB|    0|  0|          6|   1|
|      1|  12873428|848711482|134263|      16162|          185|       1300757|       3|53b567c20a8eb2|            1000240|         522|  145

In [158]:
r = se.read.parquet("/train_features.parquet").take(1)[0]
print(r)
print(vw_row_mapper(r))

Row(clicked='0', display_id='12873428', timestamp='848711482', ad_id='131051', campaign_id='16261', advertiser_id='2594', ad_document_id='1260918', platform='3', uuid='53b567c20a8eb2', display_document_id='1000240', publisher_id='522', publish_time='1450771200', category_id='1505', topic_id='138', entity_id='0', country='GB', state='0', dma='0', day_of_week='6', hour='1')
-1 12873428_131051| display_id_12873428 timestamp_848711482 ad_id_131051 campaign_id_16261 advertiser_id_2594 ad_document_id_1260918 platform_3 uuid_53b567c20a8eb2 display_document_id_1000240 publisher_id_522 publish_time_1450771200 category_id_1505 topic_id_138 entity_id_0 country_GB state_0 dma_0 day_of_week_6 hour_1


In [159]:
%%time
! hdfs dfs -rm -r /train_features.txt
(
    se.read.parquet("/train_features.parquet")
    .rdd
    .map(vw_row_mapper)
    .saveAsTextFile("/train_features.txt")
)

Deleted /train_features.txt
CPU times: user 114 ms, sys: 17.3 ms, total: 131 ms
Wall time: 6min 18s


In [160]:
# copy file to local master node
! rm /mnt/train.txt
! hdfs dfs -getmerge /train_features.txt /mnt/train.txt
# preview local file
! head -n 5 /mnt/train.txt

-1 12873428_131051| display_id_12873428 timestamp_848711482 ad_id_131051 campaign_id_16261 advertiser_id_2594 ad_document_id_1260918 platform_3 uuid_53b567c20a8eb2 display_document_id_1000240 publisher_id_522 publish_time_1450771200 category_id_1505 topic_id_138 entity_id_0 country_GB state_0 dma_0 day_of_week_6 hour_1
1 12873428_134263| display_id_12873428 timestamp_848711482 ad_id_134263 campaign_id_16162 advertiser_id_185 ad_document_id_1300757 platform_3 uuid_53b567c20a8eb2 display_document_id_1000240 publisher_id_522 publish_time_1450771200 category_id_1505 topic_id_138 entity_id_0 country_GB state_0 dma_0 day_of_week_6 hour_1
-1 12873428_160698| display_id_12873428 timestamp_848711482 ad_id_160698 campaign_id_16162 advertiser_id_185 ad_document_id_1400277 platform_3 uuid_53b567c20a8eb2 display_document_id_1000240 publisher_id_522 publish_time_1450771200 category_id_1505 topic_id_138 entity_id_0 country_GB state_0 dma_0 day_of_week_6 hour_1
-1 12873428_406558| display_id_128734

In [161]:
! ./vw -d /mnt/train.txt -b 24 -c -k --ftrl --passes 1 -f model --holdout_off --loss_function logistic --random_seed 42 --progress 8000000

final_regressor = model
Enabling FTRL based optimization
Algorithm used: Proximal-FTRL
ftrl_alpha = 0.005
ftrl_beta = 0.1
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
creating cache_file = /mnt/train.txt.cache
Reading datafile = /mnt/train.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.456714 0.456714      8000000      8000000.0  -1.0000  -0.7078       20
0.451234 0.445753     16000000     16000000.0  -1.0000  -1.9923       20
0.450919 0.450289     24000000     24000000.0  -1.0000  -1.5590       20
0.447493 0.437217     32000000     32000000.0  -1.0000  -0.4229       20
0.446630 0.443175     40000000     40000000.0   1.0000  -0.2916       20
0.446722 0.447186     48000000     48000000.0  -1.0000  -1.6682       20
0.445380 0.437330     56000000     56000000.0  -1.0000  -2.0483       20
0.444175 0.435736     64000000     64000000.0  -1.0000  -2.

# Submitting to Kaggle

Obtain Kaggle API token: https://github.com/Kaggle/kaggle-api#api-credentials

Making a submission: https://github.com/Kaggle/kaggle-api#submit-to-a-competition

In [182]:
! mkdir ~/.kaggle
! touch ~/.kaggle/kaggle.json
! echo '{"username":"danarozenblum","key":"key"}' > ~/.kaggle/kaggle.json
! cat ~/.kaggle/kaggle.json
! chmod 600 /home/hadoop/.kaggle/kaggle.json

mkdir: cannot create directory ‘/home/hadoop/.kaggle’: File exists
{"username":"danarozenblum","key":"26053ae789e80ea95fef21790387768d"}


In [178]:
! aws s3 cp s3://ydatazian/sample_submission.csv .

download: s3://ydatazian/sample_submission.csv to ./sample_submission.csv


In [179]:
# https://www.kaggle.com/c/outbrain-click-prediction/overview/evaluation
# For each display_id in the test set, you must predict a space-delimited list of ad_ids, 
# ordered by decreasing likelihood of being clicked.
! head -n 5 ./sample_submission.csv

display_id,ad_id
16874594,66758 150083 162754 170392 172888 180797
16874595,8846 30609 143982
16874596,11430 57197 132820 153260 173005 288385 289122 289915
16874597,137858 143981 155945 180965 182039 285834 305790 308836


In [175]:
# Create test dataset

# Initial dataset
dataset_test = se.sql("""
select
    "0" as clicked,
    clicks_test.display_id,
    events.geo_location,
    events.timestamp,
    clicks_test.ad_id,
    promoted_content.campaign_id,
    promoted_content.advertiser_id,
    promoted_content.document_id ad_document_id,
    events.platform,
    events.uuid,
    events.document_id display_document_id,
    documents_meta.publisher_id,
    documents_meta.publish_time,
    top_cat_final.category_id,
    top_topic_final.topic_id,
    top_entity_final.entity_id
from clicks_test
join events on clicks_test.display_id = events.display_id
join promoted_content on clicks_test.ad_id = promoted_content.ad_id
join documents_meta on events.document_id = documents_meta.document_id
left join top_cat_final on events.document_id = top_cat_final.document_id
left join top_topic_final on events.document_id = top_topic_final.document_id
left join top_entity_final on events.document_id = top_entity_final.document_id
""")

# Split geo location to country, state and dma
split_col = split(dataset_test['geo_location'], '>')
dataset_test = dataset_test.withColumn('country',split_col.getItem(0))
dataset_test = dataset_test.withColumn('state',split_col.getItem(1))
dataset_test = dataset_test.withColumn('dma',split_col.getItem(2))
dataset_test = dataset_test.drop('geo_location')

# Add day of week & hour
date_time = from_unixtime(dataset_test['timestamp'])
dataset_test = dataset_test.withColumn('day_of_week',date_format(date_time, 'u'))
dataset_test = dataset_test.withColumn('hour',date_format(date_time, 'H'))

# Convert publish time to unixtime
unix_time = unix_timestamp(dataset_test['publish_time'])
dataset_test = dataset_test.withColumn('publish_time',unix_time)
dataset_test = dataset_test.withColumn("publish_time",col("publish_time").cast(StringType()))

# Fill nulls
dataset_test = dataset_test.fillna('0')

In [180]:
%%time
dataset_test.write.parquet("/test_features.parquet", mode='overwrite')

CPU times: user 8.78 ms, sys: 7.4 ms, total: 16.2 ms
Wall time: 2min 18s


In [183]:
%%time
! hdfs dfs -rm -r /test_features.txt
(
    se.read.parquet("/test_features.parquet")
    .rdd
    .map(vw_row_mapper)
    .saveAsTextFile("/test_features.txt")
)

Deleted /test_features.txt
CPU times: user 66 ms, sys: 15.4 ms, total: 81.5 ms
Wall time: 2min 28s


In [184]:
# copy file to local master node
! rm /mnt/test.txt
! hdfs dfs -getmerge /test_features.txt /mnt/test.txt
# preview local file
! head -n 5 /mnt/test.txt

-1 20285030_68743| display_id_20285030 timestamp_1154557712 ad_id_68743 campaign_id_5043 advertiser_id_1726 ad_document_id_874654 platform_1 uuid_f3e881f6f85843 display_document_id_1000495 publisher_id_9 publish_time_1450774800 category_id_1607 topic_id_174 entity_id_cf466fba71c45fffb360f9654866b8e2 country_NL state_11 dma_0 day_of_week_3 hour_22
-1 20285030_141476| display_id_20285030 timestamp_1154557712 ad_id_141476 campaign_id_18112 advertiser_id_2198 ad_document_id_1325589 platform_1 uuid_f3e881f6f85843 display_document_id_1000495 publisher_id_9 publish_time_1450774800 category_id_1607 topic_id_174 entity_id_cf466fba71c45fffb360f9654866b8e2 country_NL state_11 dma_0 day_of_week_3 hour_22
-1 20285030_204432| display_id_20285030 timestamp_1154557712 ad_id_204432 campaign_id_6825 advertiser_id_1726 ad_document_id_1518899 platform_1 uuid_f3e881f6f85843 display_document_id_1000495 publisher_id_9 publish_time_1450774800 category_id_1607 topic_id_174 entity_id_cf466fba71c45fffb360f9654

In [185]:
! ./vw -d /mnt/test.txt -i model -t -k -p /mnt/predictions.txt --progress 1000000 --link=logistic
# predicted probabilities of "1" class
! head -n 5 /mnt/predictions.txt

only testing
predictions = /mnt/predictions.txt
Enabling FTRL based optimization
Algorithm used: Proximal-FTRL
ftrl_alpha = 0.005
ftrl_beta = 0.1
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = /mnt/test.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.822554 0.822554      1000000      1000000.0  -1.0000   0.1332       20
0.890899 0.959245      2000000      2000000.0  -1.0000   0.2587       20
0.937992 1.032177      3000000      3000000.0  -1.0000   0.2321       20
0.980220 1.106906      4000000      4000000.0  -1.0000   0.3498       20
1.102726 1.592748      5000000      5000000.0  -1.0000   0.2016       20
1.112239 1.159807      6000000      6000000.0  -1.0000   0.2019       20
1.098408 1.015421      7000000      7000000.0  -1.0000   0.1613       20
1.114053 1.223562      8000000      8000000.0  -1.0000   0.1979 

In [186]:
! wc -l /mnt/predictions.txt

32225162 /mnt/predictions.txt


In [187]:
from collections import defaultdict
scores_by_display_id = defaultdict(dict)
for line in tqdm.tqdm(open('/mnt/predictions.txt')):
    score, tag = line.strip().split(" ")
    score = float(score)
    display_id, ad_id = tag.split("_")
    scores_by_display_id[display_id][ad_id] = score

0it [00:00, ?it/s]

In [188]:
with open("submission.txt", "w") as f:
    f.write("display_id,ad_id\n")
    for k, vs in tqdm.tqdm_notebook(scores_by_display_id.items()):
        f.write("{},{}\n".format(
            k, 
            " ".join([v[0] for v in sorted(vs.items(), key=lambda x: -x[1])])
        ))

  0%|          | 0/6245533 [00:00<?, ?it/s]

In [191]:
! kaggle competitions submit -f submission.txt outbrain-click-prediction -m "baseline"

100%|████████████████████████████████████████| 260M/260M [00:03<00:00, 78.0MB/s]
Successfully submitted to Outbrain Click Prediction

In [None]:
# Private Score 0.64034
# Public Score 0.64004