In [35]:
import pandas as pd
import numpy as np
import os
import math
from tqdm import tqdm_notebook
import itertools
from time import time
from sklearn.metrics import log_loss

In [36]:
PATH = './data'

In [37]:
!head -2 $PATH/train.csv

timestamp;label;C1;C2;C3;C4;C5;C6;C7;C8;C9;C10;CG1;CG2;CG3;l1;l2;C11;C12
1379278800;0;2733540231;3500392421;4454;15573;11;995;2;176;15;671;384,382,96,88,185,49,385,268,448,438,279,420,124,123,5,3,17,133,71,409,330,59,57,302,304,395,275,151,113,99,154,155,76,412,139,333,332,335,334,399;;;32;0;0;106


C1-12 - categorial features; 
l1-2 - counters; 
CG1-3 - groups of categorial features;

label - click or not click 1/0

In [4]:
!wc -l $PATH/train.csv

 29989753 ./data/train.csv


In [5]:
!wc -l $PATH/test.csv

 20317221 ./data/test.csv


In [125]:
train_data = os.path.join(PATH, 'train.csv')
train_len =  29989753-1
test_data = os.path.join(PATH, 'test.csv')
test_len = 20317221-1

In [127]:
def preprocess(file_in, file_out, file_len, test=False):
    with open(file_out, 'w') as f_w:
        with open(file_in, 'r') as f_r: 
            f_r.readline()
            for line in tqdm_notebook(f_r, total=file_len, mininterval=1):
                
                features = line.strip().split(';')
        
                # Label
                if test:
                    vw_line = str(features[1])
                else:
                    vw_line = "-1" if features[1] == "0" else str(features[1])
                    
                # Категориальные фичи 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 17, 18
                # 2, 3, 4, 5, 7 - много уник знач, 6, 9, 10, 11 - средне уник знач, 8 - 0/1/2, 17 - bin, 18 - мало уник знач
                # vw_line += " |cat"
                for i, feature in enumerate(features[2:12] + features[17:19]):
                    if len(feature) != 0:
                        vw_line += " |c{0} C{0}_{1}".format(i+1, feature)
                    else:
                        # Для тех категорий у которых нет значения создаем отдельную колонку в ohe представление
                        vw_line += " |c{0} C{0}_N".format(i+1)
                        
                # Счетчики 15, 16
                vw_line += " |i"
                if len(features[15]) != 0:
                    f15 = int(features[15])
                    log1 = np.log(f15 + 1)
                    vw_line += " l1:{}".format(str(log1))
                    
                if len(features[16]) != 0:
                    f16 = int(features[16])
                    log1 = np.log(f16 + 1)
                    vw_line += " l2:{}".format(str(log1))
                
                # Группы 12, 13, 14
                for i, feature in enumerate(features[12:15]):
                    group = feature.strip().split(",")
                    weight = 1.0 / math.sqrt(len(group) + 1.0) # Нормируем длину списка категорий
                    vw_line += " |g{}:{}".format(i, weight)
                    for value in group:
                        vw_line += " {}".format(value)
                            
                f_w.write(vw_line + "\n")

In [128]:
preprocess(train_data, os.path.join(PATH, 'vw/train.vw'), train_len)

A Jupyter Widget




In [151]:
preprocess(test_data, os.path.join(PATH, 'vw/test.vw'), test_len, test=True)

A Jupyter Widget




In [201]:
!head -n 1 $PATH/vw/train.vw

-1 |c1 C1_2733540231 |c2 C2_3500392421 |c3 C3_4454 |c4 C4_15573 |c5 C5_11 |c6 C6_995 |c7 C7_2 |c8 C8_176 |c9 C9_15 |c10 C10_671 |c11 C11_0 |c12 C12_106 |i l1:3.4965075614664802 l12:2.259913890602351 l2:0.0 l22:0.0 |g0:0.15617376188860607 384 382 96 88 185 49 385 268 448 438 279 420 124 123 5 3 17 133 71 409 330 59 57 302 304 395 275 151 113 99 154 155 76 412 139 333 332 335 334 399 |g1:0.7071067811865475  |g2:0.7071067811865475 


In [202]:
!head -n 1 $PATH/vw/test.vw

-1 |c1 C1_4210358866 |c2 C2_3196051971 |c3 C3_3336 |c4 C4_2094 |c5 C5_22 |c6 C6_995 |c7 C7_0 |c8 C8_361 |c9 C9_15 |c10 C10_566 |c11 C11_0 |c12 C12_103 |i l1:0.0 l12:0.0 l2:0.0 l22:0.0 |g0:0.16222142113076254 176 213 220 222 224 47 422 103 419 74 3 235 238 18 409 177 330 59 58 117 277 276 275 153 152 150 151 99 154 155 76 412 139 333 332 335 334 |g1:0.7071067811865475  |g2:0.7071067811865475 


In [175]:
SPLIT = int(train_len/2)
SPLIT

14994876

In [199]:
!split -l $SPLIT $PATH/vw/train.vw $PATH/vw/train_

In [177]:
!mv $PATH/vw/train_aa $PATH/vw/train_train.vw
!mv $PATH/vw/train_ab $PATH/vw/train_valid.vw
# !mv $PATH/split/train_ac $PATH/split/train_test.vw

mv: ./data/vw/train_ab: No such file or directory


In [178]:
!wc -l $PATH/vw/train_*.vw

   96337 ./data/vw/train_train.vw
 14994876 ./data/vw/train_valid.vw
 15091213 total


In [179]:
!cat $PATH/vw/train_valid.vw | cut -f 1 -d ' ' > $PATH/vw/train_valid_labels.txt
# !cat $PATH/vw/train_test.vw | cut -f 1 -d ' ' > $PATH/vw/train_test_labels.txt

In [184]:
y_valid = np.loadtxt(os.path.join(PATH, 'vw/train_valid_labels.txt'))
# y_test = np.loadtxt(os.path.join(PATH, 'vw/train_test_labels.txt'))

In [None]:
np.unique(y_valid, return_counts=True)

In [89]:
# 16 17
# l1 = !head -n 2000000 ./data/train.csv | awk '{split($0,a,";"); print a[17]}'

In [None]:
# awk -F"\t" '{print $1}' inputs.tsv

### Обучение

In [131]:
def train_vw_model(train_vw_file, model_filename, quiet=True,
                   ngram=1, passes=1, bit_precision=28):
    init_time = time()
    vw_call_string = ('vw {train_vw_file} ' + '-f {model_filename} -b {bit_precision} --random_seed 17' +
                      ' --loss_function logistic').format(
                       train_vw_file=train_vw_file, model_filename=model_filename, 
                       bit_precision=bit_precision)
    if ngram > 1:
         vw_call_string += ' --ngram {}'.format(ngram)     
    if passes > 1:
         vw_call_string += ' -k --passes {} --cache_file {}'.format(passes, model_filename.replace('.vw', '.cache'))
    if quiet:
        vw_call_string += ' --quiet'
    
    
    print(vw_call_string) 
    res = os.system(vw_call_string)
    print('{} sec.'.format(round(time() - init_time, 2)))

In [118]:
def test_vw_model(test_vw_file, model_filename, prediction_filename, true_labels, quiet=True):
    init_time = time()
    vw_call_string = ('vw -t -i {model_filename} {test_vw_file} --random_seed 17' + 
                       ' -p {prediction_filename}').format(
                       model_filename=model_filename, test_vw_file=test_vw_file, 
                       prediction_filename=prediction_filename)
    if quiet:
        vw_call_string += ' --quiet'
        
    print(vw_call_string) 
    res = os.system(vw_call_string)
    
    vw_pred = np.loadtxt(prediction_filename)
    print("log_loss: {}. Elapsed: {} sec.".format(
        round(log_loss(true_labels, vw_pred), 4), round(time() - init_time, 2)))

In [130]:
train_vw_model(os.path.join(PATH, 'vw/train_train.vw'),
               os.path.join(PATH, 'model/vw_model.vw'), 
               quiet=False,
               ngram=1, passes=2)

In [120]:
test_vw_model(os.path.join(PATH, 'vw/train_valid.vw'), os.path.join(PATH, 'model/vw_model.vw'), 
              os.path.join(PATH, 'vw/train_valid_pred.txt'), y_valid, quiet=False)

vw -t -i ./data/model/vw_model.vw ./data/vw/train_valid.vw --random_seed 17 -p ./data/vw/train_valid_pred.txt
log_loss: 0.07%. Elapsed: 23.38 sec.


In [None]:
########################

In [138]:
y_valid_prob = []
for x in y_valid:
    y_valid_prob.append(0.0 if x == -1 else x)

In [195]:
%%time
# !vw -d ./data/vw/train_train.vw \
!vw -d ./data/vw/train.vw \
-f ./data/model/vw_model.vw \
--loss_function logistic \
--bit_precision 28 \
--learning_rate 0.2 \
--passes 20 --kill_cache --cache_file ./data/model/vw_model.cache \
--power_t 0.1 \
--quiet

# --ignore g0 \
# --lrq 2 \
# --ftrl --ftrl_alpha 0.1 --ftrl_beta 1.5 

CPU times: user 36.9 s, sys: 1min 18s, total: 1min 55s
Wall time: 43min 57s


In [196]:
%%time
# !vw -d ./data/vw/train_valid.vw \
!vw -d ./data/vw/test.vw \
-t -i ./data/model/vw_model.vw \
--loss_function logistic \
--quiet \
-p ./data/vw/test_pred7.txt
# -p ./data/vw/valid_pred.txt

CPU times: user 3.02 s, sys: 7.02 s, total: 10 s
Wall time: 3min 52s


In [149]:
y_pred =  np.loadtxt(os.path.join(PATH, 'vw/valid_pred.txt'))
y_pred_prob = []
for x in y_pred:
    y_pred_prob.append(1/(1 + np.exp(-x)))

In [150]:
log_loss(y_valid_prob, y_pred_prob)

0.015164122319547087

### Submit

In [197]:
y_pred =  np.loadtxt(os.path.join(PATH, 'vw/test_pred7.txt'))

In [198]:
with open('./data/submit/sample_submit7.txt', 'w') as f_w:
    f_w.write('Id,Click\n')
    for i, label in enumerate(y_pred):
        prob = 1/(1 + np.exp(-label))
        f_w.write(str(i+1) + ',' + str(prob) + '\n')