In [1]:
import polars as pl
from tqdm import tqdm

# Features

In [2]:
# Session-level log; the feature cells below read its user_id, url_host and
# request_cnt columns.
df = pl.read_parquet("../data/processed/sessions-noncat.pq")

In [3]:
%%time
# Per-user list of visited hosts, ordered by total request count (descending)
# with the host name as tie-breaker. Relies on the second aggregation keeping
# the sorted row order inside each user group.
# NOTE(review): the joins that would consume `urls` are commented out in the
# dataset cells, so this expensive cell currently feeds nothing downstream.
urls = (
    df.groupby(["user_id", "url_host"])
    .agg([pl.sum("request_cnt")])
    .sort(by=["user_id", "request_cnt", "url_host"], reverse=[False, True, False])
    .groupby(["user_id"])
    .agg([pl.col("url_host")])
)

CPU times: user 4min 51s, sys: 2min 15s, total: 7min 6s
Wall time: 1min 13s


In [68]:
def only_2nd_level_domain(url: str) -> str:
    """Return the second-to-last dot-separated label of `url`.

    For "www.example.com" this is "example". A string without any dot has no
    such label and yields "". The split is purely textual: a trailing dot or a
    multi-part suffix is not treated specially ("example.com." -> "com").
    """
    # Only the last two labels matter, so split from the right at most twice.
    labels = url.rsplit(".", 2)
    return labels[-2] if len(labels) >= 2 else ""

In [60]:
%%time
# Per-user list of second-level domain labels, ordered by total request count
# (descending) with the label as tie-breaker. The list column is named "urls",
# which is the column to_vw_format reads for the |u namespace.
# The label is computed by a Python UDF applied to every row of url_host.
domains = (
    df.with_columns(
        [pl.col("url_host").apply(only_2nd_level_domain).alias("second_level_domain")]
    )
    .groupby(["user_id", "second_level_domain"])
    .agg([pl.sum("request_cnt")])
    .sort(by=["user_id", "request_cnt", "second_level_domain"], reverse=[False, True, False])
    .groupby(["user_id"])
    .agg([pl.col("second_level_domain").alias("urls")])
)

CPU times: user 6min 30s, sys: 4min 20s, total: 10min 50s
Wall time: 5min 21s


In [4]:
# Geo features, one row per user. Whitespace runs inside each value are
# collapsed to "_" so that a value is a single whitespace-free token.
regions = pl.read_parquet("../data/features/geo/top_region.pq").with_columns(
    [pl.col("top_region").apply(lambda name: "_".join(name.split()))]
)
cities = pl.read_parquet("../data/features/geo/top_city.pq").with_columns(
    [pl.col("top_city").apply(lambda name: "_".join(name.split()))]
)

In [5]:
# Device features, one row per user. Whitespace runs inside each value are
# collapsed to "_" so that a value is a single whitespace-free token.
manufacturers = pl.read_parquet("../data/features/device/manufacturer.pq").with_columns(
    [pl.col("device_manufacturer").apply(lambda name: "_".join(name.split()))]
)
models = pl.read_parquet("../data/features/device/model.pq").with_columns(
    [pl.col("device_model").apply(lambda name: "_".join(name.split()))]
)

# Dataset

In [62]:
# Training users, left-joined with every per-user feature frame on user_id, so
# a user missing from a feature frame is kept with nulls in its columns.
# The full-host `urls` frame is not joined here; only `domains` (list column
# "urls") supplies the URL features.
train = pl.read_parquet("../data/processed/train-users.pq")
for feature_frame in (regions, cities, manufacturers, models, domains):
    train = train.join(feature_frame, how="left", on="user_id")

In [63]:
# Test users, left-joined with every per-user feature frame on user_id, so a
# user missing from a feature frame is kept with nulls in its columns.
# The full-host `urls` frame is not joined here; only `domains` (list column
# "urls") supplies the URL features.
test = pl.read_parquet("../data/processed/test-users.pq")
for feature_frame in (regions, cities, manufacturers, models, domains):
    test = test.join(feature_frame, how="left", on="user_id")

In [64]:
# Rows with a known sex only. The label is recoded from 0 / non-zero to -1 / 1,
# the form in which it is written to the VW training file; the age targets are
# dropped from this frame.
train_sex = (
    train.filter(pl.col("is_male").is_not_null())
    .with_columns(
        # Nulls were removed above, so "!= 0 -> 1" mirrors "== 0 -> -1".
        [pl.when(pl.col("is_male") != 0).then(1).otherwise(-1).alias("is_male")]
    )
    .select([pl.exclude(["age", "age_bucket"])])
)

In [65]:
# Rows with a positive age bucket only (the comparison also discards null
# buckets); the raw age and the sex target are dropped from this frame.
train_age = (
    train.filter(pl.col("age_bucket") > 0)
    .select([pl.exclude(["age", "is_male"])])
)

In [67]:
def to_vw_format(dataset: pl.DataFrame, path: str, target_col: str) -> None:
    """Write `dataset` to `path` as Vowpal Wabbit text examples, one per row.

    Each line has the form::

        <label> |g <region> <city> |d <manufacturer> <model> |u <url tokens...>

    Args:
        dataset: frame with the columns top_region, top_city,
            device_manufacturer, device_model and urls (a list of strings per
            row), plus optionally `target_col`.
        path: output file; it is overwritten if it already exists.
        target_col: column holding the label. When the frame has no such
            column the label is left blank, which yields unlabeled examples.

    Notes:
        * A null `urls` list (user without any domain after the left join) is
          written as an empty |u namespace instead of raising TypeError; null
          entries inside the list are skipped.
        * `|`, `:` and whitespace are structural characters in the VW text
          format, so they are replaced by "_" inside every feature token.
        * A null geo/device value is still written as the literal token
          "None", exactly as before, so existing files stay comparable.
    """

    def vw_token(value) -> str:
        # "|" would open a new namespace and ":" would be read as the start of
        # a numeric feature value; whitespace would split the token in two.
        text = str(value).replace("|", "_").replace(":", "_")
        return "_".join(text.split())

    # Explicit encoding: the output must not depend on the platform default.
    with open(path, "w", encoding="utf-8") as f:
        for row in tqdm(dataset.iter_rows(named=True), total=len(dataset)):
            target = f"{row[target_col]}" if target_col in row else " "

            geo = f"|g {vw_token(row['top_region'])} {vw_token(row['top_city'])}"
            device = f"|d {vw_token(row['device_manufacturer'])} {vw_token(row['device_model'])}"

            url_tokens = row["urls"] if row["urls"] is not None else []
            urls = f"|u {' '.join(vw_token(url) for url in url_tokens if url is not None)}"

            print(" ".join([target, geo, device, urls]), file=f)

In [69]:
# Sex training set: label column is_male (already recoded to -1 / 1).
to_vw_format(
    dataset=train_sex,
    target_col="is_male",
    path="../data/vw/2nd-level-domains/datasets/train-sex.vw",
)

100%|████████████████████████████████| 264326/264326 [00:13<00:00, 19670.51it/s]


In [70]:
# Age training set: label column age_bucket.
to_vw_format(
    dataset=train_age,
    target_col="age_bucket",
    path="../data/vw/2nd-level-domains/datasets/train-age.vw",
)

100%|████████████████████████████████| 268922/268922 [00:14<00:00, 19197.80it/s]


In [71]:
# Test set: "empty" is passed as the target column name; when the frame has no
# column of that name, to_vw_format writes the examples without a label.
to_vw_format(
    dataset=test,
    target_col="empty",
    path="../data/vw/2nd-level-domains/datasets/test.vw",
)

100%|████████████████████████████████| 144724/144724 [00:07<00:00, 19311.84it/s]


In [76]:
%%time
# Train the sex model on train-sex.vw (labels -1 / 1) and save it to
# models/sex.bin. As the log below shows, the g and d namespaces are crossed
# (--interactions gd), 2- and 3-grams are generated over the u namespace,
# 24 weight bits are used, and --cache creates train-sex.vw.cache next to the
# dataset. Predictions in the log lie between 0 and 1.
! vw ../data/vw/2nd-level-domains/datasets/train-sex.vw \
    --final_regressor ../data/vw/2nd-level-domains/models/sex.bin \
    --random_seed 777 \
    --bit_precision 24 \
    --loss_function logistic \
    --link logistic \
    --passes 40 \
    --cache -k \
    --interactions gd \
    --ngram u2 \
    --ngram u3

Generating 2-grams for u namespaces.
Generating 3-grams for u namespaces.
creating features for following interactions: gd 
final_regressor = ../data/vw/2nd-level-domains/models/sex.bin
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = ../data/vw/2nd-level-domains/datasets/train-sex.vw.cache
Reading datafile = ../data/vw/2nd-level-domains/datasets/train-sex.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.693147 0.693147            1            1.0   1.0000   0.5000       78
0.604189 0.515231            2            2.0   1.0000   0.5974       51
0.801367 0.998545            4            4.0  -1.0000   0.5716       48
0.768224 0.735081            8            8.0   1.0000   0.5221      114
0.752186 0.736148           16           16.0   1.0000   0.4835      141
0.688835 0.625484           32           32.0

In [77]:
%%time
# Score test.vw with the saved sex model without learning ("only testing" in
# the log) and write the predictions to predictions/sex.vw.
# No --ngram / --interactions flags are passed here, yet the log shows them
# being applied -- they appear to be restored from the saved model.
! vw ../data/vw/2nd-level-domains/datasets/test.vw \
    --testonly \
    --initial_regressor ../data/vw/2nd-level-domains/models/sex.bin \
    --predictions ../data/vw/2nd-level-domains/predictions/sex.vw

Generating 2-grams for u namespaces.
Generating 3-grams for u namespaces.
creating features for following interactions: gd 
only testing
predictions = ../data/vw/2nd-level-domains/predictions/sex.vw
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../data/vw/2nd-level-domains/datasets/test.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
    n.a.     n.a.            1            1.0  unknown   0.9425       87
    n.a.     n.a.            2            2.0  unknown   0.5945       12
    n.a.     n.a.            4            4.0  unknown   0.4582       30
    n.a.     n.a.            8            8.0  unknown   0.6644       66
    n.a.     n.a.           16           16.0  unknown   0.2793       36
    n.a.     n.a.           32           32.0  unknown   0.0021      402
    n.a.     n.a.           64           64.0  unkno

In [78]:
%%time
# Train the age-bucket model on train-age.vw as a 6-class problem (--oaa 6)
# and save it to models/age.bin. Labels are the age_bucket values written by
# to_vw_format -- presumably 1..6 after the `> 0` filter; TODO confirm.
# As the log below shows, g and d are crossed, 2- and 3-grams are generated
# over the u namespace, 25 weight bits are used, and --cache creates
# train-age.vw.cache next to the dataset.
! vw ../data/vw/2nd-level-domains/datasets/train-age.vw \
    --final_regressor ../data/vw/2nd-level-domains/models/age.bin \
    --random_seed 777 \
    --loss_function logistic \
    --oaa 6 \
    --bit_precision 25 \
    --passes 100 \
    --cache -k \
    --interactions gd \
    --ngram u2 \
    --ngram u3

Generating 2-grams for u namespaces.
Generating 3-grams for u namespaces.
creating features for following interactions: gd 
final_regressor = ../data/vw/2nd-level-domains/models/age.bin
Num weight bits = 25
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = ../data/vw/2nd-level-domains/datasets/train-age.vw.cache
Reading datafile = ../data/vw/2nd-level-domains/datasets/train-age.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0        2        1       78
0.500000 0.000000            2            2.0        2        2       51
0.500000 0.500000            4            4.0        2        2       48
0.625000 0.750000            8            8.0        3        2      114
0.562500 0.500000           16           16.0        1        2      141
0.593750 0.625000           32           32.0

In [79]:
# Score test.vw with the saved age model without learning ("only testing" in
# the log) and write the predicted class per example to predictions/age.vw.
# The n-gram and interaction settings shown in the log are not passed here --
# they appear to be restored from the saved model.
! vw ../data/vw/2nd-level-domains/datasets/test.vw \
    --testonly \
    --initial_regressor ../data/vw/2nd-level-domains/models/age.bin \
    --predictions ../data/vw/2nd-level-domains/predictions/age.vw

Generating 2-grams for u namespaces.
Generating 3-grams for u namespaces.
creating features for following interactions: gd 
only testing
predictions = ../data/vw/2nd-level-domains/predictions/age.vw
Num weight bits = 25
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../data/vw/2nd-level-domains/datasets/test.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
    n.a.     n.a.            1            1.0  unknown        1       87
    n.a.     n.a.            2            2.0  unknown        3       12
    n.a.     n.a.            4            4.0  unknown        3       30
    n.a.     n.a.            8            8.0  unknown        2       66
    n.a.     n.a.           16           16.0  unknown        4       36
    n.a.     n.a.           32           32.0  unknown        2      402
    n.a.     n.a.           64           64.0  unkno

In [80]:
import pandas as pd

# Both prediction files were produced from test.vw, which was written by
# iterating over `test`, so row i of each file belongs to row i of `test`.
age_pred = pd.read_csv("../data/vw/2nd-level-domains/predictions/age.vw", header=None)[0]
sex_pred = pd.read_csv("../data/vw/2nd-level-domains/predictions/sex.vw", header=None)[0]

# pandas aligns a Series on the index when it is assigned as a column, so a
# stale or truncated prediction file would silently produce NaN or dropped
# rows. Check the row counts explicitly and fail loudly instead.
for pred_name, pred in (("age", age_pred), ("is_male", sex_pred)):
    if len(pred) != len(test):
        raise ValueError(
            f"{pred_name} predictions have {len(pred)} rows, "
            f"but the test set has {len(test)}"
        )

submission = pd.DataFrame()
submission["user_id"] = test["user_id"].to_pandas()
submission["age"] = age_pred
submission["is_male"] = sex_pred

In [82]:
# Persist the submission without the pandas index column.
submission.to_csv(
    path_or_buf="../submissions/vw-2lvl-domains-u23.csv",
    index=False,
)