In [2]:
import pandas as pd
import polars as pl

In [3]:
df = pl.read_parquet("../data/processed/sessions-noncat.pq")

In [4]:
user_url_interactions = df.groupby(["user_id", "url_host"]).agg(pl.col("request_cnt").sum().alias("n_requests"))

In [38]:
import scipy.sparse as sp
import numpy as np
from tqdm import tqdm

In [10]:
n_users = user_url_interactions["user_id"].n_unique()
n_urls = user_url_interactions["url_host"].n_unique()

In [31]:
%%time
user_id_to_idx_mapping = (
    user_url_interactions["user_id"]
    .unique()
    .to_frame()
    .with_row_count()
    .select(["user_id", pl.col("row_nr").alias("idx")])
    .to_pandas()
    .set_index("user_id")["idx"]
    .to_dict()
)

CPU times: user 3.21 s, sys: 287 ms, total: 3.5 s
Wall time: 637 ms


In [32]:
%%time
url_to_idx_mapping = (
    user_url_interactions["url_host"]
    .unique()
    .to_frame()
    .with_row_count()
    .select(["url_host", pl.col("row_nr").alias("idx")])
    .to_pandas()
    .set_index("url_host")["idx"]
    .to_dict()
)

CPU times: user 3.12 s, sys: 463 ms, total: 3.59 s
Wall time: 3.58 s


In [33]:
interactions = sp.dok_matrix((n_users, n_urls), dtype=np.int32)

In [40]:
for user_id, url_host, n_requests in tqdm(user_url_interactions.iter_rows(), total=len(user_url_interactions)):
    interactions[user_id_to_idx_mapping[user_id], url_to_idx_mapping[url_host]] = n_requests

100%|████████████████████████████| 32277669/32277669 [05:33<00:00, 96887.50it/s]


In [42]:
user_id_to_idx_mapping[user_id], url_to_idx_mapping[url_host]

(311078, 148848)

In [43]:
interactions[(311078, 148848)]

1

In [46]:
%%time
interactions = interactions.tocsr()

CPU times: user 17.5 s, sys: 1.34 s, total: 18.8 s
Wall time: 18.8 s


In [50]:
sp.save_npz("../data/features/interactions/user-url.npz", interactions)

In [52]:
! ls -lh ../data/features/interactions

total 79M
-rw-rw-r-- 1 ababkin ababkin 79M Feb 19 13:35 user-url.npz


In [53]:
from sklearn.feature_extraction.text import TfidfTransformer

In [54]:
from sklearn.naive_bayes import MultinomialNB

In [55]:
sex_mnb = MultinomialNB()

In [57]:
train = pd.read_parquet("../data/processed/train-users.pq")
test = pd.read_parquet("../data/processed/test-users.pq")

In [58]:
train_sex = train.loc[train["is_male"].notnull(), ["user_id", "is_male"]]
train_sex["idx"] = train_sex["user_id"].map(user_id_to_idx_mapping)

In [67]:
train_age = train.loc[train["age_bucket"] > 0, ["user_id", "age_bucket"]]
train_age["idx"] = train_age["user_id"].map(user_id_to_idx_mapping)

In [68]:
test["idx"] = test["user_id"].map(user_id_to_idx_mapping)

In [69]:
x_train_sex = interactions[train_sex["idx"]]
y_train_sex = train_sex["is_male"]
x_test_sex = interactions[test["idx"]]

x_train_age = interactions[train_age["idx"]]
y_train_age = train_age["age_bucket"]
x_test_age = interactions[test["idx"]]

In [71]:
from sklearn import metrics as m

In [70]:
mnb_sex = MultinomialNB()
mnb_sex.fit(x_train_sex, y_train_sex)

In [73]:
m.roc_auc_score(train_sex["is_male"], mnb_sex.predict_proba(x_train_sex)[:, 1])

0.7384432604399547

In [74]:
mnb_age = MultinomialNB()
mnb_age.fit(x_train_age, y_train_age)

In [77]:
mnb_age.predict(x_train_age)

array([3., 6., 1., ..., 1., 2., 3.])

In [78]:
print(
    m.classification_report(
        train_age["age_bucket"].to_numpy().astype(int), 
        mnb_age.predict(x_train_age),
        target_names=['18-25','25-34', '35-44', '45-54', '55-65', '65+']
    )
)

              precision    recall  f1-score   support

       18-25       0.29      0.61      0.39     32641
       25-34       0.43      0.20      0.27     87270
       35-44       0.35      0.30      0.32     77486
       45-54       0.25      0.27      0.26     42442
       55-65       0.24      0.44      0.31     23580
         65+       0.15      0.17      0.16      5503

    accuracy                           0.31    268922
   macro avg       0.29      0.33      0.29    268922
weighted avg       0.34      0.31      0.30    268922



In [79]:
submission = test[["user_id"]].copy()
submission["age"] = mnb_age.predict(x_test_sex).astype(int)
submission["is_male"] = mnb_sex.predict_proba(x_test_sex)[:, 1]

In [83]:
submission.to_csv("../submissions/cv+mnb-baseline.csv", index=False)

In [89]:
train.loc[(19 <= train["age"]) & (train["age"] <= 25), "age_bucket"].unique()

<IntegerArray>
[1]
Length: 1, dtype: UInt8

In [90]:
train.loc[(26 <= train["age"]) & (train["age"] <= 35), "age_bucket"].unique()

<IntegerArray>
[2]
Length: 1, dtype: UInt8

In [91]:
train.loc[(36 <= train["age"]) & (train["age"] <= 45), "age_bucket"].unique()

<IntegerArray>
[3]
Length: 1, dtype: UInt8

In [92]:
train.loc[(46 <= train["age"]) & (train["age"] <= 55), "age_bucket"].unique()

<IntegerArray>
[4]
Length: 1, dtype: UInt8

In [93]:
train.loc[(56 <= train["age"]) & (train["age"] <= 65), "age_bucket"].unique()

<IntegerArray>
[5]
Length: 1, dtype: UInt8

In [94]:
train.loc[(66 <= train["age"]) & (train["age"] <= 100500), "age_bucket"].unique()

<IntegerArray>
[6]
Length: 1, dtype: UInt8

In [95]:
tfidf = TfidfTransformer()

In [96]:
interactions_tfidf = tfidf.fit_transform(interactions)

In [99]:
x_train_sex = interactions_tfidf[train_sex["idx"]]
y_train_sex = train_sex["is_male"]
x_test_sex = interactions_tfidf[test["idx"]]

x_train_age = interactions_tfidf[train_age["idx"]]
y_train_age = train_age["age_bucket"]
x_test_age = interactions_tfidf[test["idx"]]

In [100]:
mnb_sex = MultinomialNB()
mnb_sex.fit(x_train_sex, y_train_sex)

In [102]:
m.roc_auc_score(train_sex["is_male"], mnb_sex.predict_proba(x_train_sex)[:, 1])

0.8112053817221887

In [101]:
mnb_age = MultinomialNB()
mnb_age.fit(x_train_age, y_train_age)

In [103]:
print(
    m.classification_report(
        train_age["age_bucket"].to_numpy().astype(int), 
        mnb_age.predict(x_train_age),
        target_names=['18-25','25-34', '35-44', '45-54', '55-65', '65+']
    )
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       18-25       0.80      0.00      0.00     32641
       25-34       0.41      0.82      0.55     87270
       35-44       0.36      0.44      0.40     77486
       45-54       0.33      0.01      0.01     42442
       55-65       0.52      0.01      0.01     23580
         65+       0.00      0.00      0.00      5503

    accuracy                           0.39    268922
   macro avg       0.40      0.21      0.16    268922
weighted avg       0.43      0.39      0.29    268922



  _warn_prf(average, modifier, msg_start, len(result))


In [104]:
submission = test[["user_id"]].copy()
submission["age"] = mnb_age.predict(x_test_sex).astype(int)
submission["is_male"] = mnb_sex.predict_proba(x_test_sex)[:, 1]

In [105]:
submission.to_csv("../submissions/tfidf+mnb-baseline.csv", index=False)

In [110]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
logreg = LogisticRegression()

In [137]:
%%time
logreg_sex = LogisticRegressionCV(
    Cs=[0.05, 0.5, 5, 10, 20, 50, 100, 1000],
    max_iter=10_000,
    scoring="roc_auc",
    random_state=777,
    n_jobs=-1,
)
logreg_sex.fit(x_train_sex, y_train_sex)

CPU times: user 1h 6min 32s, sys: 1h 32min 54s, total: 2h 39min 27s
Wall time: 38min 8s


In [139]:
m.roc_auc_score(train_sex["is_male"], logreg_sex.predict_proba(x_train_sex)[:, 1])

0.8707608277688417

In [140]:
pd.DataFrame(logreg_sex.scores_[1.0], columns=logreg_sex.Cs_)

Unnamed: 0,0.05,0.50,5.00,10.00,20.00,50.00,100.00,1000.00
0,0.773372,0.817189,0.836977,0.839293,0.840074,0.83891,0.836456,0.818789
1,0.767416,0.81188,0.833244,0.835928,0.837142,0.836654,0.834598,0.817253
2,0.770706,0.814282,0.834527,0.836956,0.838019,0.837497,0.835486,0.817883
3,0.768215,0.812088,0.832036,0.834295,0.835001,0.833777,0.831256,0.813554
4,0.769037,0.813405,0.833838,0.836573,0.837892,0.837371,0.835241,0.817973


In [144]:
%%time
logreg_age = LogisticRegressionCV(
    Cs=[0.05, 0.5, 3, 5, 10, 100],
    max_iter=10_000, 
    scoring="f1_weighted",
    random_state=777,
    n_jobs=-1,
)
logreg_age.fit(x_train_age, y_train_age)

CPU times: user 2h 43min 42s, sys: 3h 20min 54s, total: 6h 4min 37s
Wall time: 1h 58min 35s


In [145]:
print(
    m.classification_report(
        train_age["age_bucket"].to_numpy().astype(int), 
        logreg_age.predict(x_train_age),
        target_names=['18-25','25-34', '35-44', '45-54', '55-65', '65+']
    )
)

              precision    recall  f1-score   support

       18-25       0.61      0.38      0.46     32641
       25-34       0.53      0.71      0.60     87270
       35-44       0.46      0.56      0.51     77486
       45-54       0.47      0.28      0.36     42442
       55-65       0.54      0.24      0.33     23580
         65+       0.74      0.07      0.12      5503

    accuracy                           0.51    268922
   macro avg       0.56      0.37      0.40    268922
weighted avg       0.51      0.51      0.49    268922



In [147]:
pd.DataFrame(sum(logreg_age.scores_.values()) / len(logreg_age.scores_), columns=logreg_age.Cs_)

Unnamed: 0,0.05,0.50,3.00,5.00,10.00,100.00
0,0.3679,0.397398,0.407507,0.407621,0.40842,0.401942
1,0.372047,0.401129,0.41053,0.410503,0.411582,0.405949
2,0.367805,0.396387,0.408121,0.40901,0.409015,0.403008
3,0.364925,0.394119,0.403832,0.405826,0.406967,0.399794
4,0.371788,0.396856,0.408561,0.409097,0.409274,0.403362


In [148]:
submission = test[["user_id"]].copy()
submission["age"] = logreg_age.predict(x_test_sex).astype(int)
submission["is_male"] = logreg_sex.predict_proba(x_test_sex)[:, 1]

In [149]:
submission.to_csv("../submissions/tfidf+logregcv-baseline-2.csv", index=False)

In [150]:
from sklearn.neighbors import KNeighborsClassifier

In [152]:
knn_sex = KNeighborsClassifier(n_jobs=-1)

In [153]:
knn_sex.fit(x_train_sex, y_train_sex)