In [1]:
import sqlite3

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
db = sqlite3.connect('../raw_db.db')

dct = pd.read_csv('../data/dict.csv').drop('Unnamed: 0', axis=1)

dct.columns = [
        'category1_id'
        , 'category2_id'
        , 'category3_id'
        , 'category1_name'
        , 'category2_name'
        , 'category3_name'
        , 'category'
]

dct.head()

Unnamed: 0,category1_id,category2_id,category3_id,category1_name,category2_name,category3_name,category
0,649,651,1043,Игры,Интеллектуальные игры,Брейн-ринг,Для ума
1,649,651,1040,Игры,Интеллектуальные игры,Викторины,Для ума
2,649,651,1042,Игры,Интеллектуальные игры,Иные интеллектуальные игры,Для ума
3,649,651,1041,Игры,Интеллектуальные игры,Квест,Для ума
4,649,650,516,Игры,Настольные игры,Иные настольные игры,Для ума


## Предсказание категорий

In [3]:
group_user_stats = pd.read_sql("""
    SELECT 
        category3_id as category_in
        , a.user_id as user_id
        , count(event_id) as n
        , max(is_woman) as is_woman
        , CAST((julianday('2023-03-01') - julianday(date_of_birth)) / 365 / 10 AS INT) * 10 as age_group
    FROM attend a
    INNER JOIN users u ON u.user_id = a.user_id
    INNER JOIN groups g ON a.group_id = g.group_id
    WHERE event_date < '2023-02-01'
        AND a.user_id % 5 = 0
    GROUP BY category_in, a.user_id
""", con=db).dropna().astype(int)

# group_info = pd.read_sql("""
#     SELECT group_id, category3_id as category_in
#     FROM groups
# """, con=db).dropna().astype(int) # (0)

# group_user_stats = group_user_stats.merge(group_info)

In [4]:
group_user_stats

Unnamed: 0,category_in,user_id,n,is_woman,age_group
0,102,101347130,55,1,70
1,102,101353115,2,1,60
2,102,101356375,52,1,60
3,102,101357310,2,1,60
4,102,101357600,10,1,70
...,...,...,...,...,...
52486,1813,101430280,5,1,70
52487,1813,101435935,2,1,60
52488,1813,101435950,3,0,60
52489,1813,101436515,3,1,70


In [5]:
age_rank = pd.read_sql("SELECT category3_id as category_out, is_woman, age_group, rank as rank_age FROM model_age_category", con=db).dropna()
age_rank.head()

Unnamed: 0,category_out,is_woman,age_group,rank_age
0,131,0,90,1
1,329,1,40,1
2,114,0,40,2
3,144,0,40,2
4,104,0,40,2


In [6]:
category_rank = pd.read_sql("""
SELECT i as category_in, j as category_out, rank as rank_cat 
FROM model_category_similarity
""", con=db).dropna()
category_rank.head()

Unnamed: 0,category_in,category_out,rank_cat
0,102,492,1.0
1,102,151,2.0
2,102,458,3.0
3,102,470,4.0
4,102,1016,5.0


In [7]:
category_rank = pd.concat([
    category_rank, 
    pd.DataFrame([(i, i, 0) for i in dct['category3_id']], columns=category_rank.columns)
])

In [8]:
n_visits = group_user_stats.groupby('user_id', as_index=False).agg({'n': 'sum'})
n_visits.columns = ['user_id', 'user_visit_total']

In [9]:
group_user_stats = group_user_stats.merge(category_rank)

In [10]:
group_user_stats['is_same'] = (group_user_stats['category_out'] == group_user_stats['category_in']) * group_user_stats['n']

In [14]:
group_user_stats['rank_weight'] = group_user_stats['rank_cat'] * group_user_stats['n']

In [15]:
group_user_stats_grouped = group_user_stats.groupby(['user_id', 'category_out'], as_index=False).agg({
    'age_group': 'first',
    'is_woman': 'first',
    'n': 'sum',
    'rank_cat': 'sum',
    'category_in': 'count',
    'is_same': 'max',
})

In [16]:
group_user_stats_grouped = group_user_stats_grouped.merge(n_visits)
group_user_stats_grouped.head()

Unnamed: 0,user_id,category_out,age_group,is_woman,n,rank_cat,category_in,is_same,user_visit_total
0,101346565,102,60,1,30,281.0,2,0,30
1,101346565,104,60,1,30,68.0,2,0,30
2,101346565,111,60,1,13,244.0,1,0,30
3,101346565,112,60,1,30,60.0,2,0,30
4,101346565,114,60,1,30,33.0,2,0,30


In [17]:
group_user_stats_grouped['norm_rank_cat'] = (-1 * group_user_stats_grouped['rank_cat'] / group_user_stats_grouped['n']) / group_user_stats_grouped['user_visit_total']

In [18]:
group_user_stats_grouped = group_user_stats_grouped.merge(
    group_user_stats_grouped[['user_id', 'age_group', 'is_woman']].drop_duplicates().merge(age_rank)
    , how='outer'
)

In [19]:
group_user_stats_grouped = group_user_stats_grouped[['user_id', 'category_out', 'is_same', 'norm_rank_cat', 'rank_age']]

In [20]:
group_user_stats_grouped = group_user_stats_grouped.fillna({'is_same': 0, 'norm_rank_cat': 50})

In [21]:
group_user_stats_grouped

Unnamed: 0,user_id,category_out,is_same,norm_rank_cat,rank_age
0,101346565,102,0.0,-0.312222,131
1,101346565,104,0.0,-0.075556,8
2,101346565,111,0.0,-0.625641,266
3,101346565,112,0.0,-0.066667,13
4,101346565,114,0.0,-0.036667,5
...,...,...,...,...,...
2887879,101440140,1469,0.0,50.000000,162
2887880,101440140,159,0.0,50.000000,162
2887881,101440140,466,0.0,50.000000,162
2887882,101440140,244,0.0,50.000000,162


In [22]:
group_user_stats_target = pd.read_sql("""
    SELECT 
        category3_id as category_out
        , a.user_id as user_id
        , count(a.user_id) as n_occurred
        , 1 as has_occurred
    FROM attend a
    INNER JOIN users u ON u.user_id = a.user_id
    INNER JOIN groups g ON a.group_id = g.group_id
    WHERE event_date >= '2023-02-01'
        AND a.user_id % 5 = 0
    GROUP BY category_out, a.user_id
""", con=db).dropna().astype(int)

In [23]:
group_user_stats_data = group_user_stats_grouped.merge(group_user_stats_target, on=['user_id', 'category_out'], how='left')

In [24]:
group_user_stats_data = group_user_stats_data[group_user_stats_data['user_id'].isin(group_user_stats_target['user_id'].unique())]

In [25]:
group_user_stats_data = group_user_stats_data.fillna(0)

In [26]:
group_user_stats_data.head(10)

Unnamed: 0,user_id,category_out,is_same,norm_rank_cat,rank_age,n_occurred,has_occurred
0,101346565,102,0.0,-0.312222,131,0.0,0.0
1,101346565,104,0.0,-0.075556,8,0.0,0.0
2,101346565,111,0.0,-0.625641,266,0.0,0.0
3,101346565,112,0.0,-0.066667,13,0.0,0.0
4,101346565,114,0.0,-0.036667,5,0.0,0.0
5,101346565,115,0.0,-0.076667,48,0.0,0.0
6,101346565,118,0.0,-0.352222,110,0.0,0.0
7,101346565,119,0.0,-0.06,11,0.0,0.0
8,101346565,120,0.0,-0.064444,18,0.0,0.0
9,101346565,121,0.0,-0.012222,6,0.0,0.0


In [27]:
from sklearn.model_selection import train_test_split

In [28]:
users_train, users_test = train_test_split(group_user_stats_data['user_id'].unique(), test_size=0.2)

In [29]:
i_train = group_user_stats_data['user_id'].isin(set(users_train))
i_test = group_user_stats_data['user_id'].isin(set(users_test))

X_train = group_user_stats_data[i_train].set_index(['user_id', 'category_out'])[['norm_rank_cat', 'rank_age', 'is_same']]
X_test = group_user_stats_data[i_test].set_index(['user_id', 'category_out'])[['norm_rank_cat', 'rank_age', 'is_same']]
y_train = group_user_stats_data[i_train].set_index(['user_id', 'category_out'])['has_occurred']
y_test = group_user_stats_data[i_test].set_index(['user_id', 'category_out'])['has_occurred']

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [31]:
model = LogisticRegression(class_weight='balanced')

model.fit(X_train, y_train)

In [32]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99    517604
         1.0       0.52      0.87      0.65      5614

    accuracy                           0.99    523218
   macro avg       0.76      0.93      0.82    523218
weighted avg       0.99      0.99      0.99    523218



In [33]:
x_see = X_test.copy()
x_see['n_occurred'] = group_user_stats_data[group_user_stats_data['user_id'].isin(set(users_test))]['n_occurred'].values
x_see['has_occurred'] = group_user_stats_data[group_user_stats_data['user_id'].isin(set(users_test))]['has_occurred'].values
x_see['predict'] = model.predict_proba(X_test)[:, 1]
x_see = x_see.reset_index()

In [34]:
# sns.lmplot('norm_rank_cat', 'rank_age', data=x_see)

In [35]:
x_see.head()

Unnamed: 0,user_id,category_out,norm_rank_cat,rank_age,is_same,n_occurred,has_occurred,predict
0,101346645,102,-0.028979,124,0.0,0.0,0.0,0.065431
1,101346645,104,-0.000865,8,0.0,0.0,0.0,0.441521
2,101346645,111,-0.048227,268,0.0,0.0,0.0,0.003442
3,101346645,112,-0.002595,19,0.0,0.0,0.0,0.385832
4,101346645,114,-0.000433,7,0.0,0.0,0.0,0.44668


In [36]:
x_see[['norm_rank_cat', 'rank_age', 'n_occurred']].corr()

Unnamed: 0,norm_rank_cat,rank_age,n_occurred
norm_rank_cat,1.0,0.189187,-0.007837
rank_age,0.189187,1.0,-0.113672
n_occurred,-0.007837,-0.113672,1.0


In [37]:
x_see = x_see.sort_values(by='predict', ascending=False).groupby('user_id', as_index=False).agg({
    'predict': list,
    'category_out': list,
    'rank_age': list,
    'norm_rank_cat': list,
})

In [38]:
x_see.columns = ['user_id', 'predict', 'category_pred', 'age', 'cat']

In [39]:
# x_see['n_predictions'] = x_see['category_out'].apply(len)
# x_see['ok'] = x_see[['category_out', 'has_occurred']].apply(
#     lambda x: [i for i, v in enumerate(zip(x['category_out'], x['has_occurred'])) if v[1] == 1]
#     , axis=1
# )

In [40]:
group_user_stats_target_all = group_user_stats_target.sort_values(by='n_occurred', ascending=False).groupby('user_id', as_index=False).agg({
    'n_occurred': list,
    'category_out': list,
    'category_out': list
})

In [41]:
x_see = x_see.merge(group_user_stats_target_all)

In [42]:
x_see['share_uq'] = x_see.apply(
    lambda x: np.average([1 if real in x['category_pred'][:10] else 0 for real in x['category_out'][:10]])
    , axis=1
)

In [43]:
x_see['share_uq'].mean()

0.9074260200928722

In [44]:
x_see['share_n'] = x_see.apply(
    lambda x: 
    np.sum([n if real in x['category_pred'][:10] else 0 for real, n in zip(x['category_out'], x['n_occurred'])]) /
    np.sum(x['n_occurred'][:10])
    , axis=1
)
x_see['share_n'].mean()

0.934457649818282

In [45]:
print(model.intercept_)
for n, c in zip(X_train.columns, model.coef_[0]):
    print(c, n)

[-0.06780895]
-0.002193173948791903 norm_rank_cat
-0.02089799488720552 rank_age
0.8740604068970466 is_same


In [46]:
from collections import Counter, defaultdict

In [47]:
extra = Counter()
missed = Counter()
correct = Counter()

for i in x_see.index:
    row = x_see.loc[i]
    pred = set(row['category_pred'][:10])
    real = set(row['category_out'])
    correct += Counter(pred & real)
    extra += Counter(pred - real)
    missed += Counter(real - pred)

In [48]:
dct[dct['category3_id'].isin([i[0] for i in extra.most_common(10)])]

Unnamed: 0,category1_id,category2_id,category3_id,category1_name,category2_name,category3_name,category
13,631,633,131,Образование,Здорово жить,Здорово жить,Для ума
38,631,630,122,Образование,Информационные технологии,Осваиваем мобильные устройства,Для ума
43,631,634,121,Образование,"История, искусство, краеведение",История искусства,Для ума
81,631,1363,1369,Образование,Пеший лекторий,Краеведение и пешие прогулки,Для ума
376,608,612,329,Творчество,Красота и стиль,Мастер-класс по уходу за кожей в зрелом возрасте,Для души
431,589,596,130,Физическая активность,Гимнастика,Гимнастика,Для тела
434,589,596,114,Физическая активность,Гимнастика,Здоровая спина,Для тела
438,589,596,142,Физическая активность,Гимнастика,Суставная гимнастика,Для тела
448,589,591,104,Физическая активность,ОФП,ОФП,Для тела
451,589,597,139,Физическая активность,Скандинавская ходьба,Скандинавская ходьба,Для тела


In [49]:
correct.most_common(10)

[(130, 256),
 (329, 242),
 (142, 190),
 (114, 159),
 (139, 149),
 (104, 133),
 (1421, 122),
 (1369, 120),
 (122, 119),
 (168, 115)]

In [50]:
missed.most_common(10)

[(119, 22),
 (1421, 21),
 (1213, 20),
 (1715, 19),
 (131, 18),
 (121, 18),
 (1230, 16),
 (120, 16),
 (142, 15),
 (112, 13)]

In [44]:
extra.most_common(10)

[(130, 6775),
 (142, 6223),
 (1369, 5704),
 (329, 5545),
 (114, 5400),
 (121, 4366),
 (104, 4056),
 (122, 3189),
 (131, 2953),
 (139, 2128)]

In [45]:
x_see

Unnamed: 0,user_id,predict,category_pred,age,cat,n_occurred,category_out,share_uq,share_n
0,101346567,"[0.9999999999999998, 0.9999999594762735, 0.478...","[157, 145, 130, 329, 142, 1369, 114, 121, 131,...","[50, 38, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12...","[0.6290322580645161, 0.7580645161290323, 0.225...","[7, 7]","[145, 157]",1.000000,1.000000
1,101346579,"[1.0, 1.0, 0.981688786636945, 0.47879003042703...","[119, 142, 123, 130, 329, 1369, 114, 121, 131,...","[11, 3, 62, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 13...","[0.2446043165467626, 0.14388489208633093, 1.02...","[7, 7]","[142, 119]",1.000000,1.000000
2,101346581,"[1.0, 0.9999687434367829, 0.8347322632801679, ...","[179, 177, 114, 145, 130, 329, 142, 1369, 121,...","[51, 56, 5, 38, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11...","[0.9102564102564102, 0.8717948717948718, 0.294...","[7, 2, 1]","[179, 192, 145]",0.666667,0.800000
3,101346604,"[0.9999999994286104, 0.47878328576836804, 0.47...","[139, 130, 122, 142, 1369, 329, 114, 104, 121,...","[6, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14...","[0.0, 0.125, 0.25, 0.2916666666666667, 0.08333...",[4],[139],1.000000,1.000000
4,101346621,"[1.0, 0.9999999999995925, 0.9999688217994617, ...","[114, 329, 177, 148, 455, 130, 142, 1369, 121,...","[5, 2, 56, 26, 72, 1, 3, 4, 6, 7, 8, 9, 10, 11...","[0.5169491525423728, 0.9322033898305084, 1.957...","[5, 4, 2, 2]","[114, 130, 455, 329]",1.000000,1.000000
...,...,...,...,...,...,...,...,...,...
9020,101448277,"[0.6891822526119995, 0.4804421798601842, 0.473...","[122, 130, 142, 1369, 139, 329, 114, 104, 121,...","[2, 1, 3, 4, 6, 5, 7, 8, 9, 10, 11, 13, 12, 14...","[3.5, 3.0, 9.5, 8.5, 21.0, 8.0, 15.0, 21.0, 7....","[7, 4]","[122, 1277]",0.500000,0.636364
9021,101448279,"[0.5840047020875341, 0.4893948823988807, 0.485...","[141, 114, 329, 130, 1286, 142, 1369, 1213, 19...","[23, 3, 1, 2, 7, 4, 5, 6, 10, 9, 8, 13, 11, 14...","[0.0, 37.0, 12.0, 15.0, 50.0, 16.0, 24.0, 17.0...","[12, 8]","[141, 1111]",0.500000,0.600000
9022,101448281,"[0.6348386265174426, 0.4843398993688773, 0.480...","[119, 122, 130, 142, 1369, 139, 329, 104, 114,...","[13, 2, 1, 3, 4, 6, 5, 8, 7, 9, 10, 12, 11, 15...","[0.0, 19.0, 3.0, 4.0, 13.0, 23.0, 2.0, 16.0, 5...","[6, 2]","[119, 120]",0.500000,0.750000
9023,101448288,"[0.6495747213058821, 0.48217365910309806, 0.47...","[192, 329, 130, 114, 142, 1369, 1213, 1286, 16...","[10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14...","[0.0, 6.0, 2.0, 1.0, 3.0, 9.0, 14.0, 18.0, 17....","[6, 4, 3, 3, 1, 1, 1]","[1289, 1724, 192, 1421, 1704, 1788, 1230]",0.142857,0.157895


In [46]:
# x_see['ok'].apply(lambda x: sum(1 for i in x if i < 10)).mean()

In [47]:
# x_see['ok'].apply(lambda x: sum(1 for i in x if i < 10)).mean()

In [48]:
# x_see['ok'].apply(lambda x: sum(1 for i in x)).mean()