In [13]:
import sqlite3

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
# Shared SQLite connection; every query and to_sql call in this notebook uses it.
db = sqlite3.connect('../raw_db.db')

# Category dictionary. The CSV carries a leftover index column ('Unnamed: 0'), dropped here.
dct = pd.read_csv('../data/dict.csv').drop(columns='Unnamed: 0')

# Positional rename of the seven remaining columns: three ids, three names, and `category`.
dct.columns = [
    'category1_id', 'category2_id', 'category3_id',
    'category1_name', 'category2_name', 'category3_name',
    'category',
]

dct.head()

Unnamed: 0,category1_id,category2_id,category3_id,category1_name,category2_name,category3_name,category
0,649,651,1043,Игры,Интеллектуальные игры,Брейн-ринг,Для ума
1,649,651,1040,Игры,Интеллектуальные игры,Викторины,Для ума
2,649,651,1042,Игры,Интеллектуальные игры,Иные интеллектуальные игры,Для ума
3,649,651,1041,Игры,Интеллектуальные игры,Квест,Для ума
4,649,650,516,Игры,Настольные игры,Иные настольные игры,Для ума


## Совместная встречаемость категорий

In [7]:
# Per (group_id, user_id) attendance before 2023-02-01, joined to the user's attributes.
# `n` counts attend rows with a non-null event_id; `age` is days since date_of_birth
# up to 2023-03-01 divided by 365.
group_user_query = """
    SELECT 
        group_id
        , attend.user_id as user_id
        , count(event_id) as n
        , postal_code
        , max(is_woman) as is_woman
        , (julianday('2023-03-01') - julianday(date_of_birth)) / 365 as age
    FROM attend
    LEFT JOIN users u ON u.user_id = attend.user_id
    WHERE event_date < '2023-02-01'
    GROUP BY group_id, attend.user_id
"""

# Category ids and online/mobile flags of every group.
group_info_query = """
    SELECT group_id, category1_id, category2_id, category3_id, is_online, is_mobile
    FROM groups
"""

group_user_stats = pd.read_sql(group_user_query, con=db)

group_info = (
    pd.read_sql(group_info_query, con=db)
    .dropna()       # rows with any missing value are removed before the integer cast
    .astype(int)
)

# Inner join on the only column the two frames share.
group_user_stats = group_user_stats.merge(group_info, on='group_id')

In [8]:
# Total attendance count `n` per (category3_id, user_id) pair, kept as regular columns.
cat3_user = (
    group_user_stats
    .groupby(['category3_id', 'user_id'], as_index=False)['n']
    .sum()
)

In [9]:
# Number of distinct category3_id values present in cat3_user.
cat3_user['category3_id'].nunique()

291

In [10]:
from collections import defaultdict
from itertools import combinations
from math import log2

In [11]:
# pairwise_dict[a][b]: number of users whose category list contains both a and b
#                      (filled symmetrically, so [a][b] == [b][a]).
# uni_dict[a]:         incremented once for every pair that involves a.
# NOTE(review): uni_dict therefore counts pairs, not users, and never sees users with a
# single category - confirm this is the intended normaliser for the similarity score.
pairwise_dict = defaultdict(lambda: defaultdict(int))
uni_dict = defaultdict(int)

# One list of category3_id values per user.
categories_per_user = cat3_user.groupby('user_id')['category3_id'].agg(list)

for user_categories in tqdm(categories_per_user.values):
    for first, second in combinations(user_categories, 2):
        pairwise_dict[first][second] += 1
        pairwise_dict[second][first] += 1
        uni_dict[first] += 1
        uni_dict[second] += 1

  0%|          | 0/49831 [00:00<?, ?it/s]

In [12]:
# Similarity score for every co-occurring category pair:
#   log2(1 + pair[a][b] / uni[a] * pair[b][a] / uni[b] * 1_000_000)
# Alternative formula left disabled by the author:
#   50 + log2(pair[a][b] * 2 / (uni[a] + uni[b]))
result_dict = defaultdict(dict)
for cat_a, partners in pairwise_dict.items():
    for cat_b in partners:
        # Operand order is kept as-is so floating-point results do not shift.
        result_dict[cat_a][cat_b] = np.log2(
            1 + pairwise_dict[cat_a][cat_b] / uni_dict[cat_a] * pairwise_dict[cat_b][cat_a] / uni_dict[cat_b] * 1000000
        )

In [13]:
# Convert the outer defaultdict to a plain dict, so looking up an unknown category
# raises KeyError instead of silently inserting an empty entry.
result_dict = dict(result_dict)

In [14]:
# Show one random dictionary row; no random_state is passed, so the row differs between runs.
dct.sample()

Unnamed: 0,category1_id,category2_id,category3_id,category1_name,category2_name,category3_name,category
113,629,628,460,Рисование,Рисование,Китайская живопись У-Син,Для души


In [15]:
# Manual sanity check: print the 25 categories most similar to category3_id == I.
I = 460
print(dct.loc[dct['category3_id'] == I, 'category3_name'].iloc[0])
print('-' * 100)

label_columns = ['category', 'category1_name', 'category2_name', 'category3_name']
top_neighbours = sorted(result_dict[I].items(), key=lambda pair: pair[1], reverse=True)[:25]

for other_id, score in top_neighbours:
    # The first dictionary row for this id provides the four-part label.
    label_parts = dct[dct['category3_id'] == other_id].iloc[0][label_columns].values
    print(
        round(score), '\t\t',
        '::'.join(label_parts)
    )

Китайская живопись У-Син
----------------------------------------------------------------------------------------------------
10 		 Для души::Рисование::Рисование::Различные техники рисования
8 		 Для души::Рисование::Рисование::ИЗО
8 		 Для души::Спецпроект / Интеллектуальный клуб::Интеллектуальный клуб. Творческие мастерские::Акварельная живопись
8 		 Для души::Рисование::Рисование::Рисование анти-стресс
8 		 Для души::Рисование::Рисование::Графика
7 		 Для души::Творчество::Художественно-прикладное творчество::Рукоделие и творчество
7 		 Для ума::Образование::История, искусство, краеведение::История искусства
7 		 Для ума::Образование::Пеший лекторий::Краеведение и онлайн-экскурсии
7 		 Для ума::Образование::Здорово жить::Здорово жить
7 		 Для ума::Спецпроект / Интеллектуальный клуб::Интеллектуальный клуб. Психологические тренинги::Психологические тренинги
7 		 Для тела::Физическая активность::Гимнастика::Гимнастика
6 		 Для души::Творчество::Красота и стиль::Мастер-класс по уходу з

In [16]:
# Flatten the nested score dict into (i, j, score) tuples, scores rounded to 5 decimals.
# Iteration order of the nested dicts is preserved.
result_flat = [
    (source_id, target_id, round(score, 5))
    for source_id, scores in result_dict.items()
    for target_id, score in scores.items()
]

In [17]:
# One row per ordered category pair (i, j) with its similarity score; because the scores
# were stored symmetrically, every unordered pair appears twice.
cat3_similarity = pd.DataFrame(result_flat, columns=['i', 'j', 'similarity'])

In [18]:
# Order rows by category i, and within each i by descending similarity.
cat3_similarity = cat3_similarity.sort_values(by=['i', 'similarity'], ascending=[True, False])

In [15]:
# n_categories = dct['category3_id'].nunique()

In [19]:
# Rank the partners of each category i by similarity: the highest score gets rank 1,
# ties share the average rank (pandas default).
cat3_similarity['rank'] = (
    cat3_similarity
    .groupby('i')['similarity']
    .rank(ascending=False)
)

In [17]:
# df_i = dct[['category3_id']]
# df_i.columns = ['i']
# df_j = dct[['category3_id']]
# df_j.columns = ['j']

# cat_cross = df_i.merge(df_j, how='cross')
# cat_cross = cat_cross[cat_cross['i'] != cat_cross['j']]

In [18]:
# cat3_similarity = cat_cross.merge(cat3_similarity, how='left').fillna({'similarity': 0, 'rank': n_categories})

In [20]:
# Final ordering: by category i, then by ascending rank (most similar partner first).
cat3_similarity = cat3_similarity.sort_values(by=['i', 'rank'], ascending=True)

In [21]:
# Persist the category similarity table to SQLite (the DataFrame index is written too,
# which is the to_sql default).
# if_exists='replace' keeps this cell re-runnable: with the pandas default ('fail') a second
# run raises ValueError because the table already exists. This matches how the
# model_age_category table is written further down in this notebook.
cat3_similarity.to_sql('model_category_similarity', con=db, if_exists='replace')

## Возрастная статистика

In [15]:
# Distinct (user, category) combinations attended before 2023-02-01, with the user's
# rounded age as of 2023-03-01 (days / 365) and a constant marker column `i` = 1.
user_age_query = """
    SELECT 
        DISTINCT 
        users.user_id
        , round((julianday('2023-03-01') - julianday(date_of_birth)) / 365) as person_age
        , is_woman
        , category1_id
        , category2_id
        , category3_id
        , 1 as i
    FROM attend
    INNER JOIN users ON attend.user_id = users.user_id
    INNER JOIN groups ON attend.group_id = groups.group_id
    WHERE event_date < '2023-02-01'
"""

user_age_group_raw = (
    pd.read_sql(user_age_query, con=db)
    .dropna()       # rows with any missing value are removed before the integer cast
    .astype(int)
)
user_age_group_raw.shape

(263414, 7)

In [16]:
# Wide table: one row per (user_id, person_age, is_woman), one column per category3_id.
# The source column `i` is the constant 1 from the query, so after fillna(0) each cell is
# 1 where the user attended that category and 0 otherwise.
user_age_group = user_age_group_raw.pivot_table(index=['user_id', 'person_age', 'is_woman'], columns='category3_id', values='i').fillna(0).astype(int)
user_age_group = user_age_group.reset_index()
# Decade bucket: floor the age to a multiple of 10 (e.g. 67 -> 60).
user_age_group['age_group'] = (user_age_group['person_age'] // 10) * 10

# Mean of every category column within each (is_woman, age_group) cell, i.e. the share of
# users in that cell who attended the category.
user_age_group_mean = user_age_group.drop(['user_id', 'person_age'], axis=1).groupby(['is_woman', 'age_group']).agg('mean')

# category3_id -> (category3_id, category, category1_name, category2_name, category3_name)
names = {i: (i, c, x, y, z) for i, c, x, y, z in dct[['category3_id', 'category', 'category1_name', 'category2_name', 'category3_name']].drop_duplicates().sort_values(by='category3_id').values}

# Relabel the category columns with the tuples above; an id missing from `dct` becomes None.
user_age_group_mean.columns = [names.get(i) for i in user_age_group_mean.columns]
# Transpose so categories become rows and the (is_woman, age_group) cells become columns.
user_age_group_mean = user_age_group_mean.T

# Back to long form: one row per (category label, is_woman, age_group) with its mean value.
user_age_group_mean = user_age_group_mean.reset_index().melt(id_vars=['index'])
# Keep only the first tuple element, the category3_id.
# NOTE(review): this raises on a None label - confirm every attended category3_id is in dct.
user_age_group_mean['index'] = user_age_group_mean['index'].apply(lambda x: x[0])
user_age_group_mean.columns = ['category3_id', 'is_woman', 'age_group', 'age_cat_prob']

user_age_group_mean.head()

Unnamed: 0,category3_id,is_woman,age_group,age_cat_prob
0,102,0,40,0.0
1,104,0,40,0.285714
2,111,0,40,0.0
3,112,0,40,0.0
4,114,0,40,0.285714


In [17]:
# Most probable categories first, then rank categories inside every (age_group, is_woman)
# cell: the highest probability gets rank 1, ties share the average rank (pandas default).
user_age_group_mean = user_age_group_mean.sort_values(by='age_cat_prob', ascending=False)
user_age_group_mean['rank'] = (
    user_age_group_mean
    .groupby(['age_group', 'is_woman'])['age_cat_prob']
    .rank(ascending=False)
)

In [19]:
# Cast ranks to int; fractional average ranks produced by ties are truncated
# (e.g. four categories tied for places 2-5 all end up with rank 3).
user_age_group_mean['rank'] = user_age_group_mean['rank'].astype(int)

In [20]:
# Spot check: the ranking for is_woman == 1 in the 90s age bucket.
woman_mask = user_age_group_mean['is_woman'] == 1
age_90_mask = user_age_group_mean['age_group'] == 90
user_age_group_mean[woman_mask & age_90_mask]

Unnamed: 0,category3_id,is_woman,age_group,age_cat_prob,rank
3220,139,1,90,0.170213,1
3216,131,1,90,0.138298,3
3223,142,1,90,0.138298,3
3440,1421,1,90,0.138298,3
3211,122,1,90,0.138298,3
...,...,...,...,...,...
3238,160,1,90,0.000000,187
3237,159,1,90,0.000000,187
3236,158,1,90,0.000000,187
3234,156,1,90,0.000000,187


In [22]:
# Persist the table rounded to 6 decimals, overwriting any existing model_age_category
# table; the DataFrame index is written as well (to_sql default).
user_age_group_mean.round(6).to_sql('model_age_category', con=db, if_exists='replace')

3492

## Соседи

In [4]:
# For every (group_id, postal_code): distinct attending users with that postal code
# between 2022-11-01 (inclusive) and 2023-02-01 (exclusive). n_neighbors is that user
# count minus one, and has_neighbors is the constant 1.
# NOTE(review): HAVING n_neighbors > 1 keeps only cells with at least three users, so a
# user with exactly one neighbour gets no row - confirm this threshold is intended.
neighbors_query = """
    SELECT group_id, postal_code, 1 as has_neighbors, count() - 1 as n_neighbors
    FROM (
        SELECT DISTINCT 
            group_id
            , users.postal_code
            , attend.user_id
        FROM attend
        LEFT JOIN users ON attend.user_id = users.user_id
        WHERE postal_code IS NOT NULL 
                AND event_date >= '2022-11-01'
                AND event_date < '2023-02-01'
    )
    GROUP BY group_id, postal_code
    HAVING n_neighbors > 1
"""

neighbors = pd.read_sql(neighbors_query, con=db)
neighbors.shape

(18329, 4)

In [5]:
# Persist the neighbour statistics to SQLite (the DataFrame index is written too, which is
# the to_sql default).
# if_exists='replace' keeps this cell re-runnable: with the pandas default ('fail') a second
# run raises ValueError because the table already exists. This matches how the
# model_age_category table is written earlier in this notebook.
neighbors.to_sql('model_neighbors', con=db, if_exists='replace')

18329