In [1]:
# ! pip3 install --upgrade pandas

In [2]:
import sqlite3

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

tqdm.pandas()

In [3]:
db = sqlite3.connect('../raw_db.db')

## Группы в феврале

In [4]:
group_time_target = pd.read_sql("""
    SELECT 
        group_timetable.group_id 
        , weekday
        , CAST(start_time AS INT) as start_time
        , CAST(finish_time AS INT) as finish_time
        , CASE WHEN finish_date > '2023-02-28' AND start_date < '2023-02-01' 
            THEN julianday('2023-02-28') - julianday('2023-02-01')
            WHEN finish_date > '2023-02-28' THEN julianday('2023-02-28') - julianday(start_date)
            WHEN start_date < '2023-02-01' THEN julianday(finish_date) - julianday('2023-02-01')
            ELSE julianday(finish_date) - julianday(start_date)
            END as n_days
    FROM group_timetable
    WHERE start_date < '2023-02-28' AND finish_date > '2023-02-01'
""", con=db, dtype={
    'group_id': 'int32', 'weekday': 'int8', 'start_time': 'int8', 'finish_time': 'int8', 'n_days': 'int8'
})

In [5]:
def week_ones(d, s, f, n=1, padding=0):
    a = np.zeros((7, 17), dtype='int8')
    a[d - 1, max((s - 6 - padding), 0):(f - 6 + padding)] = n
    return a

In [6]:
group_time_target['timetable'] = group_time_target.apply(lambda x: week_ones(x['weekday'], x['start_time'], x['finish_time']), axis=1)

In [7]:
group_time_target = group_time_target.groupby('group_id', as_index=False).agg({'timetable': sum, 'n_days': 'min'})

In [8]:
group_time_target.head()

Unnamed: 0,group_id,timetable,n_days
0,801346553,"[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27
1,801346572,"[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27
2,801346579,"[[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27
3,801346585,"[[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",13
4,801346609,"[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27


In [9]:
group_place_target = pd.read_sql("""
    SELECT 
        group_id
        , district_id
        , zone_id
        , COALESCE(code, 0) as g_postal
    FROM group_locations
    WHERE group_id IN (
        SELECT DISTINCT group_id 
        FROM group_timetable
        WHERE start_date < '2023-02-28' AND finish_date > '2022-02-01'
    )
""", con=db, dtype={'group_id': 'int32', 'district_id': 'int8', 'zone_id': 'int8', 'g_postal': 'int32'})

In [10]:
group_place_target = group_place_target.groupby('group_id', as_index=False).agg({
    'district_id': tuple,
    'zone_id': tuple,
    'g_postal': lambda x: tuple(i for i in x if i > 0)
})

In [11]:
group_info = pd.read_sql("""
    SELECT 
        group_id
        , category3_id
        , category2_id
        , is_online
        , is_mobile
    FROM groups
    WHERE group_id IN (
        SELECT DISTINCT group_id 
        FROM group_timetable
        WHERE start_date < '2023-02-28' AND finish_date > '2023-02-01'
    )
""", con=db, dtype={'group_id': 'int32', 'category3_id': 'int16', 'category2_id': 'int16', 'is_online': 'int8', 'is_mobile': 'int8'})

In [12]:
group_target = group_time_target.merge(group_place_target).merge(group_info)

In [13]:
group_target.head()

Unnamed: 0,group_id,timetable,n_days,district_id,zone_id,g_postal,category3_id,category2_id,is_online,is_mobile
0,801346553,"[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27,"(5,)","(46,)","(105122,)",104,591,0,0
1,801346579,"[[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27,"(5,)","(16,)","(107065,)",141,609,0,0
2,801346585,"[[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",13,"(5,)","(23,)","(111141,)",136,630,0,0
3,801346609,"[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27,"(1, 1)","(62, 2)","(109125, 109341)",178,596,0,0
4,801346613,"[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27,"(1, 1)","(62, 2)","(109125, 109341)",123,626,0,0


In [14]:
postat_to_zd = pd.read_sql("""
    SELECT district_id, zone_id, code, count(group_id) as n
    FROM group_locations
    WHERE code IS NOT NULL
    GROUP BY district_id, zone_id, code
    ORDER BY n DESC
""", con=db, dtype={'district_id': 'int16', 'zone_id': 'int16', 'code': 'int32', 'n': 'int16'}).drop_duplicates(subset=['code'], keep='first')

code_dict = {c: (z, d) for d, z, c, _ in postat_to_zd.values}

In [15]:
group_target_real = pd.read_sql("""
    SELECT 
        attend.user_id
        , group_id
        , count(attend.user_id) as n
        , 1 as visited
        , COALESCE(postal_code, 0) as postal_code
    FROM attend
    LEFT JOIN users ON attend.user_id = users.user_id
    WHERE event_date >= '2023-02-01'
    GROUP BY attend.user_id, group_id
""", con=db, dtype={'user_id': 'int32', 'group_id': 'int32', 'n': 'int8', 'visited': 'int8', 'postal_code': 'int32'})

In [16]:
group_target_real = group_target_real.merge(group_target[['category2_id', 'category3_id', 'group_id']])

In [17]:
group_target_real.columns = ['user_id', 'group_id', 'n_visits', 'visited', 'u_postal', 'category2_id', 'category3_id']

In [18]:
group_target_real['u_district'] = group_target_real['u_postal'].apply(lambda x: code_dict.get(x, [0, 0])[1]).astype('int16')
group_target_real['u_zone'] = group_target_real['u_postal'].apply(lambda x: code_dict.get(x, [0, 0])[0]).astype('int16')

In [19]:
user_flat = group_target_real[['user_id', 'category3_id', 'u_postal', 'u_district', 'u_zone']].drop_duplicates()

In [20]:
all_combinations = group_target.merge(user_flat, on=['category3_id'])

In [21]:
geo_sim = all_combinations[['district_id', 'zone_id', 'g_postal', 'u_postal', 'u_district', 'u_zone']].drop_duplicates()

In [22]:
geo_sim['same_post'] = geo_sim[['u_postal', 'g_postal']].progress_apply(lambda x: x['u_postal'] in x['g_postal'], axis=1).astype('int8')
geo_sim['same_zone'] = geo_sim.progress_apply(lambda x: x['u_zone'] in x['zone_id'], axis=1).astype(int).astype('int8')
geo_sim['same_district'] = geo_sim.progress_apply(lambda x: x['u_district'] in x['district_id'], axis=1).astype(int).astype('int8')

  0%|          | 0/267031 [00:00<?, ?it/s]

  0%|          | 0/267031 [00:00<?, ?it/s]

  0%|          | 0/267031 [00:00<?, ?it/s]

In [23]:
# geo_sim['g_postal'].unique()

In [24]:
neighbors = pd.read_sql("""
    SELECT group_id, postal_code, n_neighbors
    FROM model_neighbors
""", con=db, dtype={'group_id': 'int32', 'postal_code': 'int32', 'n_neighbors': 'int16'})

In [25]:
all_combinations = all_combinations.merge(
    geo_sim, how='left'
).merge(
    neighbors, left_on=['group_id', 'u_postal'], right_on=['group_id', 'postal_code'], how='left'
)

In [26]:
all_combinations['n_neighbors'] = all_combinations['n_neighbors'].fillna(0).astype('int8')

In [27]:
all_combinations['same_district'].sum()

1273064

In [28]:
all_combinations.drop(['u_postal', 'u_district', 'u_zone', 'g_postal', 'zone_id', 'district_id', 'postal_code'], axis=1, inplace=True)

In [29]:
all_combinations.head()

Unnamed: 0,group_id,timetable,n_days,category3_id,category2_id,is_online,is_mobile,user_id,same_post,same_zone,same_district,n_neighbors
0,801346553,"[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27,104,591,0,0,101346666,0,0,0,0
1,801346553,"[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27,104,591,0,0,101355093,0,0,0,0
2,801346553,"[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27,104,591,0,0,101358634,0,0,0,0
3,801346553,"[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27,104,591,0,0,101358797,0,0,0,0
4,801346553,"[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",27,104,591,0,0,101363476,0,0,0,0


In [30]:
group_target_real = all_combinations.merge(
    group_target_real[['user_id', 'group_id', 'n_visits', 'visited']], 
    on=['user_id', 'group_id'], how='left').fillna(0)

In [31]:
group_target_real.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11983434 entries, 0 to 11983433
Data columns (total 14 columns):
 #   Column         Dtype  
---  ------         -----  
 0   group_id       int32  
 1   timetable      object 
 2   n_days         int8   
 3   category3_id   int16  
 4   category2_id   int16  
 5   is_online      int8   
 6   is_mobile      int8   
 7   user_id        int32  
 8   same_post      int8   
 9   same_zone      int8   
 10  same_district  int8   
 11  n_neighbors    int8   
 12  n_visits       float64
 13  visited        float64
dtypes: float64(2), int16(2), int32(2), int8(7), object(1)
memory usage: 491.4+ MB


In [32]:
group_target_real['n_visits'] = group_target_real['n_visits'].astype('int8')
group_target_real['visited'] = group_target_real['visited'].astype('int8')

In [33]:
len(group_target_real['user_id'].unique())

30159

In [34]:
del all_combinations

## Данные по пользователям ранее

In [35]:
visited_raw = pd.read_sql("""
    SELECT 
        user_id
        , category2_id
        , group_id
        , weekday
        , lower_bound
        , upper_bound
        , count(user_id) as n
        , is_online
    FROM (
        SELECT 
            attend.group_id
            , user_id
            , strftime('%w', event_date) + 1 as weekday
            , CAST(event_start AS INT) as lower_bound
            , CAST(event_end AS INT) as upper_bound
            , attend.is_online
            , category2_id
        FROM attend
        LEFT JOIN groups ON attend.group_id = groups.group_id
        WHERE event_date < '2023-02-01' AND event_date >= '2023-01-01' AND category2_id IS NOT NULL
    )
    GROUP BY user_id, category2_id, group_id, weekday, lower_bound, upper_bound, is_online
""", con=db, dtype={
    'user_id': 'int32', 'category2_id': 'int16', 'weekday': 'int8', 
    'lower_bound': 'int8', 'upper_bound': 'int8',
    'n': 'int8'
})

In [36]:
visited_raw['u_timetable'] = visited_raw.apply(
    lambda x: week_ones(x['weekday'], x['lower_bound'], x['upper_bound'], x['n'], padding=0)
, axis=1)

In [37]:
visited_raw['u_timetable_time'] = visited_raw.apply(
    lambda x: week_ones(x['weekday'], x['lower_bound'], x['upper_bound'], x['n'], padding=2).sum(axis=0)
, axis=1)

In [38]:
week_ones(1, 9, 12, 2, padding=2).sum(axis=0)

array([0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [39]:
user_real_visits = visited_raw.groupby(['user_id', 'group_id'], as_index=False).agg({'n': 'sum'})
user_real_visits.columns = ['user_id', 'group_id', 'target_n_visited']

In [40]:
from scipy.special import softmax

In [41]:
user_visit_time_general = visited_raw.groupby('user_id', as_index=False).agg({'u_timetable_time': sum})
# user_visit_time_general['u_timetable_time'] = user_visit_time_general['u_timetable_time'].apply(lambda x: np.clip(x, 0, 1))
user_visit_time_general['u_timetable_time'] = user_visit_time_general['u_timetable_time'].apply(lambda x: (softmax(x) / max(softmax(x))).round(2))

In [42]:
user_visit_time = visited_raw.groupby('user_id', as_index=False).agg({'u_timetable': sum})

In [43]:
user_visit_time.loc[20]['u_timetable']

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int8)

In [44]:
user_online = visited_raw.groupby('user_id', as_index=False).agg({'n': 'sum', 'is_online': 'sum'})

In [45]:
user_online['is_offline'] = user_online['n'] - user_online['is_online']

In [46]:
user_visit_time_cat = visited_raw.groupby(['user_id', 'category2_id'], as_index=False).agg({'u_timetable': sum})
user_visit_time_cat.columns = ['user_id', 'category2_id', 'u_cat_timetable']

In [47]:
user_online.head()

Unnamed: 0,user_id,n,is_online,is_offline
0,101346549,5,4,1
1,101346552,38,14,24
2,101346559,17,7,10
3,101346562,12,0,12
4,101346566,6,0,6


In [48]:
user_online.columns = ['user_id', 'n_visits', 'n_online', 'n_offline']

In [49]:
user_online[['n_online', 'n_offline']] = softmax(np.log(user_online[['n_online', 'n_offline']] + 1), axis=1)

In [50]:
user_online.head()

Unnamed: 0,user_id,n_visits,n_online,n_offline
0,101346549,5,0.714286,0.285714
1,101346552,38,0.375,0.625
2,101346559,17,0.421053,0.578947
3,101346562,12,0.071429,0.928571
4,101346566,6,0.125,0.875


In [51]:
user_online = user_online[['user_id', 'n_online', 'n_offline']]
user_online.columns = ['user_id', 'u_online', 'u_offline']

In [52]:
total = group_target_real.merge(
    user_visit_time, on='user_id'
).merge(
    user_online, on='user_id'
).merge(
    user_real_visits, on=['group_id', 'user_id'], how='left'
).merge(
    user_visit_time_cat, on=['category2_id', 'user_id'], how='left'
).merge(
    user_visit_time_general, on='user_id'
)

In [53]:
total['n_visits'] = total['n_visits'].fillna(0).astype('int8')
total['n_neighbors'] = total['n_neighbors'].astype('int16')
total = total[total['n_visits'] == 0]

total = total.merge(
    total[total['target_n_visited'] > 0][['user_id', 'category3_id']].drop_duplicates()
)

In [54]:
total.shape

(512702, 20)

In [55]:
total['u_cat_timetable'] = total['u_cat_timetable'].apply(lambda x: np.zeros((7, 17), dtype='int8') if type(x) == float else x)

In [56]:
total['is_offline'] = 1 - total['is_online'] 

In [57]:
total['u_online'] = total['u_online'].astype('float16')
total['u_offline'] = total['u_offline'].astype('float16')

In [58]:
total['unmatch_other'] = total.progress_apply(
    lambda x: 1 - ((1 - np.clip((x['u_timetable'] - x['u_cat_timetable']) / 4, 0, 1) ) * x['timetable']).sum() / x['timetable'].sum(),
    axis=1
)

  0%|          | 0/512702 [00:00<?, ?it/s]

In [59]:
total['daytime_match'] = total.progress_apply(
    lambda x: (x['u_timetable_time'] * x['timetable']).sum() / x['timetable'].sum(),
    axis=1
)

  0%|          | 0/512702 [00:00<?, ?it/s]

In [60]:
total['match_same'] = total.progress_apply(
    lambda x: 1 - ((1 - np.clip(x['u_cat_timetable'] / 4, 0, 1)) * x['timetable']).sum() / x['timetable'].sum(),
    axis=1
)

  0%|          | 0/512702 [00:00<?, ?it/s]

In [61]:
total['target_n_visited'] = total['target_n_visited'].fillna(0).astype('int16')
total['unmatch_other'] = total['unmatch_other'].astype('float32')
total['match_same'] = total['match_same'].astype('float32')

In [62]:
(total[total['daytime_match'] == 0]['target_n_visited'] > 0).mean()

0.003777833029449141

In [63]:
(total[total['daytime_match'] > 0]['target_n_visited'] > 0).mean()

0.0061998858416460545

In [64]:
total.drop(['u_timetable', 'u_cat_timetable', 'timetable', 'u_timetable_time'], axis=1, inplace=True)

In [65]:
total.columns

Index(['group_id', 'n_days', 'category3_id', 'category2_id', 'is_online',
       'is_mobile', 'user_id', 'same_post', 'same_zone', 'same_district',
       'n_neighbors', 'n_visits', 'visited', 'u_online', 'u_offline',
       'target_n_visited', 'is_offline', 'unmatch_other', 'daytime_match',
       'match_same'],
      dtype='object')

In [66]:
total.columns = ['group_id', 'n_days', 'category3_id', 'category2_id', 'g_online', 'g_mobile',
       'user_id', 'same_post', 'same_zone', 'same_district', 'n_neighbors',
       'n_visits', 'visited', 'u_online', 'u_offline',
       'target_n_visited', 'g_offline', 'unmatch_other', 'daytime_match', 'match_same']

In [67]:
total['x_online'] = (total['g_online'] * total['u_online']).round(4).astype('float16')
total['x_offline'] = (total['g_offline'] * total['u_offline']).round(4).astype('float16')

In [68]:
total.columns

Index(['group_id', 'n_days', 'category3_id', 'category2_id', 'g_online',
       'g_mobile', 'user_id', 'same_post', 'same_zone', 'same_district',
       'n_neighbors', 'n_visits', 'visited', 'u_online', 'u_offline',
       'target_n_visited', 'g_offline', 'unmatch_other', 'daytime_match',
       'match_same', 'x_online', 'x_offline'],
      dtype='object')

In [69]:
total = total[[
        'group_id', 'user_id', 'category3_id',
        'x_online', 'x_offline', 
        'same_post', 'same_zone', 'same_district',
        'unmatch_other', 
        'match_same', 
        'daytime_match',
        'n_neighbors', 
        'n_days',
        'target_n_visited'
]]

In [70]:
total.head()

Unnamed: 0,group_id,user_id,category3_id,x_online,x_offline,same_post,same_zone,same_district,unmatch_other,match_same,daytime_match,n_neighbors,n_days,target_n_visited
0,801346553,101346666,104,0.0,0.856934,0,0,0,0.0,0.0,1.0,0,27,0
1,801346676,101346666,104,0.0,0.856934,0,0,0,0.0,0.0,1.0,0,27,0
2,801346713,101346666,104,0.0,0.856934,0,0,0,0.0,0.0,1.0,0,27,0
3,801346765,101346666,104,0.0,0.856934,0,0,0,0.0,0.0,1.0,0,27,0
4,801346817,101346666,104,0.0,0.856934,0,0,0,0.0,0.0,1.0,0,27,0


In [71]:
total['has_neighbors'] = (total['n_neighbors'] > 0).astype('int8')
total['log_neighbors'] = np.log(total['n_neighbors'] + 1).astype('float32')
total['log_days'] = np.log(total['n_days'] + 1).astype('float32')

In [99]:
cols = [
#        'x_online', 'x_offline', 
        'same_post', 'same_zone', 'same_district',
        'unmatch_other', 'match_same', 'daytime_match',
        'has_neighbors', 'log_days'
    ]

In [100]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [101]:
users_train, users_test = train_test_split(total['user_id'].unique(), test_size=0.2)

In [102]:
i_train = total['user_id'].isin(set(users_train))
i_test = total['user_id'].isin(set(users_test))

X_train = total[i_train].set_index(['user_id', 'category3_id'])[cols]
X_test = total[i_test].set_index(['user_id', 'category3_id'])[cols]
y_train = (total[i_train].set_index(['user_id', 'category3_id'])['target_n_visited'] > 0).astype('int8')
y_test = (total[i_test].set_index(['user_id', 'category3_id'])['target_n_visited'] > 0).astype('int8')

In [103]:
model = LogisticRegression(class_weight='balanced')

model.fit(X_train, y_train)

In [104]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      0.88      0.94    105896
           1       0.03      0.61      0.05       556

    accuracy                           0.88    106452
   macro avg       0.51      0.75      0.49    106452
weighted avg       0.99      0.88      0.93    106452



In [105]:
print(model.intercept_)
for n, c in zip(X_train.columns, model.coef_[0]):
    print(c, n)

[0.00599535]
0.51795792874992 same_post
1.9495522695816145 same_zone
1.1933673301347765 same_district
0.4452992814512414 unmatch_other
0.6046535395812348 match_same
0.4433465530408858 daytime_match
2.8217687930726325 has_neighbors
-0.29277446050774936 log_days


In [106]:
x_see = X_test.copy()
x_see['n_occurred'] = total[i_test]['target_n_visited'].values
x_see['has_occurred'] = (total[i_test]['target_n_visited'] > 0).astype('int8').values
x_see['group_id'] = total[i_test]['group_id'].values
x_see['predict'] = model.predict_proba(X_test)[:, 1].round(8)
x_see['predict'] = x_see['predict']#.astype('float16')
# x_see['has_neighbors'] = (x_see['n_neighbors'] > 0).astype('int8')
x_see = x_see.reset_index()

x_see = x_see.sort_values(
#     by='predict'
#     , ascending=False
    by=['predict', 'same_district', 'same_zone', 'has_neighbors', 'same_post', 'group_id'] #, 'match_same'
    , ascending=[False, False, False, False, False, False]
).groupby(
    ['user_id', 'category3_id'], as_index=False
).agg({
    'predict': list,
    'group_id': list,
    'has_occurred': list,
    'n_occurred': list,
    'same_post': list,
    'same_zone': list,
    'same_district': list,
    'has_neighbors': list,
#     'unmatch_other': list, 
#     'daytime_match': list, 
    'log_days': list, 
#     'match_same': list
})

In [107]:
x_see['category_real'] = x_see.apply(
    lambda x: [i[1] for i in sorted(zip(x['n_occurred'], x['group_id'])) if i[0] > 0],
    axis=1
)

In [108]:
x_see = x_see[x_see['category_real'].apply(len) > 0]

In [109]:
x_see['candidates'] = x_see.apply(
    lambda x: x['group_id'][
        :min((np.argmax(
            np.array(x['predict'][:-1]) - np.array(x['predict'][1:])
        ) if len(x['predict']) > 1 else 0) + 1, 10)
    ]
    , axis=1
)

In [110]:
x_see['share_uq'] = x_see.apply(
    lambda x: max([1 if real in x['group_id'][:1] else 0 for real in x['category_real']])
    , axis=1
)
x_see['share_uq'].mean()

0.27848101265822783

In [111]:
x_see['idx'] = x_see.apply(
    lambda x: np.min([x['candidates'].index(real) if real in x['candidates'] else -1 for real in x['category_real'] ])
    , axis=1
)
x_see['mx_idx'] = x_see['candidates'].apply(len)
# x_see['share_uq'].mean()

In [None]:
# x_see[['mx']]

In [112]:
x_see['share_uq'] = x_see.apply(
    lambda x: np.max([1 if real in x['candidates'] else 0 for real in x['category_real']])
    , axis=1
)
x_see['share_uq'].mean()

0.5379746835443038

In [113]:
x_see[x_see['idx'] > 1].sort_values(by='idx')[['predict', 'candidates', 'category_real', 'same_district', 'same_zone', 'same_post', 'log_days', 'idx', 'mx_idx']]

Unnamed: 0,predict,candidates,category_real,same_district,same_zone,same_post,log_days,idx,mx_idx
8,"[0.95233122, 0.94350804, 0.93265691, 0.9168369...","[801370019, 801362301, 801351963, 801371348, 8...",[801351963],"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3.33203125, 3.33203125, 3.33203125, 3.3320312...",2,7
127,"[0.99741761, 0.99741761, 0.99600035, 0.9959826...","[801366968, 801363269, 801363258, 801366073, 8...",[801363258],"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[3.33203125, 3.33203125, 3.33203125, 3.3320312...",2,10
158,"[0.93193043, 0.90456105, 0.89783355, 0.8978335...","[801370497, 801362749, 801368280, 801361014]",[801368280],"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3.33203125, 3.33203125, 3.33203125, 3.3320312...",2,4
160,"[0.99327483, 0.94404603, 0.94091586, 0.9030108...","[801357591, 801367963, 801357532, 801368880]",[801357532],"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3.33203125, 3.134765625, 3.33203125, 3.134765...",2,4
162,"[0.997932, 0.95720787, 0.9569447, 0.67371991, ...","[801373757, 801362935, 801362933]",[801362933],"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3.33203125, 3.33203125, 3.33203125, 2.6386718...",2,3
...,...,...,...,...,...,...,...,...,...
324,"[0.94295607, 0.93866181, 0.93193043, 0.9319304...","[801363083, 801369933, 801368424, 801365953, 8...",[801367672],"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.9462890625, 3.33203125, 3.33203125, 3.33203...",8,10
196,"[0.99741761, 0.99682327, 0.99658837, 0.9960179...","[801348391, 801348201, 801348499, 801349180, 8...",[801348572],"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3.33203125, 3.33203125, 3.33203125, 3.3320312...",9,10
261,"[0.65009871, 0.64853242, 0.63446248, 0.6198462...","[801368863, 801364159, 801367390, 801364867, 8...",[801360207],"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2.708984375, 2.564453125, 2.7734375, 2.638671...",9,10
259,"[0.47935262, 0.47139179, 0.4616926, 0.45199429...","[801369204, 801373473, 801369125, 801373454, 8...",[801366233],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2.197265625, 2.564453125, 3.091796875, 2.7089...",9,10


In [None]:
x_see[x_see['idx'] == 1].sort_values(by='mx_idx')#.head(50)

In [None]:
plt.figure(figsize=(20, 10))
sns.scatterplot(x='mx_idx', y='idx', data=x_see, alpha=0.02)
# plt.xlim((-2, 11))
# plt.ylim((-2, 10))
plt.show()

In [None]:
# x_see['mx_idx'].hist(bins=100)

In [None]:
# plt.figure(figsize=(20, 10))
# sns.scatterplot(x='mx_idx', y='idx', data=x_see, alpha=0.02)
# plt.show()
x_see[x_see['idx'] < 50]['idx'].hist(bins=100)

In [None]:
group_target[group_target['group_id'].isin({801349222, 801366063})]

In [None]:
# x_see[x_see['share_uq'] > 0]

In [None]:
# x_see.info(memory_usage = "deep")