In [1]:
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm
from collections import Counter
from nltk.tokenize import wordpunct_tokenize
import re
from string import punctuation

In [4]:
# Load the raw input tables in one pass; `dct` holds the category dictionary.
attend, groups, test, users, dct = map(
    pd.read_csv,
    [
        '../data/attend.csv',
        '../data/groups.csv',
        '../data/test.csv',
        '../data/users.csv',
        '../data/dict.csv',
    ],
)

## Юзеры

In [5]:
# Reference table of streets; the `street` and `postal_code` columns are used below.
postal_codes = pd.read_csv('postal_codes.csv')

# Address-type words and abbreviations that are dropped from street names.
atypes = {'парк', 'улица', 'пл', 'ул', 'шоссе', 'км', 'пр', 'кт', 'мжд'}


def _street_key(street):
    """Return the sorted tuple of significant, lower-cased tokens of a street name."""
    kept = []
    for token in wordpunct_tokenize(street):
        # Skip single characters, tokens starting with a digit and address-type words.
        if len(token) <= 1 or token[0] in '0123456789' or token.lower() in atypes:
            continue
        kept.append(token.lower().replace('ё', 'е'))
    kept.sort()
    return tuple(kept)


postal_codes['splitted'] = postal_codes['street'].apply(_street_key)

# One row per (token key, postal code) pair; every other column keeps its first value.
grouped = postal_codes.groupby(['splitted', 'postal_code']).agg(lambda column: list(column)[0])
postal_codes = grouped.reset_index()


# Title / rank words (plus a few other tokens) that must not be used as the key word of a street.
to_skip = {
    'авиаконструктора', 'адмирала', 'академика', 'архитектора', 'братьев',
    'ветеранов', 'генерала', 'героев', 'защитников', 'космонавта', 'летчика',
    'маршала', 'воскресенско', 'москва', 'мжд',
}

# First names (genitive case) that are likewise too generic to identify a street.
names = {
    'александра', 'алексея', 'бориса', 'василия', 'дмитрия', 'ивана',
    'константина', 'льва', 'марии', 'михаила', 'наташи', 'николая', 'павла',
    'петра', 'сергея', 'степана', 'федора',
}


def _pick_head(tokens):
    """Return the first token of at least 4 characters that is not a stop word, else None."""
    for token in tokens:
        if len(token) < 4:
            continue
        if token in to_skip or token in names or token in atypes:
            continue
        return token
    return None


# `head` is the single word used later to look a street up by one address token.
postal_codes['head'] = [_pick_head(tokens) for tokens in postal_codes['splitted'].values]

# Heads that map to more than one distinct postal code are ambiguous and collected in `not_ok`.
grouped = postal_codes.groupby('head').agg(list)
distinct_code_counts = grouped['postal_code'].apply(lambda codes: len(set(codes)))
grouped = grouped[distinct_code_counts > 1]
not_ok = set(grouped.index)

In [6]:
# head word -> (postal code, token key); rows with an empty key or an ambiguous head are skipped.
# When a head occurs on several rows the last one wins.
simple_dict = {}
for code, splitted, head in postal_codes[['postal_code', 'splitted', 'head']].values:
    if not splitted or head in not_ok:
        continue
    simple_dict[head] = (code, splitted)

simple_dict['ленинский']

(117198, ('ленинский',))

In [7]:
def _match_address_tokens(tokens):
    """Return the `simple_dict` entries for every token of a tokenized address.

    Anything that is not a list, and any address with at most one token, yields [].
    """
    if type(tokens) != list or len(tokens) <= 1:
        return []
    matches = []
    for token in tokens:
        key = token.lower().strip().replace('ё', 'е')
        if key in simple_dict:
            matches.append(simple_dict[key])
    return matches


users['code'] = users['адрес проживания'].apply(wordpunct_tokenize).apply(_match_address_tokens)

In [9]:
# Show the first row to inspect the freshly added `code` column.
users.head(1)

Unnamed: 0,уникальный номер,дата создание личного дела,пол,дата рождения,адрес проживания,code
0,101391104,2019-02-26 15:52:09.000,Женщина,1959-09-10,"город москва, константинова, дом 30","[(129164, (константинова,))]"


In [10]:
# Hand-entered postal codes keyed by full lower-cased street name; `location_code`
# falls back to this table when no single word of the street is found in `simple_dict`.
manual = dict([
    ('варшавское шоссе', 115127),
    ('ярославское шоссе', 129626),
    ('ореховый бульвар', 115551),
    ('боровское шоссе', 108810),
    ('проспект мира', 127083),
    ('бескудниковский бульвар', 127247),
    ('кутузовский проспект', 121108),
    ('измайловский бульвар', 105043),
])

In [11]:
def _first_postal_code(address):
    """Return the postal code of the first address token found in `simple_dict`, else None.

    Addresses that tokenize to at most one token never match.
    """
    tokens = wordpunct_tokenize(address)
    if type(tokens) != list or len(tokens) <= 1:
        return None
    for token in tokens:
        key = token.lower().strip().replace('ё', 'е')
        if key in simple_dict:
            # simple_dict values are (postal code, token key) pairs.
            return simple_dict[key][0]
    return None


users['code'] = users['адрес проживания'].apply(_first_postal_code)

In [12]:
# Cleaned user table with English column names; a missing postal code is stored as 0.
users_new = pd.DataFrame(dict(
    user_id=users['уникальный номер'],
    created_at=pd.to_datetime(users['дата создание личного дела']),
    is_woman=users['пол'].eq('Женщина').astype(int),
    date_of_birth=pd.to_datetime(users['дата рождения']),
    postal_code=users['code'].fillna(0).astype(int),
))

In [13]:
# Persist the cleaned users without the DataFrame index.
users_new.to_csv('../data/new_users.csv', index=False)

## Посещения

In [14]:
def _hhmm_to_hours(clock):
    """Convert a string that starts with 'HH:MM' into fractional hours."""
    hours, minutes = int(clock[:2]), int(clock[3:5])
    return hours + minutes / 60


# Cleaned attendance table with English column names and times as fractional hours.
attend_new = pd.DataFrame(dict(
    event_id=attend['уникальный номер занятия'],
    group_id=attend['уникальный номер группы'],
    user_id=attend['уникальный номер участника'],
    event_date=attend['дата занятия'],
    event_start=attend['время начала занятия'].apply(_hhmm_to_hours),
    event_end=attend['время окончания занятия'].apply(_hhmm_to_hours),
    is_online=attend['онлайн/офлайн'].eq('Да').astype(int),
))

attend_new.to_csv('new_attend.csv', index=False)

## Группы

### Расписание

In [15]:
# Raw strings: `\.` must reach the regex engine instead of being an invalid string escape.
reg_dates = r'([0-9]{2}\.[0-9]{2}\.[0-9]{4})'
reg_time = r'[0-9]{2}:[0-9]{2}-[0-9]{2}:[0-9]{2}'

# Weekday abbreviation -> weekday number (Monday = 1).
DAYS = {'Пн': 1, 'Вт': 2, 'Ср': 3, 'Чт': 4, 'Пт': 5, 'Сб': 6, 'Вс': 7}


def _to_iso(date_text):
    """Convert 'DD.MM.YYYY' into 'YYYY-MM-DD'."""
    return date_text[6:] + '-' + date_text[3:5] + '-' + date_text[:2]


def _parse_period(segment):
    """Parse one schedule period into ``(start, finish, timetable)``.

    Raises ValueError unless the segment contains exactly two dates.
    """
    found = re.findall(reg_dates, segment)
    if len(found) != 2:
        raise ValueError(
            'expected exactly two dates in a schedule period, got %d: %r' % (len(found), segment)
        )
    start_raw, finish_raw = found

    # Only the text after the finish date describes weekdays and hours. The split must use
    # the date as it is written in the input, not its ISO form.
    tail = segment.split(finish_raw)[-1].strip()

    timetable = []
    days = []
    for item in tail.split():
        item = item.strip(punctuation)
        if item in DAYS:
            days.append(item)
        elif re.match(reg_time, item):
            begins = int(item[:2]) + int(item[3:5]) / 60
            ends = int(item[6:8]) + int(item[9:11]) / 60
            # A time range applies to every weekday collected since the previous range.
            timetable.extend((DAYS[day], begins, ends) for day in days)
            days = []

    return _to_iso(start_raw), _to_iso(finish_raw), timetable


def parse_date_string(x):
    """Parse a schedule string into a list of ``(start, finish, timetable)`` tuples.

    ``start`` and ``finish`` are 'YYYY-MM-DD' strings; ``timetable`` is a list of
    ``(weekday, start_hour, end_hour)`` tuples with hours as floats.

    A string with more than two dates is treated as several periods separated by ';'
    (blank pieces, e.g. after a trailing ';', are ignored).

    Raises:
        ValueError: if a period does not contain exactly two dates.
    """
    if len(re.findall(reg_dates, x)) > 2:
        return [_parse_period(part) for part in x.split(';') if part.strip()]
    return [_parse_period(x)]

In [16]:
def _parse_schedule_cell(value):
    """Parse a schedule cell; anything that is not a `str` yields an empty list."""
    if type(value) == str:
        return parse_date_string(value)
    return []


# Parsed schedules for the closed, active and planned periods of every group.
for target, source in (
    ('timetable_closed', 'расписание в закрытых периодах'),
    ('timetable_open', 'расписание в активных периодах'),
    ('timetable_plan', 'расписание в плановом периоде'),
):
    groups[target] = groups[source].apply(_parse_schedule_cell)

### Локации

In [17]:
# Words that mark an address part as a street-level element.
s_types = {
    'улица', 'шоссе', 'переулок', 'проспект', 'проезд',
    'бульвар', 'набережная', 'парк',
    'тупик', 'ул', 'аллея', 'линия', 'просек',
    # 'площадь' is currently not treated as a street type
}

# Substrings that mark an address part as a settlement / region name.
_CITY_MARKERS = ('город', 'посел', 'край', 'республика')


def split_locations(x):
    """Split a group's venue description into districts, zones and (city, street) pairs.

    Args:
        x: mapping (e.g. a DataFrame row) with the keys 'направление 3',
            'адрес площадки', 'округ площадки' and 'район площадки'.

    Returns:
        A tuple ``(status, address, district, zone, address_new)`` where status is
        -1 when 'направление 3' contains 'онлайн' (other fields returned untouched),
        -3 when the district is missing (not a string),
        2 when the district field lists several comma-separated districts,
        1 otherwise (at most one street is kept).
        For statuses 1 and 2 ``district`` and ``zone`` are lists and ``address_new``
        is a list of ``(city, street)`` tuples in lower case.
    """
    address = x['адрес площадки']
    district = x['округ площадки']
    zone = x['район площадки']

    if 'онлайн' in x['направление 3'].lower():
        return -1, address, district, zone, []
    elif not isinstance(district, str):
        # Missing values arrive as NaN floats; any non-string district is treated the same.
        return -3, address, district, zone, []

    if ',' in district:
        status = 2
        district = [part.strip() for part in district.split(',')]
        zone = [part.strip() for part in zone.split(',')]
    else:
        status = 1
        district = [district]
        zone = [zone]

    address_new = []
    city = None
    for part in address.split(','):
        raw = part.lower().strip()
        item = raw.replace('.', '')
        if city is not None and s_types.intersection(item.split()):
            # A street is only accepted right after a city has been seen.
            address_new.append((city, item.strip()))
            city = None
        elif 'моск' in item:
            city = 'москва'
        elif (
            # The "г." abbreviation has to be looked for before the dots are removed.
            raw.startswith('г.') or ' г.' in raw
            or any(marker in item for marker in _CITY_MARKERS)
        ):
            city = item.strip()

    if status == 1:
        address_new = address_new[:1]

    return status, address, district, zone, address_new

In [18]:
def _district_address_match(loc):
    """Return whether districts and streets have equal length, or -1 if either is not a list."""
    districts, streets = loc[2], loc[4]
    if type(districts) == list and type(streets) == list:
        return len(districts) == len(streets)
    return -1


location_inputs = ['направление 3', 'адрес площадки', 'округ площадки', 'район площадки']
groups['location'] = groups[location_inputs].apply(split_locations, axis=1)

# The first element of every location tuple is the status code from `split_locations`.
loc_status = groups['location'].apply(lambda loc: loc[0])

groups['is_online'] = (loc_status == -1).astype(int)
groups['is_mobile'] = (loc_status == -3).astype(int)

groups['loc_type'] = loc_status
# 1.0 = one street per district, 0.0 = counts differ, -1.0 = not applicable.
groups['match_loc'] = groups['location'].apply(_district_address_match).astype(float)

In [19]:
# Count groups per combination of location status, online/mobile flags and match result.
groups.groupby(['loc_type', 'is_online', 'is_mobile', 'match_loc']).agg({'is_online': 'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,is_online
loc_type,is_online,is_mobile,match_loc,Unnamed: 4_level_1
-3,0,1,-1.0,119
-1,1,0,-1.0,7497
1,0,0,0.0,575
1,0,0,1.0,16967
2,0,0,0.0,137
2,0,0,1.0,1708


In [20]:
def location_code(x):
    """Look up a postal code for every street of a location tuple.

    `x` is a tuple as returned by `split_locations`. Returns None for a negative
    status, [] when districts and streets are not lists of equal length, and
    otherwise one entry per street: the code of the first word found in
    `simple_dict`, else the `manual` code for the whole street, else None.
    """
    if x[0] < 0:
        return None

    districts, streets = x[2], x[4]
    if type(districts) != list or type(streets) != list or len(districts) != len(streets):
        return []

    codes = []
    for _city, street in streets:
        street = street.lower().strip().replace('ё', 'е')
        known_word = next((word for word in street.split() if word in simple_dict), None)
        if known_word is not None:
            codes.append(simple_dict[known_word][0])
        else:
            # dict.get gives None when the street is not in the manual table either.
            codes.append(manual.get(street))
    return codes

In [21]:
# Postal codes per location: None for negative statuses, otherwise a list (see `location_code`).
groups['codes'] = groups['location'].apply(location_code)

In [22]:
# Expose the district (position 2) and zone (position 3) parts of each location tuple.
for column, position in (('districts', 2), ('zones', 3)):
    groups[column] = groups['location'].apply(lambda loc, position=position: loc[position])

### Направления

In [23]:
# Preview the category dictionary (three category levels with ids and names).
dct.head()

Unnamed: 0.1,Unnamed: 0,cat1_id,cat2_id,cat3_id,cat1_name,cat2_name,cat3_name,branch
0,0,649,651,1043,Игры,Интеллектуальные игры,Брейн-ринг,Для ума
1,1,649,651,1040,Игры,Интеллектуальные игры,Викторины,Для ума
2,2,649,651,1042,Игры,Интеллектуальные игры,Иные интеллектуальные игры,Для ума
3,3,649,651,1041,Игры,Интеллектуальные игры,Квест,Для ума
4,4,649,650,516,Игры,Настольные игры,Иные настольные игры,Для ума


In [24]:
def _name_to_id(name_column, id_column):
    """Build a name -> id mapping from two columns of `dct`."""
    return dict(dct[[name_column, id_column]].values)


# One lookup table per category level.
cat1_dict = _name_to_id('cat1_name', 'cat1_id')
cat2_dict = _name_to_id('cat2_name', 'cat2_id')
cat3_dict = _name_to_id('cat3_name', 'cat3_id')

In [25]:
def remove_online(x):
    """Drop every 'ОНЛАЙН' marker from `x` and strip the result.

    Strings without the marker are returned unchanged (not stripped).
    """
    return x.replace('ОНЛАЙН', '').strip() if 'ОНЛАЙН' in x else x

In [31]:
def _category_lookup(mapping):
    """Return a function mapping a category name (without 'ОНЛАЙН') to its id, or None."""
    return lambda name: mapping.get(remove_online(name))


# Cleaned groups table; rows with any missing value are dropped by `dropna`.
# `match_loc` becomes 0 only when district and street counts differ.
# (An `is_available` column is not exported.)
groups_new = pd.DataFrame(dict(
    group_id=groups['уникальный номер'],
    address=groups['адрес площадки'],
    category1_id=groups['направление 1'].apply(_category_lookup(cat1_dict)),
    category2_id=groups['направление 2'].apply(_category_lookup(cat2_dict)),
    category3_id=groups['направление 3'].apply(_category_lookup(cat3_dict)),
    is_online=groups['is_online'],
    is_mobile=groups['is_mobile'],
    match_loc=groups['match_loc'].ne(0).astype(int),
)).dropna()

In [33]:
# Persist the cleaned groups without the DataFrame index.
groups_new.to_csv('new_groups.csv', index=False)

### Итог по локациям

In [34]:
def _count_labels(rows):
    """Count every label occurring in the list-valued entries of `rows`."""
    counts = Counter()
    for labels in rows:
        if type(labels) == list:
            counts.update(labels)
    return counts


# Ids start at 1 and follow the order in which each label is first seen.
district_cnt = _count_labels(groups['districts'].values)
district_dict = {name: number for number, name in enumerate(district_cnt, start=1)}

zone_cnt = _count_labels(groups['zones'].values)
zone_dict = {name: number for number, name in enumerate(zone_cnt, start=1)}


In [35]:
# Build one row per distinct (group, district, zone, postal code) combination of every
# offline, non-mobile group whose districts and streets line up one-to-one.
result = []
loc_id = 1
for idx in tqdm(groups.index):
    row = groups.loc[idx]
    if row['is_mobile'] == 0 and row['is_online'] == 0 and row['match_loc'] == 1:
        location = row['location']
        districts, zones = location[2], location[3]
        codes = row['codes']
        candidates = (
            (row['уникальный номер'], district_dict[districts[pos]], zone_dict[zones[pos]], codes[pos])
            for pos in range(len(districts))
        )
        # dict.fromkeys removes duplicates while keeping the original order, so the
        # generated ids do not depend on set/hash iteration order.
        for item in dict.fromkeys(candidates):
            result.append((loc_id, *item))
            loc_id += 1

  0%|          | 0/27003 [00:00<?, ?it/s]

In [36]:
# Tabulate the collected location rows; `location_group_id` is the running id assigned in the loop above.
group_locations = pd.DataFrame(result, columns=['location_group_id', 'group_id', 'district_id', 'zone_id', 'code'])

In [37]:
# Persist the group locations without the DataFrame index.
group_locations.to_csv('group_locations.csv', index=False)

### Итог по расписанию

In [38]:
result = []

# (timetable column, status code) pairs in the order their rows are emitted for each group.
period_sources = (
    ('timetable_plan', 3),
    ('timetable_closed', 1),
    ('timetable_open', 2),
)

for idx in tqdm(groups.index):
    row = groups.loc[idx]
    for column, status_code in period_sources:
        # Each period is (start date, finish date, [(weekday, start hour, end hour), ...]).
        for start, finish, slots in row[column]:
            for weekday, begins, ends in slots:
                result.append((row['уникальный номер'], start, finish, weekday, begins, ends, status_code))

  0%|          | 0/27003 [00:00<?, ?it/s]

In [39]:
timetable_columns = [
    'group_id', 'start_date', 'finish_date', 'weekday',
    'start_time', 'finish_time', 'status_code',
]
timetable = pd.DataFrame(result, columns=timetable_columns)

# Duration of every slot in hours, rounded to two decimals.
timetable['event_length'] = timetable['finish_time'].sub(timetable['start_time']).round(2)

In [40]:
# Persist the timetable without the DataFrame index.
timetable.to_csv('timetable.csv', index=False)

In [41]:
# Persist the district name -> id mapping as a two-column table.
districts_table = pd.DataFrame(list(district_dict.items()), columns=['name', 'district_id'])
districts_table.to_csv('districts.csv', index=False)

In [42]:
# Persist the zone name -> id mapping as a two-column table.
zones_table = pd.DataFrame(list(zone_dict.items()), columns=['name', 'zone_id'])
zones_table.to_csv('zones.csv', index=False)

# Запись в базу

In [43]:
import sqlite3

import pandas as pd

In [44]:
# Open the SQLite database file and a cursor for the raw SQL statements below.
db = sqlite3.connect('../raw_db.db')
cursor = db.cursor()

In [45]:
# Reload the intermediate CSV files. Missing integer ids are stored as 0 here; 0 is
# treated as the "missing" placeholder by the UPDATE statements run after loading.
districts = pd.read_csv('districts.csv')
zones = pd.read_csv('zones.csv')

group_locations = pd.read_csv('group_locations.csv')
group_locations['code'] = group_locations['code'].fillna(0).astype(int)

groups = pd.read_csv('new_groups.csv')
for column in ('category1_id', 'category2_id', 'category3_id'):
    groups[column] = groups[column].fillna(0).astype(int)

timetable = pd.read_csv('timetable.csv')

attend = pd.read_csv('new_attend.csv')
attend = attend.sort_values(by=['event_id', 'event_date'])

users = pd.read_csv('../data/new_users.csv')
# Keep exactly the 10-character 'YYYY-MM-DD' part; a longer slice would retain the
# separator before a time component. Missing dates become the string 'nan' (astype(str)).
users['date_of_birth'] = users['date_of_birth'].astype(str).str[:10]

In [46]:
# (table name, DataFrame) pairs in the order they are written; the index is not stored.
tables_to_write = [
    ('districts', districts),
    ('zones', zones),
    ('group_locations', group_locations),
    ('groups', groups),
    ('group_timetable', timetable),
    ('attend', attend),
    ('users', users),
]
for table_name, frame in tables_to_write:
    frame.to_sql(table_name, con=db, index=False)

In [47]:
# Turn the 0 placeholders back into NULL inside the database.
null_fixups = [
    "UPDATE groups SET category1_id = NULL WHERE category1_id = 0",
    "UPDATE groups SET category2_id = NULL WHERE category2_id = 0",
    "UPDATE groups SET category3_id = NULL WHERE category3_id = 0",
    "UPDATE users SET postal_code = NULL WHERE postal_code = 0",
    "UPDATE group_locations SET code = NULL WHERE code = 0",
]
for statement in null_fixups:
    cursor.execute(statement)

db.commit()

In [48]:
# Indexes on the key and lookup columns of every table, created in this order.
index_statements = [
    "CREATE UNIQUE INDEX group_id_idx ON groups (group_id)",
    "CREATE INDEX cats ON groups(category1_id, category2_id, category3_id)",
    "CREATE INDEX group_id_attend_idx ON attend (group_id)",
    "CREATE INDEX user_id_attend_idx ON attend (user_id)",
    "CREATE INDEX tt_gid ON group_timetable (group_id)",
    "CREATE INDEX start_date_idx ON group_timetable (start_date)",
    "CREATE UNIQUE INDEX user_id_idx ON users (user_id)",
    "CREATE UNIQUE INDEX zone_id_idx ON zones (zone_id)",
    "CREATE UNIQUE INDEX district_id_idx ON districts (district_id)",
    "CREATE INDEX group_id_loc ON group_locations (group_id)",
]
for statement in index_statements:
    cursor.execute(statement)

db.commit()