# Data Collection

In [8]:
import requests
import bs4
import json
import pandas as pd
import re
import datetime as dt
import numpy as np
import os

import locale
import datetime as dt

locale.setlocale(locale.LC_TIME, 'ru_RU.UTF-8') # the ru locale is installed

'ru_RU.UTF-8'

In [3]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


def requests_retry_session(
    retries=5,
    backoff_factor=1,
    status_forcelist=(500, 502, 503, 504),
    session=None,
):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def get_site_map(url, limit=200000):
    params = {
        'url': url,
        'fl': 'timestamp,original',
        'matchType': 'prefix',
        'filter': ['statuscode:200', 'mimetype:text/html'],
        'collapse': ['original', 'timestamp'],
        'limit': limit
    }
    response = requests.get('https://web.archive.org/web/timemap/json', params=params)

    return response.json()



def get_archive_version(timestamp, url):
    query_url_path = '/web/{timestamp}/{url}'.format(timestamp=timestamp, url=url)
    headers = {'path': query_url_path,
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate'}
    query_url = 'https://web.archive.org' + query_url_path
    response = requests_retry_session(retries=10).get(query_url, headers=headers)
    # move logging here
    # LOG INTERPOLATION
    return response

In [4]:
site_map = get_site_map('https://xn--80aesfpebagmfblc0a.xn--p1ai/information')

In [5]:
dfs = []

for i, (ts, url) in enumerate(site_map[1:]):
    print(i)
    if 'information' not in url:
        continue
    response = get_archive_version(ts, url)
    soup = bs4.BeautifulSoup(response.text)
    data_in = soup.find('cv-spread-overview')
    if data_in is None:
        print(ts, url)
    else:
        data = json.loads(data_in[':spread-data'])
        new_rows = []
        for row in data:
            try:
                new_rows.append(
                    dict(
                        row, **row.get('isolation', {}),
                        timestamp=ts,
                        date=bs4.BeautifulSoup(row.get('isolation', {'descr': '<p></p>'})['descr']).p.text
                    )
                )
            except TypeError:
                print(row)
        dfs.append(pd.DataFrame.from_records(new_rows))

joint_data = pd.concat(dfs)
joint_data.to_csv('map.csv')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [366]:
data_1 = []
data_2 = []

for i in range(100):
    response = requests.get('https://www.rospotrebnadzor.ru/about/info/news', params={'PAGEN_1':i})
    for link in bs4.BeautifulSoup(response.text).findAll('a'):
        link_str = link.get('href', '')
        if 'ELEMENT_ID' in link_str and 'тестировании на новую коронавирусную' in link.text:
            response = requests.get('https://www.rospotrebnadzor.ru' + link_str)
            datasets = pd.read_html(response.text, header=0, decimal=',', thousands='')
            date = bs4.BeautifulSoup(response.text).find('p', **{'class': "date"}).text
            print(date)
            datasets[0].columns = ['N', 'Cубьект', 'Тесты на 100000 нас']
            datasets[0]['date'] = date
            datasets[1].columns = ['N', 'Cубьект', 'Тесты']
            datasets[1]['date'] = date
            data_1.append(datasets[0])
            data_2.append(datasets[1])
            
pd.concat(data_1).to_csv('tests_per_100000.csv')
pd.concat(data_2).to_csv('tests.csv')

16.12.2020 г.
16.12.2020 г.
09.12.2020 г.
02.12.2020 г.
25.11.2020 г.
18.11.2020 г.
11.11.2020 г.
04.11.2020 г.
28.10.2020 г.
21.10.2020 г.
14.10.2020 г.
07.10.2020 г.
30.09.2020 г.
23.09.2020 г.
16.09.2020 г.
09.09.2020 г.
02.09.2020 г.
26.08.2020 г.
20.08.2020 г.
12.08.2020 г.
05.08.2020 г.
29.07.2020 г.
22.07.2020 г.
15.07.2020 г.
09.07.2020 г.
02.07.2020 г.
25.06.2020 г.
17.06.2020 г.
10.06.2020 г.
03.06.2020 г.
27.05.2020 г.
20.05.2020 г.
13.05.2020 г.
07.05.2020 г.
29.04.2020 г.
22.04.2020 г.


ValueError: Length mismatch: Expected axis has 6 elements, new values have 3 elements

In [367]:
pd.concat(data_1).to_csv('tests_per_100000.csv')
pd.concat(data_2).to_csv('tests.csv')

In [585]:
def covert_xlsx_industrial():
    tables = []

    for file in os.listdir('districts'):
        if '.xls' not in file:
        print(file)
        region = pd.read_excel('districts/' + file).columns.values[0]
        data = pd.read_excel('districts/' + file, header=[2,3])
        data = data.melt(id_vars=[('Unnamed: 0_level_0', 'Unnamed: 0_level_1')])
        data['region'] = re.findall('\(человек, .*\)', region)[0][10:-1]
        tables.append(data)

    pd.concat(tables).to_csv('industrial_data.csv')
    pd.concat(tables).to_excel('industrial_data.xlsx')
    
covert_xlsx_industrial()

Воронежская область.xls
Алтайский край.xls
Волгоградская область.xls
Мурманская область.xls
Ростовская область.xls
Саратовская область.xls
Республика Мордовия.xls
Забайкальский край.xls
Оренбургская область.xls
Республика Алтай.xls
Тульская область.xls
Город Москва столица Российской Федерации город федерального значения.xls
Свердловская область.xls
Приморский край.xls
Орловская область.xls
Республика Северная Осетия-Алания.xls
Камчатский край.xls
Архангельская область.xls
Пензенская область.xls
Самарская область.xls
Амурская область.xls
Вологодская область.xls
Ульяновская область.xls
Карачаево-Черкесская Республика.xls
Ивановская область.xls
Челябинская область.xls
Калужская область.xls
Нижегородская область.xls
Курганская область.xls
Хабаровский край.xls
Астраханская область.xls
Город Санкт-Петербург город федерального значения.xls
Рязанская область.xls
Еврейская автономная область.xls
Республика Коми.xls
Ямало-Ненецкий автономный округ (Тюменская область).xls
Смоленская область.xls


In [440]:
def covert_xlsx_pop():
    tables = []
    for file in os.listdir('population'):
        if '.xlsx' not in file:
            continue
        data = pd.read_excel('population/' + file, header=[5,6,7])
        data = data.drop('и женщины', level=2, axis=1)
        data = data.droplevel(level=2, axis=1)
        
        data = data.loc[:, ['Возраст (лет)', 'Все население']]
        data = data.droplevel(level=0, axis=1)
        data.columns = ['Возраст (лет)', 'мужчины', 'женщины']
        
        data['region'] = pd.read_excel('population/' + file).iloc[2,1]
        tables.append(data)
        
        
#    return tables
    
    pd.concat(tables).to_csv('population.csv')
    pd.concat(tables).to_excel('population.xlsx')

tbls = covert_xlsx_pop()

# Data merge

In [9]:
def create_weeks(dataset, field):
    dataset['week'] = pd.to_datetime(dataset.apply(
        lambda row: dt.date.fromisocalendar(2020, row[field], 1), axis=1
    ))
    return dataset

def rename_regions(column):
    column = column.str.replace(
        ' область', ''
    ).str.replace(
        'Республика ', ''
    ).str.replace(
        r'г\. ', ''
    ).str.replace(
        'автономный округ', 'АО'
    ).str.replace(
        'автономная', 'АО'
    ).str.replace(
        'Ханты-Мансийский ', 'ХМ'
    ).str.replace(
        'Кабардино-Балкарская Республика', 'Кабардино-Балкария'
    ).str.replace(
        'Чувашская Республика', 'Чувашия'
    ).str.replace(
        'Чеченская Республика', 'Чечня'
    ).str.replace(
        'Удмуртская Республика', 'Удмуртия'
    ).str.replace(
        'Карачаево-Черкесская Республика', 'Карачаево-Черкессия'
    ).str.replace(
        '-Алания', ''
    ).str.replace(' обл.', '').str.replace(
        '–', '-'
    ).str.replace(
        r'Г\. ',''
    ).str.replace(
        r'г\. ',''
    ).str.replace(
        'Город федерального значения', ''
    ).str.replace(
        'Город ', ''
    ).str.replace(
        'город федерального значения', ''
    ).str.replace(
        'город ', ''
    ).str.replace(
        r'\(.*\)',''
    ).str.replace(
        'Горный ',''
    ).str.replace('автономный округ', 'АО').str.replace(
        '-Югра', ''
    ).str.replace(
        '- Югра', ''
    ).str.replace(
        '- Югра', ''
    ).str.replace('без автономного округа', '').str.replace(
        'Чувашская -', ''
    ).str.replace(
        ' столица Российской Федерации', ''
    ).str.replace(
        ' - Кузбасс', ''
    ).str.replace(
        r'\*', ''
    ).str.replace(
         'без автономных округов', ''
    ).str.replace(
         'Oмская', 'Омская'
    ).str.replace(
         '— Алания', ''
    ).str.strip()
    return column

In [27]:
def get_region_codes():
    region_codes = pd.read_csv('region_codes.csv', sep=';')
    region_codes.region = rename_regions(region_codes.region)
    return region_codes

def get_population_data():
    pop_data = pd.read_csv('population.csv')
    pop_data = pop_data.loc[pop_data['Возраст (лет)'] == 'Итого']
    pop_data = pop_data.reset_index()
    pop_data['population'] = pop_data['мужчины'] + pop_data['женщины']
    pop_data['region'] = rename_regions(pop_data['region'])
    # structure here
    return pop_data

# двойной счет!

def get_requests_dataset(drop_industries=[], classify=True):
    main_dataset2 = pd.read_csv('unemployed_2_data.csv', sep=';')
    main_dataset2['cv_gender'] = main_dataset2['cv_gender'] == 'Мужской'
    main_dataset2.loc[main_dataset2['cv_birthday'] == 'None','cv_birthday'] = np.nan
    main_dataset2['cv_birthday_2000'] = main_dataset2['cv_birthday'].astype(float) > 2000
    main_dataset2['cv_birthday_1990'] = main_dataset2['cv_birthday'].astype(float) > 1990
    main_dataset2['cv_birthday_1980'] = main_dataset2['cv_birthday'].astype(float) > 1980
    main_dataset2['cv_birthday_1970'] = main_dataset2['cv_birthday'].astype(float) > 1970
    main_dataset2['cv_birthday_1960'] = main_dataset2['cv_birthday'].astype(float) > 1960
    main_dataset2 = main_dataset2.groupby('user_uuid').agg(
        {'week_create': 'min', 'cv_industry' : np.unique, 'state_region_code': 'min',
        'cv_gender': 'min', 'cv_birthday_2000': 'min', 'cv_birthday_1990': 'min',
         'cv_birthday_1980': 'min', 'cv_birthday_1970': 'min', 'cv_birthday_1960': 'min'}
    )
    main_dataset2['unemployed'] = 1
    main_dataset2.reset_index(inplace=True)
    grouped = main_dataset2.groupby(['week_create', 'cv_industry', 'state_region_code']).sum()
    grouped = grouped.reset_index()
    grouped = create_weeks(grouped, 'week_create')
    
    # industries
    if classify:
        classifier = pd.read_csv('industries_classifier.csv', sep=';')
        merged_dataset = pd.merge(grouped, classifier, on='cv_industry')
        merged_dataset['unemployed'] = (
            merged_dataset.set_index(['cv_industry'])['unemployed']  /
            classifier.groupby('cv_industry').count()['oced']
        ).reset_index(drop=True)
        merged_dataset = merged_dataset.loc[~merged_dataset['cv_industry'].isin(drop_industries)]
        merged_dataset = merged_dataset.groupby(
            ['oced', 'week', 'state_region_code']
        ).sum()[['unemployed']].reset_index()
    else:
        merged_dataset = grouped
        merged_dataset['oced'] = merged_dataset['cv_industry']
    
    # regions
    region_codes = get_region_codes()
    with_region = pd.merge(merged_dataset, region_codes, left_on='state_region_code', right_on='code_short')
    return with_region

def get_covid_data():
    covid_data =  pd.read_csv('covid_table.csv', sep=';')
    covid_data['region'] = rename_regions(covid_data['Регион'])
    covid_data['week'] = pd.to_datetime(covid_data['Дата'])
    covid_data = covid_data.groupby(['region', pd.Grouper(key='week', freq='W-MON')]).sum()
    return covid_data.reset_index()

def get_industrial_data():
    industrial_data =  pd.read_csv('industrial_data.csv')
    industrial_data['oced'] = industrial_data["('Unnamed: 0_level_0', 'Unnamed: 0_level_1')"]
    industrial_data['month'] = industrial_data['variable_0']
    industrial_data['region'] = rename_regions(industrial_data['region'])
    industrial_data['employed_in_industry_jan'] = industrial_data['value']
    del industrial_data['value']
    del industrial_data['variable_0']
    del industrial_data["('Unnamed: 0_level_0', 'Unnamed: 0_level_1')"]
    industrial_data = industrial_data.loc[industrial_data['month'] == 'январь'] # 'август', 'апрель', 'июль', 'июнь', 'май', 'март', 'сентябрь', 'февраль', 
    return industrial_data

def get_lockdowns():
    data = pd.read_csv('map.csv')
    data['region'] = rename_regions(data['title'])
    data = data[~data.date.isna()]
    data.date = data.date.str.replace(r'.', '')
    data = data.loc[data.date.str.contains('Данные по состоянию на')]
    data['week'] = pd.to_datetime(
        data.apply(
            lambda row:
            dt.datetime.strptime(
            re.match(
                r'.*Данные по состоянию на(.*)$', row['date']
            ).group(1).strip().replace('kz', 'ля'),
            '%d %B'
            ).replace(year = 2020),
            axis=1
        )
    ).dt.to_period('W').apply(lambda r: r.start_time)
    data = data.drop_duplicates(['week', 'region'])
    
    return data

def get_tests():
    tests = pd.read_csv('tests.csv')
    tests['region'] = rename_regions(tests['Cубьект'])
    tests['date'] = pd.to_datetime(tests['date'].str.replace('г.', '').str.strip(), format='%d.%m.%Y')
    pivot = tests.pivot_table('Тесты', 'date', 'region')
    tests_per = pd.read_csv('tests_per_100000.csv')
    tests_per['region'] = rename_regions(tests_per['Cубьект'])
    tests_per['date'] = pd.to_datetime(tests_per['date'].str.replace('г.', '').str.strip(), format='%d.%m.%Y')
    pivot_per = tests_per.pivot_table('Тесты на 100000 нас', 'date', 'region')
    population = get_population_data()
    pivot_not_per = pivot_per.multiply(population.set_index('region')['population'], axis='columns')/100000
    pivot.update(pivot_per)
    pivot = pivot.interpolate(method='time')
    pivot = pivot.reset_index()
    pivot['week'] = pivot['date'].dt.to_period('W').apply(lambda r: r.start_time)
    del pivot['date']
    pivot = pivot.groupby(['week']).min()
    pivot = pivot.diff(periods=1, axis=0)
    pivot = pivot.reset_index()
    tests = pivot.melt(value_name='tests', id_vars='week')
    tests.loc[tests.tests == 0] = np.nan
    # date is not always on date
    return tests

def get_index():
    data = pd.read_excel('Index_Rasprostranenia_koronavirusa.xlsx')
    data['region'] = rename_regions(data['Регион'])
    data['week'] = pd.to_datetime(data['Дата'])
    data = data[['region', 'Rt', 'week']]
    return data

In [25]:
def relabel_results(result):
    result = result[[
        'week',
        'unemployed',
        'employed_in_industry_jan',
        'tests',
        'population',
        'мужчины',
        'женщины',
        'Смертей за день',
        'Заражений за день',
        'Выздоровлений за день',
        'region',
        'oced',
        'level',
        'Rt',
        'cv_birthday_2000',
        'cv_birthday_1990',
        'cv_birthday_1980',
        'cv_birthday_1970',
        'cv_birthday_1960',
        'cv_gender'
    ]]

    result = result.rename({'мужчины': 'men', 'женщины': 'women', 'Смертей за день' : 'deaths',
        'Заражений за день' : 'infections',
        'Выздоровлений за день': 'healed'}, axis=1)
    return result

In [824]:
main_dataset = get_requests_dataset(drop_industries=['Работы, не требующие квалификации'])
covid_data =  get_covid_data()
industrial_data = get_industrial_data()
tests_data = get_tests()
population_data = get_population_data()
lockdowns = get_lockdowns()

industrial_data_merged = pd.merge(main_dataset, industrial_data, on=['oced', 'region'], how='left')
tests_merged = pd.merge(industrial_data_merged, tests_data, on=['week', 'region'], how='left')
pop_merged = pd.merge(tests_merged, population_data, on=['region'], how='left')
result = pd.merge(pop_merged, covid_data, on=['week', 'region'], how='left')
result = pd.merge(result, lockdowns, on=['week', 'region'], how='left')
result = result.groupby(['region', 'oced']).apply(lambda x: x.sort_values('week', ascending=True)).reset_index(drop=True)

save = result[['region', 'oced']]
result = result.groupby(['region', 'oced']).fillna(method='ffill')
result[['region', 'oced']] = save

result = relabel_results(result)
result.to_csv('final_no_misc.csv')
result.to_excel('final_no_misc.xlsx')
result

  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,week,unemployed,employed_in_industry_jan,tests,population,men,women,deaths,infections,healed,region,oced,level
0,2020-04-06,1.0,1390.0,,463088.0,217160.0,245928.0,0.0,42.0,44.0,Адыгея,"ВОДОСНАБЖЕНИЕ; ВОДООТВЕДЕНИЕ, ОРГАНИЗАЦИЯ СБОР...",
1,2020-04-13,11.5,1390.0,,463088.0,217160.0,245928.0,2.0,278.0,233.0,Адыгея,"ВОДОСНАБЖЕНИЕ; ВОДООТВЕДЕНИЕ, ОРГАНИЗАЦИЯ СБОР...",
2,2020-04-20,8.5,1390.0,,463088.0,217160.0,245928.0,2.0,80.0,11.0,Адыгея,"ВОДОСНАБЖЕНИЕ; ВОДООТВЕДЕНИЕ, ОРГАНИЗАЦИЯ СБОР...",
3,2020-04-27,9.0,1390.0,,463088.0,217160.0,245928.0,1.0,42.0,16.0,Адыгея,"ВОДОСНАБЖЕНИЕ; ВОДООТВЕДЕНИЕ, ОРГАНИЗАЦИЯ СБОР...",
4,2020-05-04,6.0,1390.0,,463088.0,217160.0,245928.0,0.0,21.0,16.0,Адыгея,"ВОДОСНАБЖЕНИЕ; ВОДООТВЕДЕНИЕ, ОРГАНИЗАЦИЯ СБОР...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...
44116,2020-09-28,25.0,31360.0,,1253389.0,563504.0,689885.0,1.0,345.0,311.0,Ярославская,ТРАНСПОРТИРОВКА И ХРАНЕНИЕ,3.0
44117,2020-10-05,22.0,31360.0,,1253389.0,563504.0,689885.0,0.0,167.0,81.0,Ярославская,ТРАНСПОРТИРОВКА И ХРАНЕНИЕ,3.0
44118,2020-10-12,20.0,31360.0,,1253389.0,563504.0,689885.0,0.0,611.0,809.0,Ярославская,ТРАНСПОРТИРОВКА И ХРАНЕНИЕ,3.0
44119,2020-10-19,38.0,31360.0,,1253389.0,563504.0,689885.0,2.0,677.0,440.0,Ярославская,ТРАНСПОРТИРОВКА И ХРАНЕНИЕ,3.0


In [29]:
main_dataset = get_requests_dataset(classify=False)
covid_data =  get_covid_data()
industrial_data = get_industrial_data()
tests_data = get_tests()
population_data = get_population_data()
lockdowns = get_lockdowns()
index = get_index()

industrial_data_merged = pd.merge(main_dataset, industrial_data, on=['oced', 'region'], how='left')
tests_merged = pd.merge(industrial_data_merged, tests_data, on=['week', 'region'], how='left')
pop_merged = pd.merge(tests_merged, population_data, on=['region'], how='left')
result = pd.merge(pop_merged, covid_data, on=['week', 'region'], how='left')
result = pd.merge(result, lockdowns, on=['week', 'region'], how='left')
result = pd.merge(result, index, on=['week', 'region'], how='left')
result = result.groupby(['region', 'oced']).apply(lambda x: x.sort_values('week', ascending=True)).reset_index(drop=True)

save = result[['region', 'oced']]
result = result.groupby(['region', 'oced']).fillna(method='ffill')
result[['region', 'oced']] = save

result = relabel_results(result)
result.to_csv('final_other_class.csv')
result.to_excel('final_other_class.xlsx')
result

  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,week,unemployed,employed_in_industry_jan,tests,population,men,women,deaths,infections,healed,region,oced,level,Rt,cv_birthday_2000,cv_birthday_1990,cv_birthday_1980,cv_birthday_1970,cv_birthday_1960,cv_gender
0,2020-04-06,87,,,463088.0,217160.0,245928.0,0.0,42.0,44.0,Адыгея,,,0.873684,0,0,0,0,0,0
1,2020-04-13,301,,,463088.0,217160.0,245928.0,2.0,278.0,233.0,Адыгея,,,1.166667,0,0,0,0,0,0
2,2020-04-20,271,,,463088.0,217160.0,245928.0,2.0,80.0,11.0,Адыгея,,,2.478261,0,0,0,0,0,0
3,2020-04-27,419,,,463088.0,217160.0,245928.0,1.0,42.0,16.0,Адыгея,,,1.217391,0,0,0,0,0,0
4,2020-05-04,202,,,463088.0,217160.0,245928.0,0.0,21.0,16.0,Адыгея,,,1.000000,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76214,2020-09-28,5,,,1253389.0,563504.0,689885.0,1.0,345.0,311.0,Ярославская,Юриспруденция,3.0,1.046632,0,0,4,5,5,1
76215,2020-10-05,3,,,1253389.0,563504.0,689885.0,0.0,167.0,81.0,Ярославская,Юриспруденция,3.0,0.951220,0,1,1,3,3,2
76216,2020-10-12,7,,,1253389.0,563504.0,689885.0,0.0,611.0,809.0,Ярославская,Юриспруденция,3.0,1.004000,0,3,5,7,7,2
76217,2020-10-19,8,,,1253389.0,563504.0,689885.0,2.0,677.0,440.0,Ярославская,Юриспруденция,3.0,1.203488,0,2,5,7,7,1


In [28]:
main_dataset = get_requests_dataset()
covid_data =  get_covid_data()
industrial_data = get_industrial_data()
tests_data = get_tests()
population_data = get_population_data()
lockdowns = get_lockdowns()
index = get_index()

industrial_data_merged = pd.merge(main_dataset, industrial_data, on=['oced', 'region'], how='left')
tests_merged = pd.merge(industrial_data_merged, tests_data, on=['week', 'region'], how='left')
pop_merged = pd.merge(tests_merged, population_data, on=['region'], how='left')
result = pd.merge(pop_merged, covid_data, on=['week', 'region'], how='left')
result = pd.merge(result, lockdowns, on=['week', 'region'], how='left')
result = pd.merge(result, index, on=['week', 'region'], how='left')
result = result.groupby(['region', 'oced']).apply(lambda x: x.sort_values('week', ascending=True)).reset_index(drop=True)

save = result[['region', 'oced']]
result = result.groupby(['region', 'oced']).fillna(method='ffill')
result[['region', 'oced']] = save

result = relabel_results(result)
result.to_csv('final.csv')
result.to_excel('final.xlsx')
result

KeyboardInterrupt: 

In [22]:
result[result['employed_in_industry_jan'].isna(),]

NameError: name 'result' is not defined

Unnamed: 0,week,unemployed,employed_in_industry_jan,tests,population,men,women,deaths,infections,healed,region,oced,level,Rt
0,2020-04-06,4.333333,1390.0,,463088.0,217160.0,245928.0,0.0,42.0,44.0,Адыгея,"ВОДОСНАБЖЕНИЕ; ВОДООТВЕДЕНИЕ, ОРГАНИЗАЦИЯ СБОР...",,0.873684
1,2020-04-13,21.000000,1390.0,,463088.0,217160.0,245928.0,2.0,278.0,233.0,Адыгея,"ВОДОСНАБЖЕНИЕ; ВОДООТВЕДЕНИЕ, ОРГАНИЗАЦИЯ СБОР...",,1.166667
2,2020-04-20,16.333333,1390.0,,463088.0,217160.0,245928.0,2.0,80.0,11.0,Адыгея,"ВОДОСНАБЖЕНИЕ; ВОДООТВЕДЕНИЕ, ОРГАНИЗАЦИЯ СБОР...",,2.478261
3,2020-04-27,14.166667,1390.0,,463088.0,217160.0,245928.0,1.0,42.0,16.0,Адыгея,"ВОДОСНАБЖЕНИЕ; ВОДООТВЕДЕНИЕ, ОРГАНИЗАЦИЯ СБОР...",,1.217391
4,2020-05-04,9.333333,1390.0,,463088.0,217160.0,245928.0,0.0,21.0,16.0,Адыгея,"ВОДОСНАБЖЕНИЕ; ВОДООТВЕДЕНИЕ, ОРГАНИЗАЦИЯ СБОР...",,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44891,2020-09-28,25.000000,31360.0,,1253389.0,563504.0,689885.0,1.0,345.0,311.0,Ярославская,ТРАНСПОРТИРОВКА И ХРАНЕНИЕ,3.0,1.046632
44892,2020-10-05,22.000000,31360.0,,1253389.0,563504.0,689885.0,0.0,167.0,81.0,Ярославская,ТРАНСПОРТИРОВКА И ХРАНЕНИЕ,3.0,0.951220
44893,2020-10-12,20.000000,31360.0,,1253389.0,563504.0,689885.0,0.0,611.0,809.0,Ярославская,ТРАНСПОРТИРОВКА И ХРАНЕНИЕ,3.0,1.004000
44894,2020-10-19,38.000000,31360.0,,1253389.0,563504.0,689885.0,2.0,677.0,440.0,Ярославская,ТРАНСПОРТИРОВКА И ХРАНЕНИЕ,3.0,1.203488


KeyError: "['week'] not in index"