In [2]:
import requests
from tqdm.auto import tqdm
import pandas as pd
import csv
import re


#### __Файлы__:
* файлы с id городов, распределенных по странам ___(e. g. Russia.txt)___, где строка - _id,title_ (список с названиями стран СНГ - _cis_countries_);
* файл с маршрутами ___(all_routes.csv)___ со столбцами _type_, _route_, _num_ (разделитель — табуляция);
* файл с уникальными городами ___(cities_ids.txt)___, где строка - _id,title_.


Открываем файл со всеми путями, распределенными по полу и типу перемещения, создаем соответствующие датафреймы:

In [2]:
# Load every route once; the file is tab-separated.
df_ = pd.read_csv('all_routes.csv', sep='\t')

# One sub-frame per movement type, selected with a boolean mask on the 'type' column.
df_work, df_school, df_stou, df_university, df_full = (
    df_[df_['type'] == route_type]
    for route_type in ('work', 'school', 'school_to_university', 'university', 'full')
)


Разделяем данные на сами маршруты и их количество:

In [3]:
# The frames are visited in the same order for both columns, so the route list
# and the num list of one movement type stay index-aligned.
_frames_by_type = (df_work, df_school, df_stou, df_university, df_full)

(
    routes_work_route,
    routes_school_route,
    routes_stou_route,
    routes_university_route,
    routes_full_route,
) = (list(frame['route']) for frame in _frames_by_type)

(
    routes_work_num,
    routes_school_num,
    routes_stou_num,
    routes_university_num,
    routes_full_num,
) = (list(frame['num']) for frame in _frames_by_type)


Открываем списки городов каждой страны и заменяем каждый город в каждом маршруте на страну, которой он принадлежит:

In [4]:
def make_countries_routes(filename):
    """Read a country's city file and return the city ids it lists.

    Every non-blank line is split on ',' and only the part before the first
    comma is kept (the notebook describes the lines as ``id,title``).

    Blank lines are skipped, and the last line is kept whether or not the file
    ends with a newline. The previous implementation unconditionally dropped
    the last element after splitting on a newline, which lost a real city
    whenever the file had no trailing newline.

    Args:
        filename: Path to a UTF-8 text file with one record per line.

    Returns:
        list[str]: The ids as strings, in file order.
    """
    cities = []
    with open(filename, 'r', encoding='utf-8') as country_file:
        for line in country_file:
            record = line.rstrip('\n')
            if not record.strip():
                # Blank (or whitespace-only) line: no id to extract.
                continue
            cities.append(record.split(',')[0])
    return cities


In [5]:
# One list of city ids per country file; targets and file names are listed in
# the same order, so each variable receives the ids of the matching file.
(
    russia,
    armenia, azerbaijan, belarus, kazakhstan, kyrgyzstan, moldova,
    tajikistan, turkmenistan, ukraine, uzbekistan, georgia,
    latvia, lithuania, estonia,
) = map(make_countries_routes, (
    'Russia.txt',
    'Armenia.txt', 'Azerbaijan.txt', 'Belarus.txt', 'Kazakhstan.txt', 'Kyrgyzstan.txt', 'Moldova.txt',
    'Tajikistan.txt', 'Turkmenistan.txt', 'Ukraine.txt', 'Uzbekistan.txt', 'Georgia.txt',
    'Latvia.txt', 'Lithuania.txt', 'Estonia.txt',
))


In [7]:
# City lists for leningrad / krasnodar / moscow, which a later cell places ahead
# of 'russia' in the replacement order. Unlike make_countries_routes, each line
# is kept whole (no split on ',') and nothing is trimmed, so a file that ends
# with '\n' contributes a final '' entry to its list.
# NOTE(review): presumably these files hold one bare city id per line — confirm.
with open('saint_p.txt', 'r', encoding='utf-8') as leningrad:
    leningrad_cities = leningrad.read().split('\n')
    
with open('krasnodar.txt', 'r', encoding='utf-8') as krasnodar:
    krasnodar_cities = krasnodar.read().split('\n')
    
with open('moscow.txt', 'r', encoding='utf-8') as moscow:
    moscow_cities = moscow.read().split('\n')
    

In [8]:
def get_route_by_countries(city_routes, countries, country_names):
    """Replace the city ids of every route with the name of the region listing them.

    A route is a string of items separated by ``', '``. Each item found in one
    of ``countries`` is replaced by the corresponding entry of
    ``country_names``; when an item appears in several lists, the list that
    comes first in ``countries`` wins. After that, every remaining run of
    digits (with an optional leading '-') is replaced by ``'world'``.

    The routes are processed in a single pass with a dictionary lookup instead
    of re-scanning every route once per region with list membership tests.

    Args:
        city_routes: List of route strings. It is modified in place.
        countries: List of city-id lists, one per region, in priority order.
        country_names: Region names, index-aligned with ``countries``.

    Returns:
        The same ``city_routes`` list object, now holding the converted routes.
    """
    # Map each city id to the first region that lists it; setdefault keeps the
    # earliest region, which reproduces the priority order of `countries`.
    region_of = {}
    for position, region_cities in enumerate(countries):
        region_name = country_names[position]
        for city in region_cities:
            region_of.setdefault(city, region_name)

    # Assign by index so callers holding a reference to this list see the result.
    for idx in tqdm(range(len(city_routes))):
        items = city_routes[idx].split(', ')
        converted = ', '.join(region_of.get(item, item) for item in items)
        city_routes[idx] = re.sub('-?[0-9]+', 'world', converted)
    return city_routes


In [14]:
# Region name and its city list are written side by side and then split into
# the two index-aligned lists. Order matters: the three city-level regions come
# before 'russia'.
cntrs_names, cntrs = map(list, zip(
    ('leningrad', leningrad_cities),
    ('krasnodar', krasnodar_cities),
    ('moscow', moscow_cities),
    ('russia', russia),
    ('armenia', armenia),
    ('azerbaijan', azerbaijan),
    ('belarus', belarus),
    ('georgia', georgia),
    ('kazakhstan', kazakhstan),
    ('kyrgyzstan', kyrgyzstan),
    ('moldova', moldova),
    ('tajikistan', tajikistan),
    ('turkmenistan', turkmenistan),
    ('ukraine', ukraine),
    ('uzbekistan', uzbekistan),
    ('latvia', latvia),
    ('lithuania', lithuania),
    ('estonia', estonia),
))


In [15]:
# Holds the routes_*_route list objects themselves (no copies are made).
all_routes = [
    routes_work_route,
    routes_school_route,
    routes_stou_route,
    routes_university_route,
    routes_full_route,
]


In [17]:
# get_route_by_countries rewrites the list it is given and returns it, so the
# routes_*_route lists referenced from all_routes are updated as well.
for i, type_routes in enumerate(all_routes):
    all_routes[i] = get_route_by_countries(type_routes, cntrs, cntrs_names)
    

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




Записываем данные в csv-файл:

In [None]:
# Write the file from scratch on every run. With mode 'a' a re-run appended a
# second header row and a duplicate of every data row to the existing file.
# newline='' is what the csv module asks for so that it controls line endings
# itself (otherwise extra blank rows can appear on Windows).
with open('country_routes.csv', 'w', encoding='utf-8', newline='') as csv_file:
    file_writer = csv.writer(csv_file, delimiter="\t")
    file_writer.writerow(["type", "route", 'num'])

    # (type label, converted routes, counts) in the order the rows are written.
    routes_by_type = (
        ('work', routes_work_route, routes_work_num),
        ('school', routes_school_route, routes_school_num),
        ('school_to_university', routes_stou_route, routes_stou_num),
        ('university', routes_university_route, routes_university_num),
        ('full', routes_full_route, routes_full_num),
    )
    for route_type, type_routes, type_nums in routes_by_type:
        for route, num in zip(type_routes, type_nums):
            file_writer.writerow([route_type, route, num])


Открываем файл с перемещениями по странам и делаем словари вида 'путь: количество', соединяя повторяющиеся пути:

In [3]:
# Reload the country-level routes written earlier; the names df_ / df_<type>
# are rebound to the new data.
df_ = pd.read_csv('country_routes.csv', sep='\t')

# One sub-frame per movement type, selected with a boolean mask on the 'type' column.
df_work, df_school, df_stou, df_university, df_full = (
    df_[df_['type'] == route_type]
    for route_type in ('work', 'school', 'school_to_university', 'university', 'full')
)


In [4]:
def make_dict(routes, nums):
    """Build a ``route -> total`` dictionary, merging repeated routes.

    Args:
        routes: Sequence of hashable route keys.
        nums: Sequence of values, index-aligned with ``routes``.

    Returns:
        dict: For every distinct route, its values combined with ``+`` in
        input order.

    Raises:
        IndexError: If ``nums`` is shorter than ``routes``.
        TypeError: If two values of the same route cannot be added. The
            previous bare ``except`` swallowed this and silently replaced the
            running total with the latest value.
    """
    routes_nums = {}
    for i, route in enumerate(routes):
        # Explicit membership test instead of try / bare except, so that only
        # the "first occurrence" case takes the assignment branch.
        if route in routes_nums:
            routes_nums[route] += nums[i]
        else:
            routes_nums[route] = nums[i]
    return routes_nums

In [5]:
# route -> total count, one dictionary per movement type.
rw, rs, rsu, ru, rf = (
    make_dict(list(frame['route']), list(frame['num']))
    for frame in (df_work, df_school, df_stou, df_university, df_full)
)


In [6]:
def delete_double_countries(routes):
    """Collapse consecutive repeats inside every route and re-aggregate the values.

    A route key is a string of items separated by ``', '``. Runs of the same
    item are reduced to a single occurrence (``'a, a, b, a'`` becomes
    ``'a, b, a'``); routes that become identical have their values added.

    Args:
        routes: Mapping of route string to value.

    Returns:
        dict: Collapsed route string to combined value.

    Raises:
        TypeError: If the values of two merged routes cannot be added. The
            previous bare ``except`` swallowed this and silently kept only the
            latest value.
    """
    new_routes = {}
    for route, count in routes.items():
        collapsed = []
        for item in route.split(', '):
            # Keep an item only when it differs from the one just before it.
            if not collapsed or item != collapsed[-1]:
                collapsed.append(item)
        new_key = ', '.join(collapsed)
        if new_key in new_routes:
            new_routes[new_key] += count
        else:
            new_routes[new_key] = count
    return new_routes
    

In [7]:
# Labels used for the Russian regions in the converted routes.
cts = [
    'moscow',
    'leningrad',
    'krasnodar',
    'russia',
]

In [11]:
# NOTE(review): this rebinds cntrs_names, which an earlier cell defines with 18
# entries and passes to get_route_by_countries together with 18 city lists.
# Re-running that earlier conversion cell after this one would pair those
# lists with only these 6 names — consider a different variable name here.
cntrs_names = [
    'armenia',
    'azerbaijan',
    'belarus',
    'georgia',
    'kazakhstan',
    'kyrgyzstan',
]


Удаляем идущие подряд одинаковые страны в каждом маршруте:

In [10]:
# Same collapsing step for every movement type, in the order work / school /
# school_to_university / university / full.
unique_rw, unique_rs, unique_rsu, unique_ru, unique_rf = map(
    delete_double_countries,
    (rw, rs, rsu, ru, rf),
)


In [27]:
def sort_by_values(dict1):
    """Return a new dict with the entries of ``dict1`` ordered by value, largest first.

    Entries with equal values keep the relative order they have in ``dict1``.
    """
    ranked = sorted(dict1.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

Достаем первую и последнюю страну каждого маршрута и суммируем количество людей по ним, чтобы потом узнать процентное соотношение людей из разных стран для корректного анализа:

In [37]:
def get_first_country(country_routes_type):
    """Sum the route values by the first item of each route.

    Args:
        country_routes_type: Mapping of route string (items separated by
            ``', '``) to value.

    Returns:
        dict: First item of the route to the combined value of all routes
        that start with it.

    Raises:
        TypeError: If the values of two routes with the same first item cannot
            be added. The previous bare ``except`` swallowed this and silently
            kept only the latest value.
    """
    first_countries = {}
    for route, count in country_routes_type.items():
        start = route.split(', ')[0]
        if start in first_countries:
            first_countries[start] += count
        else:
            first_countries[start] = count
    return first_countries


In [38]:
def get_last_country(country_routes_type):
    """Sum the route values by the last item of each route.

    Args:
        country_routes_type: Mapping of route string (items separated by
            ``', '``) to value.

    Returns:
        dict: Last item of the route to the combined value of all routes that
        end with it.

    Raises:
        TypeError: If the values of two routes with the same last item cannot
            be added. The previous bare ``except`` swallowed this and silently
            kept only the latest value.
    """
    # Named for what it holds: the original reused the name `first_countries`.
    last_countries = {}
    for route, count in country_routes_type.items():
        end = route.split(', ')[-1]
        if end in last_countries:
            last_countries[end] += count
        else:
            last_countries[end] = count
    return last_countries


In [39]:
# Totals by starting item, one dictionary per movement type.
first_rw, first_rs, first_rsu, first_ru, first_rf = map(
    get_first_country,
    (unique_rw, unique_rs, unique_rsu, unique_ru, unique_rf),
)


In [40]:
# Totals by final item, one dictionary per movement type.
last_rw, last_rs, last_rsu, last_ru, last_rf = map(
    get_last_country,
    (unique_rw, unique_rs, unique_rsu, unique_ru, unique_rf),
)


Проделываем то же самое, что и ранее, но обобщаем все страны СНГ:

In [46]:
# Route labels that get merged into a single 'cis' label.
cis_names = [
    'armenia',
    'azerbaijan',
    'belarus',
    'georgia',
    'kazakhstan',
    'kyrgyzstan',
    'moldova',
    'tajikistan',
    'turkmenistan',
    'ukraine',
    'uzbekistan',
]

# Route labels that get merged into a single 'baltics' label.
baltics_names = [
    'latvia',
    'lithuania',
    'estonia',
]

# Route labels that get merged into 'russia'.
rus_names = [
    'leningrad',
    'krasnodar',
    'moscow',
]


In [47]:
def make_cis_world_russia(cis_names, baltics_names, unique_srt, russia_names=None):
    """Merge route items into 'cis' / 'baltics' / 'russia' groups and re-aggregate.

    Every item of a route (items are separated by ``', '``) is relabelled:
    items in ``cis_names`` become ``'cis'``, items in ``baltics_names`` become
    ``'baltics'``, items in ``russia_names`` become ``'russia'``, and anything
    else is kept as is. The lists are checked in that order. Consecutive
    repeats in the relabelled route are then collapsed, and routes that become
    identical have their values added.

    Args:
        cis_names: Labels to merge into ``'cis'``.
        baltics_names: Labels to merge into ``'baltics'``.
        unique_srt: Mapping of route string to value.
        russia_names: Labels to merge into ``'russia'``. When omitted, the
            module-level ``rus_names`` list is used, which is what the
            function always read before this parameter existed.

    Returns:
        dict: Relabelled, collapsed route string to combined value.

    Raises:
        TypeError: If the values of two merged routes cannot be added. The
            previous bare ``except`` swallowed this and silently kept only the
            latest value.
    """
    if russia_names is None:
        # Backward-compatible default for callers that pass three arguments.
        russia_names = rus_names

    new_routes = {}
    for route, count in unique_srt.items():
        collapsed = []
        for item in route.split(', '):
            if item in cis_names:
                label = 'cis'
            elif item in baltics_names:
                label = 'baltics'
            elif item in russia_names:
                label = 'russia'
            else:
                label = item
            # Skip a label that repeats the one just before it.
            if not collapsed or label != collapsed[-1]:
                collapsed.append(label)
        new_key = ', '.join(collapsed)
        if new_key in new_routes:
            new_routes[new_key] += count
        else:
            new_routes[new_key] = count
    return new_routes


In [48]:
# Same grouping step for every movement type, in the order work / school /
# school_to_university / university / full.
rucisbalt_rw, rucisbalt_rs, rucisbalt_rsu, rucisbalt_ru, rucisbalt_rf = (
    make_cis_world_russia(cis_names, baltics_names, unique_routes)
    for unique_routes in (unique_rw, unique_rs, unique_rsu, unique_ru, unique_rf)
)


Снова считаем количество людей по первому и последнему месту проживания в маршруте:

In [54]:
# Totals by starting item of the grouped routes, one dictionary per movement type.
firstc_rw, firstc_rs, firstc_rsu, firstc_ru, firstc_rf = map(
    get_first_country,
    (rucisbalt_rw, rucisbalt_rs, rucisbalt_rsu, rucisbalt_ru, rucisbalt_rf),
)


In [55]:
# Totals by final item of the grouped routes, one dictionary per movement type.
lastc_rw, lastc_rs, lastc_rsu, lastc_ru, lastc_rf = map(
    get_last_country,
    (rucisbalt_rw, rucisbalt_rs, rucisbalt_rsu, rucisbalt_ru, rucisbalt_rf),
)
