In [1]:
import requests, bs4 
import pandas as pd
import numpy as np

# Scraping Moscow subway stations by district

In [2]:
# Page that lists Moscow metro stations grouped by district.
url = "https://metro.mwmoskva.ru/stancii-spisok/po_rajonam/"

In [3]:
# Download the page. The explicit timeout keeps the cell from hanging forever
# when the server never answers (requests applies no timeout by default).
res = requests.get(url, timeout=30)
# Fail loudly on an HTTP 4xx/5xx response instead of parsing an error page.
res.raise_for_status()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parsed tree can differ by machine.
soup = bs4.BeautifulSoup(res.text, 'html.parser')

In [4]:
# Collect every <div> whose CSS class is "list-name" or "list-title".
# find_all is the current name of the deprecated findAll alias, and class_
# makes it explicit that the list is a CSS-class filter.
stations = soup.find_all("div", class_=["list-name", "list-title"])

In [5]:
# Walk the matched <div>s in order and map each station name to the district
# heading that most recently preceded it.
stations_dict = dict()
# Placeholder for any station that shows up before the first district heading.
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
district = np.nan
for name in stations:
    text = name.get_text().strip()
    # District headings contain the preposition "в" or "на" as a separate word;
    # that is how they are told apart from station names. Padding with a leading
    # space makes the check whole-word, so a station whose first word merely ends
    # in "на"/"в" (e.g. "Марьина Роща") is not mistaken for a heading.
    padded = " " + text
    if (" на " in padded) or (" в " in padded):
        district = text
    else:
        # Keyed by station name: a station listed under several districts keeps
        # only the last district seen.
        stations_dict[text] = district

In [6]:
# Build a one-column DataFrame from the mapping: station names become the
# index and the district of each station becomes the single value column.
stations_by_district = pd.DataFrame.from_dict(data=stations_dict, orient="index")

In [7]:
# Move the station-name index into an ordinary column, then label both columns.
stations_by_district = stations_by_district.reset_index()
stations_by_district.columns = ["stations", "district"]

In [8]:
# Write the station/district table to disk, leaving out the row index.
stations_by_district.to_csv(path_or_buf="stations_by_district.csv", index=False)

# Scraping Moscow regions and districts

In [9]:
# Page that lists Moscow regions and their districts.
url = "http://moskva-map.ru/rajony-moscow.htm"

In [10]:
# Download the page. The explicit timeout keeps the cell from hanging forever
# when the server never answers (requests applies no timeout by default).
res = requests.get(url, timeout=30)
# Fail loudly on an HTTP 4xx/5xx response instead of parsing an error page.
res.raise_for_status()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parsed tree can differ by machine.
soup = bs4.BeautifulSoup(res.text, 'html.parser')

In [11]:
# Walk every link on the page in order and map each district name to the
# region name that most recently preceded it.
distr_dict = dict()
# Placeholder for any long link text seen before the first region.
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
region = np.nan
# find_all is the current name of the deprecated findAll alias.
for name in soup.find_all('a'):
    text = name.get_text().strip()
    # A link with no visible text (e.g. an image-only anchor) names nothing;
    # skip it so it cannot overwrite the current region with ''.
    if not text:
        continue
    # Region names are at most 4 characters long, which is what separates
    # them from district names.
    if len(text) < 5:
        region = text
    else:
        # Keyed by district name: a repeated link text keeps the last region seen.
        distr_dict[text] = region

In [12]:
# Build a one-column DataFrame from the mapping: district names become the
# index and the region of each district becomes the single value column.
region_district = pd.DataFrame.from_dict(data=distr_dict, orient="index")

In [13]:
# Move the index into an ordinary column and label both columns.
# The index holds the dictionary keys (district names) and the single value
# column holds the region each district was assigned to, so the labels are
# 'district' and 'region' (not the 'stations'/'district' pair used for the
# stations table).
region_district.reset_index(inplace=True)
region_district.columns = ['district', 'region']

In [14]:
# Drop the last 10 rows of the table.
# NOTE(review): presumably these are trailing links that are not districts - confirm
# against the page. The count is hard-coded, and because the result is assigned
# back to the same name, re-running this cell trims a further 10 rows.
region_district = region_district.iloc[:-10]

In [15]:
# Write the region/district table to disk, leaving out the row index.
region_district.to_csv(path_or_buf="region_district.csv", index=False)