<a href="https://colab.research.google.com/github/ronaldexim/public_transport/blob/master/public_transport_warsaw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install PyDrive

In [0]:
pip install --upgrade tables                                         # needed to load the contents of HDF5 files into Colab

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate the Colab user, then build a PyDrive client that uses the
# application-default Google credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)  # used further down to fetch the prepared data files by Drive id

In [0]:
import urllib.request, json, csv, h5py, geopy.distance
import pandas as pd
import numpy as np
import tables as tb
from tqdm import tqdm, tqdm_pandas
apikey = ''                                                 # API key; obtain one at https://api.um.warszawa.pl/

# Przygotowanie danych - nie trzeba uruchamiać

Przystanki

In [0]:
# Download the list of stops from the Warsaw open-data API and load it into
# `stops_df` (one row per record returned by the API).
api = 'https://api.um.warszawa.pl/api/action/dbstore_get?id=ab75c33d-3a26-4342-b36a-6e5fef0a3ac3&apikey=' + apikey
with urllib.request.urlopen(api) as url:
    data = json.loads(url.read().decode())['result']

data_list = []
# Only a list of records is a successful answer. The timetable helpers in this
# notebook treat a string result as an API error; checking only `!= False`
# would let such a string through and crash below with a TypeError.
if isinstance(data, list):
    # Each record looks like {'values': [{'key': ..., 'value': ...}, ...]};
    # flatten it into a plain {key: value} dict.
    data_list = [{col['key']: col['value'] for col in one['values']} for one in data]
else:
    print(type(data))
    print('API error')
stops_columns = ['zespol', 'nazwa_zespolu', 'slupek', 'id_ulicy', 'kierunek', 'dlug_geo', 'szer_geo', 'obowiazuje_od']
stops_df = pd.DataFrame(data_list, columns=stops_columns)
stops_df.head()

Szuka linii dla przystanków

In [0]:
def take_line_str(stop_id, stop_nr):
    """Fetch the lines for one stop and return them as a '|'-prefixed string.

    Queries the timetable API with busstopId=`stop_id` and busstopNr=`stop_nr`
    (both strings) and concatenates every returned value, each preceded by
    '|', e.g. '|146|N21'.

    Returns '' (after printing a diagnostic line) when the request fails or
    the API answers with a string instead of a list of records.
    """
    api = 'https://api.um.warszawa.pl/api/action/dbtimetable_get?id=88cd555f-6f31-43ca-9de4-66c479ad5942' + \
        '&busstopId=' + stop_id + '&busstopNr=' + stop_nr + '&apikey=' + apikey
    try:
        with urllib.request.urlopen(api) as url:
            data = json.loads(url.read().decode())['result']
        if isinstance(data, str):
            # The API reports problems as a plain string result.
            raise Exception('API error')
        return ''.join('|' + col['value'] for one in data for col in one['values'])
    except Exception:
        # Best-effort download: log and carry on with the next stop. Catching
        # Exception (not a bare except) keeps KeyboardInterrupt/SystemExit
        # working, so the long download loop can still be interrupted.
        print('error', stop_id, stop_nr)
        return ''

In [0]:
# Cache the lines of every stop in lines.csv: one row per stops_df record with
# the columns zespol, slupek and the '|'-prefixed line string.
# newline='' is what the csv module requires for files it writes; without it,
# platforms that translate line endings produce an extra blank row after each record.
with open('lines.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',')
    for index, row in tqdm(stops_df.iterrows(), total=stops_df.shape[0]):
        result = [ row['zespol'], row['slupek'], take_line_str(row['zespol'], row['slupek']) ]
        csv_writer.writerow(result)

In [0]:
# Rebuild the mapping '<zespol>_<slupek>' -> line string from the cached lines.csv.
id_lines_dict = {}
# newline='' is the csv module's documented way to open files for csv.reader.
with open('lines.csv', mode='r', newline='') as csv_file:
    for row in csv.reader(csv_file):
        if not row:  # skip blank rows
            continue
        id_stop = row[0] + '_' + row[1]
        # The same stop may be listed more than once; all of its rows must
        # carry the same line string, otherwise the cache is inconsistent.
        if id_lines_dict.setdefault(id_stop, row[2]) != row[2]:
            raise Exception('two or more different values for the key')

# '|146|N21' -> {'146', 'N21'}; [1:] drops the empty piece before the leading '|'.
id_lines_dict = {key: set(value.split('|')[1:]) for key, value in id_lines_dict.items()}

# Vectorized string concatenation instead of a row-wise apply.
stops_df['id_nu'] = stops_df['zespol'] + '_' + stops_df['slupek']
# Explicit lookup so that a stop missing from the cache still raises KeyError.
stops_df['line'] = stops_df['id_nu'].map(lambda id_nu: id_lines_dict[id_nu])
stops_df.head()

Feature engineering

In [0]:
# Number of records sharing the same id_nu (i.e. how many changes the stop had).
stops_df['changed'] = stops_df.groupby(['id_nu'])['id_nu'].transform('count')
# Stop with no lines assigned.
stops_df['no_line'] = stops_df['line'].map(len) == 0
# Longitude and latitude swapped in the source data: flagged when the value in
# the latitude column is below 40 (presumably no valid latitude in this data
# set is that small - confirm). The comparison is numeric; comparing the raw
# strings with '<' is lexicographic and only works by accident for two-digit
# values. Non-numeric entries such as 'null' become NaN and are not flagged.
stops_df['error'] = pd.to_numeric(stops_df['szer_geo'], errors='coerce') < 40

In [0]:
# For rows flagged in 'error', exchange the contents of the longitude and
# latitude columns; all other rows are left untouched.
# NOTE: 'error' is not recomputed here, so running this cell twice swaps back.
swapped_rows = stops_df['error']
stops_df.loc[swapped_rows, ['dlug_geo', 'szer_geo']] = \
    stops_df.loc[swapped_rows, ['szer_geo', 'dlug_geo']].values


In [0]:
stops_df['obowiazuje_od'] = pd.to_datetime(stops_df['obowiazuje_od'])
# Date of the most recent record for each stop.
stops_df['last_change'] = stops_df.groupby(['id_nu'])['obowiazuje_od'].transform('max')
# Current record of the stop: the one dated at the last change.
# Column-wise comparison instead of a row-by-row apply.
stops_df['actual'] = stops_df['obowiazuje_od'] == stops_df['last_change']
# Stop removed: its most recent record carries no location ('null' latitude).
stops_df['deleted'] = stops_df['actual'] & (stops_df['szer_geo'] == 'null')

In [0]:
# Numeric copies of the coordinate columns; values that cannot be parsed
# (errors='coerce') become NaN.
for numeric_col, text_col in (('dlug_f', 'dlug_geo'), ('szer_f', 'szer_geo')):
    stops_df[numeric_col] = pd.to_numeric(stops_df[text_col], errors='coerce')


def _as_coord(point):
    """Return the (szer_f, dlug_f) pair of one row as a tuple."""
    return (point['szer_f'], point['dlug_f'])


stops_df['coord'] = stops_df[ ['dlug_f', 'szer_f'] ].apply(_as_coord, axis=1)

Lista linii

In [0]:
def find_stops(line):
    """Return the id_nu values of the stops whose 'line' collection contains `line`.

    Only rows of the global `stops_df` with `actual` set and `deleted` not set
    are included; the result keeps the row order of `stops_df`.
    """
    serves_line = [line in served for served in stops_df['line']]
    candidates = stops_df[serves_line]
    in_use = candidates[candidates['actual'] & ~candidates['deleted']]
    return in_use['id_nu'].tolist()

# All distinct line numbers found across the stops. set().union(...) also
# works when stops_df has no rows, whereas set.union(*[]) raises TypeError.
lines_set = set().union(*stops_df['line'])
lines_df = pd.DataFrame(sorted(lines_set), columns=['line_nr'])
# Current, non-deleted stops of each line.
lines_df['stops'] = lines_df['line_nr'].map(find_stops)
# Number of such stops.
lines_df['stops_nr'] = lines_df['stops'].map(len)
lines_df.head()

Rozkłady

In [0]:
def take_schedule(id_nu, line):
    """Fetch the timetable records of `line` at one stop.

    `id_nu` is '<busstopId>_<busstopNr>' and must contain exactly one '_'
    (otherwise the unpacking below raises ValueError).

    Returns a list of dicts, one per record returned by the API; each dict
    holds 'line', 'id_nu' and every key/value pair of the record.
    Returns None (after printing a diagnostic line) when the request fails or
    the API answers with a string instead of a list of records.
    """
    stop_id, stop_nr = id_nu.split('_')
    api = 'https://api.um.warszawa.pl/api/action/dbtimetable_get?id=e923fa0e-d96c-43f9-ae6e-60518c9f3238' + \
        '&busstopId=' + stop_id + '&busstopNr=' + stop_nr + '&line=' + line + '&apikey=' + apikey
    try:
        with urllib.request.urlopen(api) as url:
            data = json.loads(url.read().decode())['result']
        if isinstance(data, str):
            # The API reports problems as a plain string result.
            raise Exception('API error')
        data_list = []
        for one in data:
            row_dict = {'line': line, 'id_nu': id_nu}
            row_dict.update((col['key'], col['value']) for col in one['values'])
            data_list.append(row_dict)
        return data_list
    except Exception:
        # Best-effort download: log and let the caller continue. Catching
        # Exception (not a bare except) keeps KeyboardInterrupt/SystemExit
        # working, so the long download loop can still be interrupted.
        print('error', stop_id, stop_nr, line)
        return None

In [0]:
# Download the timetable of every (line, stop) pair into `schedule_df`.
schedule_columns = ['line', 'id_nu', 'brygada', 'kierunek', 'trasa', 'czas', 'symbol_1', 'symbol_2']
if 'schedule_df' in dir():
    # The schedule is already in this session, so the API is not queried again.
    # (A top-level `return` cannot be used for this: it is a SyntaxError
    # outside a function and prevents the whole cell from running.)
    print('schedule_df already exists - skipping download')
else:
    print('starting new schedule_df')
    # Collect the per-stop frames and concatenate once; concatenating inside
    # the loop copies the growing frame on every iteration. The leading empty
    # frame keeps the expected columns even if nothing is downloaded.
    schedule_parts = [pd.DataFrame([], columns=schedule_columns)]
    for index, line in tqdm(lines_df.iterrows(), total=lines_df.shape[0]):
        for stop in line['stops']:
            one = take_schedule(stop, line['line_nr'])
            # take_schedule returns None on failure, which yields an empty frame here.
            schedule_parts.append(pd.DataFrame(one, columns=schedule_columns))
    schedule_df = pd.concat(schedule_parts)

In [0]:
# Calls that store the prepared frames in the HDF5 files read back later in
# this notebook. Left commented out - presumably so that re-running the
# notebook does not overwrite the stored data (confirm before enabling).
#stops_df.to_hdf('public_transport_warsaw.h5', key='stops_df_4')
#lines_df.to_hdf('public_transport_warsaw.h5', key='lines_df')
#schedule_df.to_hdf('public_transport_warsaw2.h5', key='schedule_df')

In [0]:
# Print the top-level keys stored in each of the two HDF5 files.
for h5_path in ('public_transport_warsaw.h5', 'public_transport_warsaw2.h5'):
    with h5py.File(h5_path, 'r') as hf:
        dataset_names = list(hf.keys())
    print(dataset_names)

['lines_df', 'stops_df', 'stops_df_2', 'stops_df_3', 'stops_df_4']
['schedule_df']


`stops_df` zawiera ~12 tys. przystanków – tyle zwrócił JSON przy pierwszym pobraniu.

Przy kolejnych pobraniach JSON zawierał już tylko ~7700 przystanków i to dla nich odpytywałem API dalej.

# Wczytanie przygotowanych danych z plików

Dane dla osób, które chcą przetwarzać je lokalnie:

https://drive.google.com/drive/folders/1-B3xscQ9NgU-QMWstIpq5NmBKneL3mWI?usp=sharing

In [0]:
# Fetch the prepared data files from Google Drive (Drive file id -> local file name).
drive_files = {
    '1FiPQf6rSEtwwKwzAyKoqzMbScSG_jvBF': 'lines.csv',
    '1ypIZUgPuor48qKt9ChwY6r3bi-tOf6b2': 'public_transport_warsaw.h5',
    '1yvskGIjN-7jY7i3UZ4yck4DZtPagxyYJ': 'public_transport_warsaw2.h5',
}
for file_id, local_name in drive_files.items():
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.GetContentFile(local_name)

In [0]:
# Load the three prepared frames from the HDF5 files and print their shapes.
stops_df, lines_df = (
    pd.read_hdf('public_transport_warsaw.h5', key=h5_key, mode='r')
    for h5_key in ('stops_df_4', 'lines_df')
)
schedule_df = pd.read_hdf('public_transport_warsaw2.h5', key='schedule_df', mode='r')
print(*(frame.shape for frame in (stops_df, lines_df, schedule_df)))

(7727, 19) (345, 3) (918719, 8)


In [0]:
# Example stop list for line 146 - ordered alphabetically, not along the route.
lines_df.loc[lines_df['line_nr'] == '146', 'stops'].iloc[0]

In [0]:
def dist_id_nu(nr1_id, nr2_id):
    """Return the geopy distance between two stops given by their id_nu.

    For each id the first row of the global `stops_df` with `actual` set and
    `deleted` not set supplies the 'coord' value; IndexError is raised when
    no such row exists.
    """
    def current_coord(id_nu):
        rows = stops_df[stops_df['id_nu'] == id_nu]
        rows = rows[rows['actual'] & ~rows['deleted']]
        return rows['coord'].iloc[0]

    return geopy.distance.distance(current_coord(nr1_id), current_coord(nr2_id))

def dist_stops(line, nr1, nr2):
    """Return the distance between two stops of `line`, chosen by list position.

    `nr1` and `nr2` index into the 'stops' list stored for `line` in the
    global `lines_df`; the result is whatever `dist_id_nu` returns for the
    two selected id_nu values.
    """
    line_stops = lines_df.loc[lines_df['line_nr'] == line, 'stops'].iloc[0]
    return dist_id_nu(line_stops[nr1], line_stops[nr2])

In [0]:
# Example: distance between the stops at positions 0 and 1 of line 146's stop list.
dist_stops('146', 0, 1)