<a href="https://colab.research.google.com/github/guzmanlopez/montevideo-bus-forecast/blob/main/montevideo_bus_forecast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Curso Aprendizaje Automático para Datos en Grafos

**Docente:** Prof. Gonzalo Mateos (Universidad de Rochester, EEUU).

**Docente invitado:** Fernando Gama (Universidad de California Berkeley, EEUU).

**Otros docentes:** Marcelo Fiori y Federico La Rocca.

**Fechas:** 01/02/2021 al 04/02/2021 y 11/02/2021.

**Web:** [Página principal del curso en plataforma Eva](https://eva.fing.edu.uy/course/view.php?id=1484)



---



## Proyecto final del curso

### Predicción del flujo de pasajeros en las paradas de ómnibus del Sistema de Transporte Metropolitano (STM) de Montevideo

**Estudiante:** Guzmán López


---



Montar Google Drive y clonar en él el repositorio del proyecto desde GitHub:

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

%cd gdrive/My Drive/

Mounted at /content/gdrive
/content/gdrive/My Drive


In [2]:
!git clone https://github.com/guzmanlopez/montevideo-bus-forecast.git

fatal: destination path 'montevideo-bus-forecast' already exists and is not an empty directory.


In [5]:
%cd montevideo-bus-forecast/
!git pull

/content/gdrive/My Drive/montevideo-bus-forecast
remote: Enumerating objects: 13, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 8 (delta 6), reused 8 (delta 6), pack-reused 0[K
Unpacking objects: 100% (8/8), done.
From https://github.com/guzmanlopez/montevideo-bus-forecast
   053ddfa..e138f90  main       -> origin/main
Updating 053ddfa..e138f90
Fast-forward
 README.md                    | 21 [32m+++++++++++++++++++++[m
 src/preparation/constants.py |  4 [32m+[m[31m---[m
 2 files changed, 22 insertions(+), 3 deletions(-)


Instalar todas las librerías necesarias:

In [None]:
# Install required packages
!pip install pandas
!pip install geopandas
!pip install networkx
!pip install numpy
!pip install altair
!pip install requests
!pip install typer
!pip install pretty-errors
!pip install matplotlib
!pip install sklearn

# Instalar PyTorch
!pip install torch==1.8.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

# Instalar PyTorch Geometric
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install torch-geometric

# Instalar PyTorch Geometric Temporal
!pip install torch-geometric-temporal

Collecting geopandas
[?25l  Downloading https://files.pythonhosted.org/packages/d7/bf/e9cefb69d39155d122b6ddca53893b61535fa6ffdad70bf5ef708977f53f/geopandas-0.9.0-py2.py3-none-any.whl (994kB)
[K     |████████████████████████████████| 1.0MB 8.3MB/s 
Collecting pyproj>=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/b1/72/d52e9ca81caef056062d71991b0e9b1d16af042245627c5d0e4916a36c4f/pyproj-3.0.1-cp37-cp37m-manylinux2010_x86_64.whl (6.5MB)
[K     |████████████████████████████████| 6.5MB 47.0MB/s 
[?25hCollecting fiona>=1.8
[?25l  Downloading https://files.pythonhosted.org/packages/ea/2a/404b22883298a3efe9c6ef8d67acbf2c38443fa366ee9cd4cd34e17626ea/Fiona-1.8.19-cp37-cp37m-manylinux1_x86_64.whl (15.3MB)
[K     |████████████████████████████████| 15.3MB 170kB/s 
Collecting munch
  Downloading https://files.pythonhosted.org/packages/cc/ab/85d8da5c9a45e072301beb37ad7f833cd344e04c817d97e0cc75681d248f/munch-2.5.0-py2.py3-none-any.whl
Collecting cligj>=0.5
  Downloading https

Descargar y procesar datos hasta obtener finalmente el grafo que usaremos para modelar:

In [None]:
# Note: this download can take some time because the file is 2.5 GB
%run src/preparation/download_stm_bus_data.py

In [None]:
# Run the preparation scripts for bus stops and bus tracks
# (presumably they download both layers, going by the script names)
%run src/preparation/download_bus_stops.py
%run src/preparation/download_bus_tracks.py

In [None]:
# Run the processing scripts in this order: STM bus data, bus line tracks and
# stops, then the sorting of bus stops along each track (per the script names)
%run src/processing/process_stm_bus_data.py
%run src/processing/build_bus_line_tracks_and_stops.py
%run src/processing/sort_bus_stops_along_bus_track.py

In [None]:
# Run the script for the adjacency matrix first, then the script that builds the graph
%run src/processing/build_adyacency_matrix.py
%run src/processing/build_graph.py

# Análisis Exploratorio de Datos

In [None]:
import geopandas as gpd
import networkx as nx
import pandas as pd
from notebooks.eda.plots import (
    plot_boardings_by_day_name,
    plot_boardings_by_hour_and_day_name,
    plot_boardings_by_time,
)
from src.preparation.constants import BUFFER, BUS_LINES, CRS, DAY_NAME_MAPPING, PROCESSED_FILE
from src.preparation.utils import (
    load_pickle_file,
    load_spatial_data,
    load_stm_bus_data,
    load_stm_bus_line_track,
    load_stm_bus_stops,
    save_pickle_file,
)
from src.processing.process_stm_bus_data import pre_process_data
from src.processing.utils import (
    build_adyacency_matrix,
    build_bus_line_tracks_and_stops,
    fix_bus_stop_order,
)


In [None]:
# Load the STM bus data and pre-process it in a single step, so the raw frame
# is not kept under a separate name. Later cells group `df` with pd.Grouper,
# so the result presumably carries a datetime index - TODO confirm.
df = pre_process_data(load_stm_bus_data())

In [None]:
# Total boardings per hourly bin, turned back into a regular two-column frame
hourly_totals = df.groupby([pd.Grouper(freq="1H")])["cantidad_pasajeros"].sum()
df_hourly = hourly_totals.reset_index()
plot_boardings_by_time(df_hourly)

In [None]:
# Step 1: total boardings for each calendar day (kept together with its day name)
daily_totals = df.groupby([pd.Grouper(freq="1D"), "nombre_dia"])["cantidad_pasajeros"].sum()

# Step 2: median of those daily totals for each day name
median_by_day_name = daily_totals.groupby(["nombre_dia"]).median()
df_day_name = median_by_day_name.reset_index()

plot_boardings_by_day_name(df_day_name)

In [None]:
# Hourly sum of boardings and median by hour of day and day name.
# set_index returns a new frame, so df_hourly itself is left untouched.
df_hourly_day_name = df_hourly.set_index("fecha_evento")

# Derive the day name from the timestamp index, then apply DAY_NAME_MAPPING.
# The mapped values are assigned back explicitly: calling
# .replace(..., inplace=True) on a .loc selection can act on a temporary object
# and leave the frame unchanged (always the case under pandas copy-on-write).
df_hourly_day_name["nombre_dia"] = df_hourly_day_name.index.day_name()
df_hourly_day_name["nombre_dia"] = df_hourly_day_name["nombre_dia"].replace(DAY_NAME_MAPPING)
df_hourly_day_name["hora"] = df_hourly_day_name.index.hour

# Median of the hourly totals for every (hour, day name) combination
df_hourly_day_name = df_hourly_day_name.groupby(["hora", "nombre_dia"]).median().reset_index()

plot_boardings_by_hour_and_day_name(df_hourly_day_name)

In [None]:
# Rank bus lines by their typical weekly boardings.
# 1) total boardings per calendar day, day name and bus line
daily_by_line = df.groupby([pd.Grouper(freq="1D"), "nombre_dia", "dsc_linea"])[
    "cantidad_pasajeros"
].sum()

# 2) median daily total for each (line, day name) pair
median_by_line_and_day = daily_by_line.groupby(["dsc_linea", "nombre_dia"]).median()

# 3) add the day-name medians per line, largest first
weekly_by_line = median_by_line_and_day.groupby("dsc_linea").sum()
df_weekly_by_day_name_and_line = weekly_by_line.sort_values(ascending=False).reset_index()

# 4) split the lines into 10 quantile bins; labels=False yields integer ranks
df_weekly_by_day_name_and_line["decile_rank"] = pd.qcut(
    df_weekly_by_day_name_and_line["cantidad_pasajeros"], 10, labels=False
)

In [None]:
# Share of the summed boardings contributed by each decile
df_decile_rank_prop = df_weekly_by_day_name_and_line.groupby("decile_rank").sum().reset_index()
boardings_per_decile = df_decile_rank_prop["cantidad_pasajeros"]
df_decile_rank_prop["proportion"] = boardings_per_decile / boardings_per_decile.sum()

# Keep the bus lines whose decile rank is 9, ordered from most to fewest boardings
in_rank_nine = df_weekly_by_day_name_and_line["decile_rank"] == 9
df_bus_lines = df_weekly_by_day_name_and_line.loc[in_rank_nine, :].sort_values(
    "cantidad_pasajeros", ascending=False
)

In [None]:
# %% [markdown]
# ## Build bus line tracks

# %%
# Load the processed dataset stored at PROCESSED_FILE
# NOTE(review): presumably backed by pickle; only load files produced by this project
df_proc = load_pickle_file(PROCESSED_FILE)

# Load bus stops (all STM bus stops, going by the loader name)
gdf_bus_stops = load_stm_bus_stops()

# Load bus tracks (STM bus line tracks, going by the loader name)
gdf_bus_tracks = load_stm_bus_line_track()

# %%
# Read all bus stops by bus line from geojson files.
# DataFrame.append was removed in pandas 2.0, so the per-line frames are
# collected first and concatenated once. The original index labels are kept,
# exactly as append did.
all_bus_stops = pd.concat(
    [load_spatial_data(bus_line, type="bus_stop") for bus_line in BUS_LINES]
).set_crs(CRS)

# Read all bus tracks by bus line from geojson files, tagging each track with
# its line. A dedicated name is used for the per-line frame so the notebook-wide
# `df` (the boardings data) is not overwritten.
bus_track_frames = []
for bus_line in BUS_LINES:
    gdf_bus_track = load_spatial_data(bus_line, type="bus_line")
    gdf_bus_track["line"] = bus_line
    bus_track_frames.append(gdf_bus_track)
all_bus_tracks = pd.concat(bus_track_frames).set_crs(CRS)

# %%
# Get ordered bus stops and bus tracks from files.
# DataFrame.append was removed in pandas 2.0, so the per-line frames are
# gathered in lists and concatenated once after the loop.
ordered_stop_frames, ordered_track_frames = [], []

for bus_line in BUS_LINES:
    df_bus_stop_ordered = load_spatial_data(bus_line, type="bus_stop_ordered")
    ordered_stop_frames.append(df_bus_stop_ordered)

    df_bus_track_ordered = load_spatial_data(bus_line, type="bus_track_ordered")
    # Tag each track with its line so tracks can be filtered by DESC_LINEA later
    df_bus_track_ordered["DESC_LINEA"] = bus_line
    ordered_track_frames.append(df_bus_track_ordered)

all_bus_stops_ordered = pd.concat(ordered_stop_frames).set_crs(CRS)
# Stop codes are used as integer keys further down (adjacency matrix lookups)
all_bus_stops_ordered = all_bus_stops_ordered.astype({"COD_UBIC_P": "int"})
all_bus_tracks_ordered = pd.concat(ordered_track_frames).set_crs(CRS)

# %%
# Fix the stop order from the origin for every line except "405".
# Line "183" is the only one processed with reorder=True.
# NOTE(review): this runs after the ordered stops were loaded above; if
# fix_bus_stop_order rewrites the files, the frames in memory may be stale - confirm.
for bus_line in BUS_LINES:
    if bus_line == "405":
        continue
    if bus_line == "183":
        fix_bus_stop_order(bus_line, reorder=True)
    else:
        fix_bus_stop_order(bus_line)

# %%
# For every stop code, join the lines found at that stop with "|" and count them
stops_grouped_by_code = all_bus_stops_ordered.groupby(["COD_UBIC_P"])
lines_per_stop = stops_grouped_by_code.agg(
    lines=("DESC_LINEA", "|".join), number_of_lines=("DESC_LINEA", len)
)
shared_bus_stops = (
    lines_per_stop.round(0)
    .sort_values("COD_UBIC_P", ascending=True)
    .reset_index()
    .astype({"COD_UBIC_P": int})
)

# Print the distinct line combinations at stops that appear for more than one line
served_by_several = shared_bus_stops["number_of_lines"] > 1
print(shared_bus_stops.loc[served_by_several, "lines"].unique())


# %%
# Get distances for shared bus stations.
# Select the stops served by more than one line (single .loc call, no chained indexing).
is_shared = shared_bus_stops["number_of_lines"] > 1
shared = shared_bus_stops.loc[is_shared, ["COD_UBIC_P", "lines"]]

for bus_stop, bus_lines in zip(shared["COD_UBIC_P"], shared["lines"]):
    bus_lines = bus_lines.split("|")
    for bus_line in bus_lines:
        # Rows of this line at the shared stop. The previous mask and-ed the line
        # filter with a bare column (`["COD"]`) instead of comparing the stop code.
        bus_stops_by_line = all_bus_stops_ordered.loc[
            (all_bus_stops_ordered["DESC_LINEA"] == bus_line)
            & (all_bus_stops_ordered["COD_UBIC_P"] == bus_stop),
            :,
        ]
        bus_tracks_by_line = all_bus_tracks_ordered.loc[
            all_bus_tracks_ordered["DESC_LINEA"] == bus_line, :
        ]
        # TODO: the distance computation announced by this section is not
        # implemented; the two selections above are not used yet.

# %%
# Load the adjacency matrix and the from/to/weight table from their CSV files.
# The commented call below is kept from the original cell as the in-notebook alternative:
# df_adyacency_matrix, df_from_to_weight = build_adyacency_matrix(control=True)
df_adyacency_matrix = pd.read_csv("data/processed/adyacency_matrix.csv", index_col=0)
# CSV headers are read as text; turn the column labels into integers
df_adyacency_matrix.columns = [int(label) for label in df_adyacency_matrix.columns]

df_from_to_weight = pd.read_csv("data/processed/from_to_weight.csv", index_col=0)

# %%
# Check adyacency matrix: look up the matrix entry for every pair of
# consecutive stops of line "103".
bus_stops_103 = all_bus_stops_ordered.loc[all_bus_stops_ordered["DESC_LINEA"] == "103", :]
bus_stops_list = bus_stops_103["COD_UBIC_P"].unique()
dist_between_stops = list()

# Walk the unique stop codes pairwise by position. The previous label-based
# `.loc[i, ...]` lookup only worked if the filtered frame kept a unique 0..n-1
# index, which is not guaranteed after concatenating the per-line frames.
for bus_stop_start, bus_stop_end in zip(bus_stops_list[:-1], bus_stops_list[1:]):
    d = df_adyacency_matrix.loc[bus_stop_start, bus_stop_end]
    dist_between_stops.append(d)

print(dist_between_stops)


# %%
# Directed graph built from the from/to/weight edge list
G = nx.from_pandas_edgelist(
    df_from_to_weight, source="from", target="to", edge_attr="weight", create_using=nx.DiGraph
)
G.name = "Bus lines of Montevideo"
# nx.info() was removed in NetworkX 3.0; printing the graph shows its summary
print(G)

# %%
# Fixed seed so the spring layout (random initial positions) is reproducible
layout = nx.spring_layout(G, seed=42)
nx.draw(G, layout, with_labels=True)


# %%
# Build directed graph from A. matrix
G = nx.from_pandas_adjacency(df_adyacency_matrix, create_using=nx.DiGraph)
G.name = "Graph of bus lines of Montevideo"
print(G)

# %%
# Same matrix as a plain NumPy array; nodes are then numbered by position
# instead of being labelled with the stop codes
A = df_adyacency_matrix.values
G = nx.from_numpy_array(A, parallel_edges=False, create_using=nx.DiGraph)
G.name = "Graph of main buses lines of Montevideo"
print(G)


# %%
# Toy 3x3 adjacency matrix to check how NetworkX reads a pandas adjacency frame.
# Dedicated names are used so the notebook-wide `df`, `A` and `G` are not
# overwritten. Previously both graph constructors were commented out, so this
# cell renamed and printed the graph built in the section above.
df_toy_adjacency = pd.DataFrame([[0, 0, 0], [1, 0, 0], [0, 1, 0]])
print(df_toy_adjacency)

# The matrix is not symmetric, so it is read as a directed graph
G_toy = nx.from_pandas_adjacency(df_toy_adjacency, create_using=nx.DiGraph)
G_toy.name = "Graph from pandas adjacency matrix"
print(G_toy)
