# Network analysis of interstate bus lines in Brazil
## Dados Abertos ANTT

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
import string
import geopy
from geopy import distance
import unidecode
import networkx as nx
import graphviz as gv
import pygraphviz as pgv
import pydot

%matplotlib inline

In [582]:
# Load the bus-route records and the municipality table
# (municipality codes and GPS coordinates).

# The routes CSV carries an "Unnamed: 0" column (presumably a saved index --
# TODO confirm); it is dropped right after reading.
data_bus = pd.read_csv('data_bus_v4.csv').drop(columns="Unnamed: 0")

municipios_ibge = pd.read_csv('municipiosibge.csv')

## The making of the graph

There is more than one possible representation of the bus-line network: a directed graph or a directed multigraph. We're going to use a directed graph, since its adjacency matrix can represent both directions of a bus line.

Let $G$ represent the connections between cities present in our bus lines database, that is, cities that are either origin or destination for interstate bus lines. A value (weight) $g_{i,j}$  may represent the quantity of travelling done from location $i$ to location $j$.


For this, we're going to create a dataframe whose rows and columns are the cities with interstate lines (the nodes of our graph); each entry $g_{i,j}$ holds the number of times a line from $i$ to $j$ was travelled.

In [4]:
# Build the origin-destination trip-count matrix.

# Sorted list of every city that appears as an origin or as a destination.
cidades = sorted(set(data_bus.origem) | set(data_bus.destino))

# Number of rows in data_bus for each (origem, destino) pair that occurs.
origem_destino = dict(data_bus.groupby(["origem","destino"]).destino.count())

# Square matrix labelled by city on both axes: row = origin, column = destination.
# It starts as all zeros, so pairs with no recorded trip (and the diagonal,
# unless a city is paired with itself in the data) stay at 0.
matrix_quantity = pd.DataFrame(0.0, index=cidades, columns=cidades)

# Fill only the pairs that actually occur. `.at` writes straight into the
# frame; the chained form `df[col][row] = value` is not guaranteed to modify
# the original frame and does nothing under pandas Copy-on-Write.
for (origem, destino), quantidade in origem_destino.items():
    matrix_quantity.at[origem, destino] = float(quantidade)

# Test to see whether all values are zero. If they are, there's something wrong.

np.all(matrix_quantity==0)


False

The easiest way is to transform the dataframe into a matrix, as it's the most natural representation in our case.

In [6]:
# Convert matrix_quantity (a DataFrame when this cell first runs) to a float64 NumPy matrix.

# np.asarray accepts both the DataFrame and an already-converted matrix, so the
# cell can be re-run safely; calling .to_numpy() on the converted matrix would
# raise AttributeError.
# We force the dtype to 'float64' because otherwise it could be dtype('O') and
# networkx would be confused over it.
matrix_quantity = np.asmatrix(np.asarray(matrix_quantity, dtype='float64'))

matrix_quantity.dtype

dtype('float64')

In [37]:
# From NumPy matrix to networkx graph type
# Naming it Q because of Quantity

# nx.from_numpy_matrix was removed in NetworkX 3.0; nx.from_numpy_array is its
# replacement. np.asarray strips the np.matrix wrapper so the function receives
# a plain 2-D array. Non-zero entries become directed edges (row -> column)
# carrying the entry as their 'weight'.
# Nodes are the integer positions 0..n-1, so node k corresponds to cidades[k].
Q = nx.from_numpy_array(np.asarray(matrix_quantity), create_using=nx.DiGraph)

# Checks:

print(type(Q))

print(Q.number_of_nodes() == len(cidades))


<class 'networkx.classes.digraph.DiGraph'>
True


# Basic network analysis - graph properties

In [38]:
# Graph size: N is the number of nodes, K the number of (directed) edges.
N = len(Q)
K = Q.number_of_edges()

# Edges per node; in a directed graph this is both the mean in-degree
# and the mean out-degree.
avg_deg = K / N
print(avg_deg)

2.073253833049404


Each node has, on average, about 2 edges. But this isn't representative of our network, since some marginal cities are only an origin or only a destination, while others, such as capital cities, receive and send a massive quantity of buses across the country.

For a more interesting analysis we're going to introduce the notions of indegree and outdegree. Simply, we consider indegree the number of edges coming into a node in a directed graph and outdegree the number of edges leaving a node in a directed graph. For more about this, see: https://doi.org/10.1016/B978-0-12-804452-0.00005-1. 

Within our context, the indegree of a city is then the number of distinct cities from which buses arrive (arrivals), and its outdegree is the number of distinct cities to which buses depart (departures).

In [579]:
def _degree_histogram(degree_view):
    """Summarise a networkx degree view as a histogram.

    Parameters
    ----------
    degree_view : iterable of (node, degree) pairs
        For example the result of ``Q.in_degree()``.

    Returns
    -------
    values : list
        The distinct degrees, in ascending order.
    counts : list
        ``counts[k]`` is the number of nodes whose degree is ``values[k]``.
    """
    degrees = [degree for _, degree in degree_view]
    # np.unique returns the sorted distinct values and their multiplicities in
    # one pass, so the x axis is ordered and aligned with the counts.
    values, counts = np.unique(degrees, return_counts=True)
    return values.tolist(), counts.tolist()


Q_in = Q.in_degree()

Q_out = Q.out_degree()

# Histogram of # of outdegrees and indegrees
# (in_values / out_values are now sorted lists rather than unordered sets).

in_values, in_hist = _degree_histogram(Q_in)

out_values, out_hist = _degree_histogram(Q_out)

# Number of distinct in-degree values (same as len(in_values)).
len(in_hist)

23

In [578]:
# Plotting the indegrees and outdegrees histogram

# Pair every degree with its node count and sort by degree, so each line is
# drawn left to right even if the degree values arrive as an unordered set.
in_points = sorted(zip(in_values, in_hist))
out_points = sorted(zip(out_values, out_hist))

fig, ax = plt.subplots()
# Each series carries its own label, so the legend always matches what is drawn.
ax.plot([deg for deg, _ in in_points], [cnt for _, cnt in in_points],
        'ro-', label='In-degree (arrival)')
ax.plot([deg for deg, _ in out_points], [cnt for _, cnt in out_points],
        'bv-', label='Out-degree (departure)')
ax.legend()
ax.set_xlabel('Degree')
ax.set_ylabel('# of nodes')
ax.set_title('Interstate bus lines: arrivals and departures')
# fig.savefig('interstate_in_out.pdf')
# Show the figure; closing it here would discard it before it is rendered.
plt.show()