In [1]:
# import the necessary packages

import os
import json
import time
import folium
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#ignore warnings
import warnings
warnings.filterwarnings('ignore')


pd.set_option('display.max_columns', None)

# Gather the Data


Columns :  
Code de la station                         int64  
Nom de la station                          str  
Etat des stations                          str  
Etat du Totem                              bool  
Nombres de bornes en station               int64  
Nombre de bornes disponibles               int64  
Nombre de vélo mécanique                   int64  
Nombre vélo électrique                     int64  
Achat possible en station (CB)             bool  
PARK + activation                          bool    
Nombre vélo en PARK+                       int64   
geo                                        str (Geographic coordinate system)  
Request time                               datetime64  

See the data model at https://opendata.paris.fr/explore/dataset/velib-disponibilite-en-temps-reel/information/


In [12]:
%%time

# set the endpoint API URL
url = "https://opendata.paris.fr/explore/dataset/velib-disponibilite-en-temps-reel/download/?format=csv&timezone=Europe/Berlin&use_labels_for_header=true&csv_separator=%3B"
    
df = pd.read_csv(url, delimiter=";" )

# Track request_time for comparison with record_timestamp
request_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

df["Request time"] = request_time

print("[INFO] Done.")

[INFO] Done.
Wall time: 1.02 s


In [13]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Code de la station,Nom de la station,Etat des stations,Etat du Totem,Nombres de bornes en station,Nombre de bornes disponibles,Nombre de vélo mécanique,Nombre vélo électrique,Achat possible en station (CB),PARK + activation,Nombre vélo en PARK+,geo,Request time
0,16107,Benjamin Godard - Victor Hugo,Operative,yes,35,35,0,0,no,no,0,"48.865983,2.275725",2020-02-10 11:48:40
1,6015,André Mazet - Saint-André des Arts,Operative,yes,52,42,5,5,yes,no,0,"48.8537558106,2.33909580857",2020-02-10 11:48:40
2,11104,Charonne - Robert et Sonia Delauney,Operative,yes,20,15,1,3,no,no,0,"48.855907556,2.39257067442",2020-02-10 11:48:40
3,9020,Toudouze - Clauzel,Operative,yes,21,11,6,3,yes,no,0,"48.8792959173,2.33736008406",2020-02-10 11:48:40
4,12109,Mairie du 12ème,Operative,yes,30,19,5,6,no,no,0,"48.8408553118,2.38755494356",2020-02-10 11:48:40


In [14]:
# Preview the last five rows
df.tail(n=5)

Unnamed: 0,Code de la station,Nom de la station,Etat des stations,Etat du Totem,Nombres de bornes en station,Nombre de bornes disponibles,Nombre de vélo mécanique,Nombre vélo électrique,Achat possible en station (CB),PARK + activation,Nombre vélo en PARK+,geo,Request time
1393,6009,Guynemer - Jardin du Luxembourg,Operative,yes,36,23,6,7,yes,no,0,"48.8466127,2.3325478",2020-02-10 11:48:40
1394,26002,Redoute - Les Courtilles,Work in progress,no,0,0,0,0,no,no,0,"48.9300297447,2.28401234155",2020-02-10 11:48:40
1395,26003,Argenteuil - Voltaire,Work in progress,no,0,0,0,0,no,no,0,"48.91872207,2.2814373672",2020-02-10 11:48:40
1396,15201,Porte de la plaine - Lefebvre,Work in progress,no,0,0,0,0,no,no,0,"48.8306173775,2.29206342083",2020-02-10 11:48:40
1397,42706,Redoute - Pierre de Coubertin,Work in progress,no,0,0,0,0,no,no,0,"48.8146570818,2.36120063106",2020-02-10 11:48:40


In [15]:
# Report the dimensions of the downloaded table
n_rows, n_cols = df.shape
print('The data has {} Rows and {} columns'.format(n_rows, n_cols))

The data has 1398 Rows and 13 columns


In [16]:
# Announce, then display, the dtype of each column
print("The types of columns are:")
column_types = df.dtypes
column_types

The types of columns are:


Code de la station                 int64
Nom de la station                 object
Etat des stations                 object
Etat du Totem                     object
Nombres de bornes en station       int64
Nombre de bornes disponibles       int64
Nombre de vélo mécanique           int64
Nombre vélo électrique             int64
Achat possible en station (CB)    object
PARK + activation                 object
Nombre vélo en PARK+               int64
geo                               object
Request time                      object
dtype: object

In [18]:
# Parse the request timestamp strings into datetime64 values
request_time_format = "%Y-%m-%d %H:%M:%S"
df['Request time'] = pd.to_datetime(df['Request time'], format=request_time_format)
df.dtypes

Code de la station                         int64
Nom de la station                         object
Etat des stations                         object
Etat du Totem                             object
Nombres de bornes en station               int64
Nombre de bornes disponibles               int64
Nombre de vélo mécanique                   int64
Nombre vélo électrique                     int64
Achat possible en station (CB)            object
PARK + activation                         object
Nombre vélo en PARK+                       int64
geo                                       object
Request time                      datetime64[ns]
dtype: object

### Extract Descriptive Statistics of Each Column

In [19]:
def num_missing(x):
    """Return the number of missing (NaN/None) entries in the Series ``x``."""
    return x.isna().sum()

def num_unique(x):
    """Return the number of distinct values in ``x``.

    Missing values (NaN/None) are counted together as one distinct value.

    ``np.unique`` sorts its input, so it raises ``TypeError`` on object
    columns that mix strings with missing values; ``nunique`` is hash-based
    and has no such restriction.

    Parameters
    ----------
    x : pandas.Series or array-like
        Values to inspect (one DataFrame column when used with ``df.apply``).

    Returns
    -------
    int
        Count of distinct values, with all missing entries counted once.
    """
    return pd.Series(x).nunique(dropna=False)

# Output of describe(), transposed
temp_df = df.describe().transpose()

# Per-column missing and distinct counts, each as a single-column frame
missing_df = df.apply(num_missing, axis=0).to_frame(name='missing')
unq_df = df.apply(num_unique, axis=0).to_frame(name='unique')

# Per-column dtype, also as a single-column frame
types_df = df.dtypes.to_frame(name='DataType')

In [20]:
# Print the descriptive statistics of numerical columns:
# start from the describe() table and attach the extra per-column frames one by one
summary_df = temp_df
for extra_df in (missing_df, unq_df, types_df):
    summary_df = summary_df.join(extra_df)
summary_df

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing,unique,DataType
Code de la station,1398.0,18065.404149,12071.030887,1001.0,11004.5,16011.5,21204.75,92008.0,0,1398,int64
Nombres de bornes en station,1398.0,25.490701,15.843801,0.0,18.0,25.0,35.0,73.0,0,69,int64
Nombre de bornes disponibles,1398.0,17.473534,13.929738,0.0,4.0,17.0,26.0,67.0,0,65,int64
Nombre de vélo mécanique,1398.0,5.679542,6.641146,0.0,1.0,3.0,9.0,40.0,0,38,int64
Nombre vélo électrique,1398.0,3.444921,3.52019,0.0,1.0,2.0,5.0,26.0,0,24,int64
Nombre vélo en PARK+,1398.0,0.036481,0.364042,0.0,0.0,0.0,0.0,12.0,0,4,int64


In [22]:
# Print the descriptive statistics of categorical columns

cat_index = [
    'Code de la station',
    'Nom de la station',
    'Etat des stations',
    'Etat du Totem',
    'Achat possible en station (CB)',
    'PARK + activation',
    'geo',
    'Request time',
]

# The inner join against the selected dtype rows keeps only the categorical columns
cat_types_df = types_df.loc[cat_index]
counts_df = missing_df.join(unq_df)
summary_df_cat = counts_df.join(cat_types_df, how='inner')
summary_df_cat

Unnamed: 0,missing,unique,DataType
Code de la station,0,1398,int64
Nom de la station,0,1393,object
Etat des stations,0,3,object
Etat du Totem,0,2,object
Achat possible en station (CB),0,2,object
PARK + activation,0,2,object
geo,0,1398,object
Request time,0,1,datetime64[ns]


In [23]:
# Build a pandas-profiling report for the whole table and save it as HTML
import pandas_profiling

report_path = "report_velib.html"
profile = pandas_profiling.ProfileReport(df)
profile.to_file(output_file=report_path)

### EDA
Thanks to pandas profiling, we can see that there are 1338 operative stations, 45 with work in progress, and 15 closed.

In [28]:
# From here on the notebook addresses columns by underscore names
# (e.g. 'Etat_des_stations'). Do the rename explicitly in this cell rather than
# relying on leftover kernel state, so Restart & Run All keeps working.
# Replacing spaces is a no-op once the names have none, so re-running is safe.
df = df.rename(columns=lambda name: name.replace(' ', '_'))
df.head()

Unnamed: 0,Code_de_la_station,Nom_de_la_station,Etat_des_stations,Etat_du_Totem,Nombres_de_bornes_en_station,Nombre_de_bornes_disponibles,Nombre_de_vélo_mécanique,Nombre_vélo_électrique,Achat_possible_en_station_(CB),PARK_+_activation,Nombre_vélo_en_PARK+,geo,Request_time
0,16107,Benjamin Godard - Victor Hugo,Operative,yes,35,35,0,0,no,no,0,"48.865983,2.275725",2020-02-10 11:48:40
1,6015,André Mazet - Saint-André des Arts,Operative,yes,52,42,5,5,yes,no,0,"48.8537558106,2.33909580857",2020-02-10 11:48:40
2,11104,Charonne - Robert et Sonia Delauney,Operative,yes,20,15,1,3,no,no,0,"48.855907556,2.39257067442",2020-02-10 11:48:40
3,9020,Toudouze - Clauzel,Operative,yes,21,11,6,3,yes,no,0,"48.8792959173,2.33736008406",2020-02-10 11:48:40
4,12109,Mairie du 12ème,Operative,yes,30,19,5,6,no,no,0,"48.8408553118,2.38755494356",2020-02-10 11:48:40


In [27]:
# Keep only the operative stations.
# - The rename makes this cell work whether or not df's column names still
#   contain spaces (it is a no-op when they already use underscores).
# - .copy() gives velib_op its own data, so adding columns to it later does not
#   write into a filtered slice of df.
velib_op = (
    df.rename(columns=lambda name: name.replace(' ', '_'))
    .loc[lambda frame: frame['Etat_des_stations'] == 'Operative']
    .copy()
)

# Available bikes

In [29]:
from colour import Color

# Ten-step gradient running from red to green.
# The start colour is built inline so that the name `red` is bound only once,
# to the function below, instead of first to a Color and then to a function.
colors = list(Color("red").range_to(Color("green").hex, 10))

def red(brightness):
    """Return the gradient colour for a brightness between 0.0 and 1.0.

    0.0 maps to the first entry of ``colors`` (red) and 1.0 to the last
    (green). Values outside the range are clamped: without clamping a negative
    value would index from the end of the list and a value above 1.0 could
    raise IndexError.
    """
    brightness = min(max(brightness, 0.0), 1.0)
    # convert from 0.0-1.0 to a list index 0..len(colors)-1
    index = int(round((len(colors) - 1) * brightness))
    return colors[index]

In [42]:
# Extract latitude and longitude from geo (a "lat,long" string).
# assign() returns a new frame, so the columns are not written into a filtered
# slice of df (which triggers SettingWithCopyWarning, currently hidden by the
# global warnings filter). Re-running the cell just recomputes both columns.
velib_op = velib_op.assign(
    lat=lambda frame: frame['geo'].apply(lambda geo: float(geo.split(',')[0])),
    long=lambda frame: frame['geo'].apply(lambda geo: float(geo.split(',')[1])),
)

In [44]:
# Preview the first five operative stations
velib_op.head(n=5)

Unnamed: 0,Code_de_la_station,Nom_de_la_station,Etat_des_stations,Etat_du_Totem,Nombres_de_bornes_en_station,Nombre_de_bornes_disponibles,Nombre_de_vélo_mécanique,Nombre_vélo_électrique,Achat_possible_en_station_(CB),PARK_+_activation,Nombre_vélo_en_PARK+,geo,Request_time,lat,long,available_velib,Available_velib
0,16107,Benjamin Godard - Victor Hugo,Operative,yes,35,35,0,0,no,no,0,"48.865983,2.275725",2020-02-10 11:48:40,48.865983,2.275725,0,0
1,6015,André Mazet - Saint-André des Arts,Operative,yes,52,42,5,5,yes,no,0,"48.8537558106,2.33909580857",2020-02-10 11:48:40,48.853756,2.339096,10,10
2,11104,Charonne - Robert et Sonia Delauney,Operative,yes,20,15,1,3,no,no,0,"48.855907556,2.39257067442",2020-02-10 11:48:40,48.855908,2.392571,4,4
3,9020,Toudouze - Clauzel,Operative,yes,21,11,6,3,yes,no,0,"48.8792959173,2.33736008406",2020-02-10 11:48:40,48.879296,2.33736,9,9
4,12109,Mairie du 12ème,Operative,yes,30,19,5,6,no,no,0,"48.8408553118,2.38755494356",2020-02-10 11:48:40,48.840855,2.387555,11,11


In [45]:
# Number of available velib = mechanical bikes + electric bikes.
# Adding the two columns is vectorized (no per-row Python call as with
# apply(axis=1)), and assign() returns a new frame instead of writing into a
# filtered slice of df.
velib_op = velib_op.assign(
    Available_velib=lambda frame: frame['Nombre_de_vélo_mécanique'] + frame['Nombre_vélo_électrique']
)


In [46]:
from html import escape
from folium.plugins import MarkerCluster

# Base map centred on the coordinates used throughout this notebook
m = folium.Map(
    location=[48.85, 2.35],
    tiles='Cartodb Positron',
    zoom_start=11
)

marker_cluster = MarkerCluster(
    name='Velib clustered',
    overlay=True,
    control=False,
    icon_create_function=None
)

popup_template = 'Station:{}<br>Number of Velibs:{}<br>Number of available docks:{}'
station_columns = ['lat', 'long', 'Nom_de_la_station', 'Available_velib', 'Nombre_de_bornes_disponibles']

# Iterate over just the needed columns as plain tuples: one marker per station
for lat, long, name, bikes, docks in velib_op[station_columns].itertuples(index=False, name=None):
    marker = folium.Marker(location=(lat, long))
    # Station names come from the downloaded data and are inserted into popup
    # HTML, so escape them; the <br> tags of the template stay as markup.
    popup = popup_template.format(escape(str(name)), bikes, docks)
    folium.Popup(popup).add_to(marker)
    marker_cluster.add_child(marker)

marker_cluster.add_to(m)

m

In [48]:
from folium.plugins import HeatMap
import branca.colormap as cm

# Fresh base map for the heat layer
m = folium.Map(location=[48.85, 2.35], tiles='Cartodb Positron', zoom_start=11)

# One (lat, long, Available_velib) row per station
heat_points = velib_op[['lat', 'long', 'Available_velib']].values
HeatMap(heat_points).add_to(m)

# Five-colour legend with evenly spaced stops, rescaled to the 0-350 range
legend_colors = ['blue', 'green',  'yellow', 'orange', 'red']
legend = cm.LinearColormap(legend_colors, index=np.linspace(0, 1, 5)).scale(0, 350)
legend.add_to(m)

m