# Create powerful data insights and visualization :  
# Paris public renting bikes data 

In [29]:
import pandas as pd
import numpy as np

from datetime import datetime
import cleaning_data 

In [5]:
df = cleaning_data.cleaning( 'allData.csv') 

In [10]:
df

Unnamed: 0,station_id,name,lat,lon,capacity,stationCode_x,rental_methods,timestamp,datetime,num_bikes_available,num_docks_available,is_installed,is_returning,is_renting,last_reported,mechanical_available,ebike_available
0,213688169,Benjamin Godard - Victor Hugo,48.865983,2.275725,35,16107,,1586872772.0,2020-04-14 15:59:32,11,24,1,1,1,1586870837,3,8
1,99950133,André Mazet - Saint-André des Arts,48.85375581057431,2.3390958085656166,55,6015,['CREDITCARD'],1586872772.0,2020-04-14 15:59:32,26,27,1,1,1,1586870920,25,1
2,516709288,Charonne - Robert et Sonia Delauney,48.85590755596891,2.3925706744194035,20,11104,,1586872772.0,2020-04-14 15:59:32,8,12,1,1,1,1586871146,5,3
3,36255,Toudouze - Clauzel,48.87929591733507,2.3373600840568547,21,9020,['CREDITCARD'],1586872772.0,2020-04-14 15:59:32,12,9,1,1,1,1586871022,7,5
4,37815204,Mairie du 12ème,48.84085531176338,2.3875549435615544,30,12109,,1586872772.0,2020-04-14 15:59:32,10,20,1,1,1,1586870873,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528816,27415004,Général Michel Bizot - Claude Decaen,48.83481262982544,2.400927788127704,38,12039,['CREDITCARD'],1587982287.0,2020-04-27 12:11:27,29,9,1,1,1,1587982237,26,3
528817,27415128,Ivry - Baudricourt,48.824695995415006,2.363106073577468,20,13037,,1587982287.0,2020-04-27 12:11:27,5,15,1,1,1,1587981719,3,2
528818,27414937,Saint-Mandé - Docteur Arnold Netter,48.84462585769668,2.404946386814118,39,12017,['CREDITCARD'],1587982287.0,2020-04-27 12:11:27,34,5,1,1,1,1587982212,22,12
528819,66507230,Saint-Marcel - Hôpital,48.83950442458303,2.360989417022932,21,13013,['CREDITCARD'],1587982287.0,2020-04-27 12:11:27,15,6,1,1,1,1587982228,14,1


## Adding new meaningfull data
### datetime

In [None]:
#Convert the datetime column in datetime.datetime class
df['datetime'] = df['datetime'].apply( lambda x : datetime.strptime( x, '%Y-%m-%d %H:%M:%S'))

In [55]:
# Extract the day 
df['day'] = df[ 'datetime'].apply( lambda x : x.day ) 

In [82]:
# Convert into weekdays
df['weekday'] = df[ 'datetime'].apply( lambda x : x.weekday() )
df['weekday'] = df['weekday'].map( 
    { 0:'Mon', 1:'Tue', 2:'Wed', 3:'Thu', 4:'Fri', 5:'Sat', 6:'Sun'}
    )

It has to be noticed that the stations' data are __updated once an hour__ by the data source.
It means that even if the extract program is run several times in an hour, the data gathered are the same


In order to simplify the further analysis, it is more convinient to __round the hours__ of the day 

In [67]:
# New column with rounded hours 
df['hour_r'] = df["datetime"].dt.round("H").dt.hour

In [84]:
df.head(1)

Unnamed: 0,station_id,name,lat,lon,capacity,stationCode_x,rental_methods,timestamp,datetime,num_bikes_available,num_docks_available,is_installed,is_returning,is_renting,last_reported,mechanical_available,ebike_available,day,hour_r,weekday
0,213688169,Benjamin Godard - Victor Hugo,48.865983,2.275725,35,16107,,1586872772.0,2020-04-14 15:59:32,11,24,1,1,1,1586870837,3,8,14,16,Tue


### Location 
We will use the __geopy package__ to convert the _lat_ and _lon_ data into an understandable adress

In [187]:
from geopy.geocoders import TomTom

# With the own geolocation service TomTom class :
geolocator = TomTom( api_key =  'XtY5TGp0LMPPoMrmgoOWAT06FihLGbbG')

{'address29': 'Benjamin Godard - Victor Hugo', 'road': 'Rue Benjamin Godard', 'suburb': 'Quartier de la Porte-Dauphine', 'city': 'Paris', 'municipality': 'Paris', 'county': 'Paris', 'state': 'Île-de-France', 'country': 'France', 'postcode': '75116', 'country_code': 'fr'}


In [None]:
geolocator.reverse("48.865983, 2.275725").raw

In [201]:
# To limit the number of query : 
# add delay between geocoding calls to reduce the load on the Geocoding service
# https://geopy.readthedocs.io/en/stable/#usage-with-pandas
from geopy.extra.rate_limiter import RateLimiter
# Set 0.5s between each call 
geocode = RateLimiter( geolocator2.reverse, min_delay_seconds = 1 )

In [202]:
geocode("48.865983, 2.275725")

Location(2 Rue Benjamin Godard, Paris, 75116, (48.865936, 2.275709, 0.0))

Creating a station reference dataframe (to limit the time) 

In [130]:
stations_loc = df[ ['station_id', 'name', 'lat', 'lon']].drop_duplicates(subset= 'station_id')
# Syntax understandable for geopy.
stations_loc['lat_lon'] = stations_loc['lat'] + ", " + stations_loc['lon']

In [203]:
stations_loc['address_raw'] = stations_loc['lat_lon'].apply( lambda x: geocode(x).raw['address'] )

In [205]:
stations_loc.to_csv(' stations_info.csv ')

## General information 
Contextualize the dataset in terms of date and number of bikes

#### Period

In [54]:
#Start Date 
start = df['datetime'].min() ;

14

In [72]:
# End date 
end = df['datetime'].max() ; end

Timestamp('2020-04-27 12:11:27')

In [39]:
#Period 
period = end- start ; period

Timedelta('12 days 20:11:55')

### @