# Create powerful data insights and visualization :  
# Paris public renting bikes data 

In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
import cleaning_data

In [2]:
#df = cleaning_data.cleaning( 'allData.csv') 
df = pd.read_csv( 'cleanedData.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,station_id,name,lat,lon,capacity,stationCode_x,rental_methods,timestamp,datetime,num_bikes_available,num_docks_available,is_installed,is_returning,is_renting,last_reported,mechanical_available,ebike_available
0,0,213688169,Benjamin Godard - Victor Hugo,48.865983,2.275725,35,16107,,1.586873e+09,2020-04-14 15:59:32,11,24,1,1,1,1586870837,3,8
1,1,99950133,André Mazet - Saint-André des Arts,48.853756,2.339096,55,6015,['CREDITCARD'],1.586873e+09,2020-04-14 15:59:32,26,27,1,1,1,1586870920,25,1
2,2,516709288,Charonne - Robert et Sonia Delauney,48.855908,2.392571,20,11104,,1.586873e+09,2020-04-14 15:59:32,8,12,1,1,1,1586871146,5,3
3,3,36255,Toudouze - Clauzel,48.879296,2.337360,21,9020,['CREDITCARD'],1.586873e+09,2020-04-14 15:59:32,12,9,1,1,1,1586871022,7,5
4,4,37815204,Mairie du 12ème,48.840855,2.387555,30,12109,,1.586873e+09,2020-04-14 15:59:32,10,20,1,1,1,1586870873,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426604,528816,27415004,Général Michel Bizot - Claude Decaen,48.834813,2.400928,38,12039,['CREDITCARD'],1.587982e+09,2020-04-27 12:11:27,29,9,1,1,1,1587982237,26,3
426605,528817,27415128,Ivry - Baudricourt,48.824696,2.363106,20,13037,,1.587982e+09,2020-04-27 12:11:27,5,15,1,1,1,1587981719,3,2
426606,528818,27414937,Saint-Mandé - Docteur Arnold Netter,48.844626,2.404946,39,12017,['CREDITCARD'],1.587982e+09,2020-04-27 12:11:27,34,5,1,1,1,1587982212,22,12
426607,528819,66507230,Saint-Marcel - Hôpital,48.839504,2.360989,21,13013,['CREDITCARD'],1.587982e+09,2020-04-27 12:11:27,15,6,1,1,1,1587982228,14,1


## Adding new meaningfull data
### datetime

In [None]:
#Convert the datetime column in datetime.datetime class
df['datetime'] = df['datetime'].apply( lambda x : datetime.strptime( x, '%Y-%m-%d %H:%M:%S'))

In [55]:
# Extract the day 
df['day'] = df[ 'datetime'].apply( lambda x : x.day ) 

In [82]:
# Convert into weekdays
df['weekday'] = df[ 'datetime'].apply( lambda x : x.weekday() )
df['weekday'] = df['weekday'].map( 
    { 0:'Mon', 1:'Tue', 2:'Wed', 3:'Thu', 4:'Fri', 5:'Sat', 6:'Sun'}
    )

It has to be noticed that the stations' data are __updated once an hour__ by the data source.
It means that even if the extract program is run several times in an hour, the data gathered are the same


In order to simplify the further analysis, it is more convinient to __round the hours__ of the day 

In [67]:
# New column with rounded hours 
df['hour_r'] = df["datetime"].dt.round("H").dt.hour

In [84]:
df.head(1)

Unnamed: 0,station_id,name,lat,lon,capacity,stationCode_x,rental_methods,timestamp,datetime,num_bikes_available,num_docks_available,is_installed,is_returning,is_renting,last_reported,mechanical_available,ebike_available,day,hour_r,weekday
0,213688169,Benjamin Godard - Victor Hugo,48.865983,2.275725,35,16107,,1586872772.0,2020-04-14 15:59:32,11,24,1,1,1,1586870837,3,8,14,16,Tue


### Location 
We will use the __geopy package__ to convert the _lat_ and _lon_ data into an understandable adress  
First we create a different dataframe to have all the stations' info in a simple dataframe

In [None]:
# stations_loc = df[ ['station_id', 'name', 'lat', 'lon', 'station_id', 'capacity', 'is_installed', 'is_returning', 'is_renting']].drop_duplicates(subset= 'station_id')

# Syntax understandable for geopy.geocoders methods :
stations_loc['lat_lon'] = stations_loc['lat'] + ", " + stations_loc['lon']

In [149]:
stations_loc.head(1)

Unnamed: 0,station_id,name,lat,lon,lat_lon,address_raw,municipality,postcode,cityDistrict,capacity,is_installed,is_returning,is_renting
0,213688169,Benjamin Godard - Victor Hugo,48.865983,2.275725,"48.865983, 2.275725","{'buildingNumber': '2', 'streetNumber': '2', '...",Paris,75116,16,35,1,1,1


In [28]:
from geopy.geocoders import TomTom

# With the own geolocation service TomTom class :
geolocator = TomTom( api_key =  'XtY5TGp0LMPPoMrmgoOWAT06FihLGbbG')

In [118]:
#Exemple
geolocator.reverse("48.865983, 2.275725")

Location(2 Rue Benjamin Godard, Paris, 75116, (48.865936, 2.275709, 0.0))

In [34]:
# To limit the number of query :
# add delay between geocoding calls to reduce the load on the Geocoding service
# https://geopy.readthedocs.io/en/stable/#usage-with-pandas
from geopy.extra.rate_limiter import RateLimiter
# Set 0.5s between each call
geocode = RateLimiter( geolocator.reverse, min_delay_seconds = 1 )

In [203]:
stations_loc['address_raw'] = stations_loc['lat_lon'].apply(
                                lambda x: geocode(x).raw ) # attribute to obtain a dictionnary 

In [None]:
def district(x):
    # If in Paris return the disctrict of paris 
    if x['postalCode'][0:2]=='75': 
        return x['postalCode'][3:5]
    # otherwise return the city name
    else: return x['municipality']

In [84]:
# Split raw address dictionnaries
stations_loc[ 'municipality'] = stations_loc['address_raw'].apply( lambda x: x['municipality'])
stations_loc[ 'postcode'] = stations_loc['address_raw'].apply( lambda x: x['postalCode'])
stations_loc[ 'cityDistrict'] =  stations_loc['address_raw'].apply( district)

**Save or import the stations_info CSV**

In [151]:
#Save to CSV file
stations_loc.to_csv('stations_info.csv')
#stations_loc = pd.read_csv( 'stations_info.csv')

## General information 
Contextualize the dataset in terms of date and number of bikes

#### Period

In [54]:
#Start Date 
start = df['datetime'].min() ;

14

In [72]:
# End date 
end = df['datetime'].max() ; end

Timestamp('2020-04-27 12:11:27')

In [39]:
#Period 
period = end- start ; period

Timedelta('12 days 20:11:55')