In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import geopy

import geopandas as gpd
from shapely.geometry import Point

In [2]:
#La ou est située le fichier train.csv sur votre ordinateur 
path = "/Users/gilles/Documents/Entretiens_data_2020/My_Traffic/"

In [3]:
train = pd.read_csv(path+"train.csv")[:1000]
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [4]:
np.shape(train)

(1000, 11)

In [5]:
train.to_csv("train_sample.csv")

## 1. Etude préalable

In [27]:
train.isnull().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [28]:
train.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object

In [29]:
train.trip_duration.describe()

count     1000.000000
mean       924.104000
std       2721.217211
min          3.000000
25%        414.000000
50%        672.000000
75%       1074.250000
max      84594.000000
Name: trip_duration, dtype: float64

In [30]:
#On considère qu'un trajet ne peut pas durer plus de 24h
duree_max = 24*60*60
print(duree_max)

86400


## 2. Réponses aux question

On cherche à calculer les indicateurs ci-dessous :
- la vitesse moyenne de chaque trajet,
- le nombre de trajets effectués en fonction du jour de la semaine,
- le nombre de trajets effectués en fonction de l’horaire de la journée par tranche de 4h,
- le nombre de km parcourus par jour de la semaine.

### 2.1 Vitesse moyenne de chaque trajet

Pour avoir la vitesse moyenne de chaque trajet, il nous faut calculer la distance de chaque trajet à partir des coordonnées de départ et d'arrivée.

On va estimer à la baisse cette distance en calculant seulement la distance à vol d'oiseau entre le point de départ et le point d'arrivée. Pour ce faire on va utiliser la distance haversine.

#### 2.1.1 Calcul de la distance (au moins 2min)

In [31]:
#Il s'agit de la distance great cricle / alternative à la geodesic 
def haversine_distance(lat1, lon1, lat2, lon2):
    r = 6371
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) *   np.sin(delta_lambda / 2)**2
    res = r * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))
    return np.round(res,5)*1000

In [33]:
haversine_distance(train["pickup_latitude"].iloc[0], train["pickup_longitude"].iloc[0], train["dropoff_latitude"].iloc[0], train["dropoff_longitude"].iloc[0] )

1498.52

In [32]:
def speed_estimate(df):
    
    #On crée d'abord la variable distance
    df["distance"] = df.apply(lambda x: haversine_distance(x["pickup_latitude"], x["pickup_longitude"], x["dropoff_latitude"], x["dropoff_longitude"]), axis = 1)
    df["speed"] = (df["distance"] / df["trip_duration"]) * 3.6
    
    return df

In [35]:
train_1 = speed_estimate(train)
train_1.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance,speed
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1498.52,11.856422
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1805.51,9.803674
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,6385.1,10.822203
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1485.5,12.465734
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1188.59,9.836607


In [36]:
train_1.speed.describe()

count    1000.000000
mean       14.434033
std         7.959612
min         0.000000
25%         8.963443
50%        12.770952
75%        18.324439
max        64.095727
Name: speed, dtype: float64

### 2.2 Nombre de trajets en fonction du jour de la semaine

In [45]:
#import calendar
import datetime

In [46]:
calendar.mdays

[0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

In [None]:
#ON se base plutot sur Pickup datetime pour le nombre de trajets

In [47]:
train.iloc[0]["pickup_datetime"]

'2016-03-14 17:24:55'

In [55]:
train["pickup_datetime"].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') if type(x) == str else x)

0     2016-03-14 17:24:55
1     2016-06-12 00:43:35
2     2016-01-19 11:35:24
3     2016-04-06 19:32:31
4     2016-03-26 13:30:55
              ...        
995   2016-05-20 10:53:52
996   2016-01-26 02:01:09
997   2016-03-17 11:51:09
998   2016-02-24 16:25:29
999   2016-02-13 21:41:23
Name: pickup_datetime, Length: 1000, dtype: datetime64[ns]

In [49]:
date_time_obj = datetime.datetime.strptime(train.iloc[0]["pickup_datetime"], '%Y-%m-%d %H:%M:%S')

In [64]:
ISOWeekDays = ("NoZeroInISOWeek","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")

ISOWeekDays[date_time_obj.isoweekday()]

'Monday'

In [67]:
def weekday(df):
    
    ISOWeekDays = ("NoZeroInISOWeek","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
    
    df["weekday"] = df["pickup_datetime"].apply(lambda x: ISOWeekDays[datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').isoweekday()] if type(x) == str else x)
    
    return df 
    

In [68]:
train = weekday(train)
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance,speed,weekday
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1498.52,11.856422,Monday
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1805.51,9.803674,Sunday
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,6385.1,10.822203,Tuesday
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1485.5,12.465734,Wednesday
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1188.59,9.836607,Saturday


In [75]:
train.weekday.value_counts().to_dict()

{'Friday': 152,
 'Saturday': 151,
 'Thursday': 142,
 'Tuesday': 141,
 'Wednesday': 140,
 'Sunday': 138,
 'Monday': 136}

### 2.3 Nombre de trajets en fonction de l'horaire de la journée (par tranche de 4h)



In [85]:
date_time_obj = datetime.datetime.strptime(train.iloc[0]["pickup_datetime"], '%Y-%m-%d %H:%M:%S')
date_time_obj.hour

17

In [88]:
def check_hour(x):
    if 0 <= x < 4:
        return '0-4'
    elif 4 <= x < 8:
        return '4-8'  
    elif 8 <= x < 12:
        return '8-12'
    elif 12 <= x < 16:
        return '12-16'
    elif 16 <= x < 20:
        return '16-20'
    else:
        return '20-24'

In [93]:
def hour(df):
    
    df["hour"] =  df["pickup_datetime"].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').hour if type(x) == str else x)
    
    return (df)

In [94]:
train = hour(train)
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance,speed,weekday,hour
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1498.52,11.856422,Monday,17
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1805.51,9.803674,Sunday,0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,6385.1,10.822203,Tuesday,11
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1485.5,12.465734,Wednesday,19
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1188.59,9.836607,Saturday,13


In [96]:
train["hour_4"] = train["hour"].apply(lambda x: check_hour(x))

In [98]:
train.hour_4.value_counts().to_dict()

{'16-20': 238, '20-24': 211, '8-12': 190, '12-16': 189, '0-4': 94, '4-8': 78}

### 2.4 Nombre de km parcourus par jour de la semaine

In [81]:
# Il faut à la fois récupérer la distance parcourue et le jour de la semaine
(train.groupby("weekday")["distance"].sum()/10000).to_dict()

{'Friday': 46.59824000000001,
 'Monday': 48.06478500000001,
 'Saturday': 48.935283999999996,
 'Sunday': 55.711375,
 'Thursday': 42.71878700000001,
 'Tuesday': 46.61376400000002,
 'Wednesday': 55.328784000000006}

La limitation est que si un trajet a commencé juste avant minuit et qu'il s'est arrêté après minuit, je compte toutes la distance parcoure pour le jour d'avant 