In [0]:
import pandas as pd
import numpy as np
df_sample = pd.read_csv("echentillon.csv")



In [0]:
print("Nombre de lignes :", df_sample.count())
print("Nombre de colonnes :", len(df_sample.columns))


Nombre de lignes : VendorID                 1191357
tpep_pickup_datetime     1191357
tpep_dropoff_datetime    1191357
passenger_count          1124628
trip_distance            1191357
RatecodeID               1124628
store_and_fwd_flag       1124628
PULocationID             1191357
DOLocationID             1191357
payment_type             1191357
fare_amount              1191357
extra                    1191357
mta_tax                  1191357
tip_amount               1191357
tolls_amount             1191357
improvement_surcharge    1191357
total_amount             1191357
congestion_surcharge     1124628
airport_fee               413160
Airport_fee               711468
dtype: int64
Nombre de colonnes : 20


In [0]:
len(df_sample[df_sample['tpep_dropoff_datetime'] < df_sample['tpep_pickup_datetime']])

176

In [0]:
df_sample.isnull().sum()


VendorID                      0
tpep_pickup_datetime          0
tpep_dropoff_datetime         0
passenger_count           66729
trip_distance                 0
RatecodeID                66729
store_and_fwd_flag        66729
PULocationID                  0
DOLocationID                  0
payment_type                  0
fare_amount                   0
extra                         0
mta_tax                       0
tip_amount                    0
tolls_amount                  0
improvement_surcharge         0
total_amount                  0
congestion_surcharge      66729
airport_fee              778197
Airport_fee              479889
dtype: int64

In [0]:
numeric_cols = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "congestion_surcharge",
    "airport_fee",
    "Airport_fee"
]
(df_sample[numeric_cols] < 0).sum()


passenger_count              0
trip_distance                0
fare_amount              13466
extra                     6111
mta_tax                  11789
tip_amount                  75
tolls_amount               804
improvement_surcharge    12223
total_amount             12263
congestion_surcharge      9921
airport_fee                299
Airport_fee               1471
dtype: int64

In [0]:
print(df_sample.columns)

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'Airport_fee'],
      dtype='object')


In [0]:
# 🔹 1️⃣ Passenger count → minimum (1)
df_sample['passenger_count'] = df_sample['passenger_count'].fillna(1)

# 🔹 2️⃣ RatecodeID → standard 1
df_sample['RatecodeID'] = df_sample['RatecodeID'].fillna(1)

# 🔹 3️⃣ store_and_fwd_flag → 'N'
df_sample['store_and_fwd_flag'] = df_sample['store_and_fwd_flag'].fillna('N')

# 🔹 4️⃣ congestion_surcharge → 0
df_sample['congestion_surcharge'] = df_sample['congestion_surcharge'].fillna(0)

# 🔹 5️⃣ Fusion airport_fee et Airport_fee
    # Méthode 1 : prendre la première valeur non nulle

df_sample['airport_fee'] = df_sample['airport_fee'].fillna(df_sample['Airport_fee'])
df_sample = df_sample.drop(columns=['Airport_fee'])


# 🔹 6️⃣ Vérification des valeurs manquantes
print(df_sample.isna().sum())


VendorID                     0
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count              0
trip_distance                0
RatecodeID                   0
store_and_fwd_flag           0
PULocationID                 0
DOLocationID                 0
payment_type                 0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge         0
airport_fee              66729
dtype: int64


In [0]:
df_sample['airport_fee'] = df_sample['airport_fee'].fillna(0)
print(df_sample.isna().sum())



VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
airport_fee              0
dtype: int64


In [0]:
df_sample['tpep_pickup_datetime'] = pd.to_datetime(df_sample['tpep_pickup_datetime'], errors='coerce')
df_sample['tpep_dropoff_datetime'] = pd.to_datetime(df_sample['tpep_dropoff_datetime'], errors='coerce')


# Vérification
print(df_sample.dtypes)

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object


In [0]:
mask_inverted = df_sample['tpep_pickup_datetime'] > df_sample['tpep_dropoff_datetime']
mask_inverted.sum()  


np.int64(176)

In [0]:
# تخزين مؤقت
temp = df_sample.loc[mask_inverted, 'tpep_pickup_datetime']

df_sample.loc[mask_inverted, 'tpep_pickup_datetime'] = \
    df_sample.loc[mask_inverted, 'tpep_dropoff_datetime']

df_sample.loc[mask_inverted, 'tpep_dropoff_datetime'] = temp


# Statistique

###Prix moyen d’une course (fare_amount)

In [0]:
# Moyenne sur ton échantillon
mean_fare_sample = df_sample['fare_amount'].mean()

In [0]:


n = len(df_sample)
mean_fare = df_sample['fare_amount'].mean()
std_fare = df_sample['fare_amount'].std()

ci_fare = stats.t.interval(
    0.95,              # niveau de confiance
    df=n-1,
    loc=mean_fare,
    scale=std_fare/np.sqrt(n)
)

print("Prix moyen :", mean_fare)
print("IC 95% fare_amount :", ci_fare)


Prix moyen : 17.86224662296859
IC 95% fare_amount : (np.float64(17.82621078497561), np.float64(17.89828246096157))


###Distance moyenne d’une course (trip_distance

In [0]:
mean_distance = df_sample['trip_distance'].mean()
std_distance = df_sample['trip_distance'].std()

ci_distance = stats.t.interval(
    0.95,
    df=n-1,
    loc=mean_distance,
    scale=std_distance/np.sqrt(n)
)

print("Distance moyenne :", mean_distance)
print("IC 95% trip_distance :", ci_distance)


Distance moyenne : 5.674406496121651
IC 95% trip_distance : (np.float64(4.779263993771752), np.float64(6.569548998471549))


###Durée moyenne des courses

In [0]:
df_sample['trip_duration_min'] = (
    df_sample['tpep_dropoff_datetime'] - df_sample['tpep_pickup_datetime']
).dt.total_seconds() / 60

df_sample[['trip_duration_min']].head()


Unnamed: 0,trip_duration_min
0,7.25
1,11.083333
2,7.2
3,0.166667
4,6.0


In [0]:
mean_duration = df_sample['trip_duration_min'].mean()
std_duration = df_sample['trip_duration_min'].std()

ci_duration = stats.t.interval(
    0.95,
    df=n-1,
    loc=mean_duration,
    scale=std_duration/np.sqrt(n)
)

print("Durée moyenne (min) :", mean_duration)
print("IC 95% durée :", ci_duration)


Durée moyenne (min) : 26.08164731478473
IC 95% durée : (np.float64(9.100326134524604), np.float64(43.062968495044856))


###Proportion des courses avec tip > 0

In [0]:
from scipy.stats import norm
prop_tip = (df_sample['tip_amount'] > 0).mean()

se = np.sqrt(prop_tip * (1 - prop_tip) / n)
z = norm.ppf(0.975)   # 95%

ci_tip = (
    prop_tip - z * se,
    prop_tip + z * se
)

print("Proportion tip > 0 :", prop_tip)
print("IC 95% proportion tip :", ci_tip)


Proportion tip > 0 : 0.7441480597335643
IC 95% proportion tip : (np.float64(0.74336453778844), np.float64(0.7449315816786887))


###Distribution des courses par heure/jour/semaine

In [0]:
# Par heure
df_sample['pickup_hour'] = df_sample['tpep_pickup_datetime'].dt.hour
hour_distribution = df_sample['pickup_hour'].value_counts().sort_index()
print(hour_distribution)

# Par jour de la semaine
df_sample['pickup_day'] = df_sample['tpep_pickup_datetime'].dt.day_name()
day_distribution = df_sample['pickup_day'].value_counts()
print(day_distribution)

# Par semaine du mois
df_sample['pickup_week'] = df_sample['tpep_pickup_datetime'].dt.isocalendar().week
week_distribution = df_sample['pickup_week'].value_counts()
print(week_distribution)


pickup_hour
0     33838
1     22319
2     14712
3      9714
4      6817
5      7430
6     17347
7     33189
8     45267
9     50505
10    54930
11    59665
12    64828
13    66850
14    71605
15    73617
16    73950
17    80636
18    84590
19    75486
20    67008
21    66606
22    61606
23    48842
Name: count, dtype: int64
pickup_day
Thursday     185599
Wednesday    179076
Friday       178507
Saturday     176492
Tuesday      172272
Monday       150161
Sunday       149250
Name: count, dtype: int64
pickup_week
50    27797
49    26683
43    26430
20    26177
19    25668
44    25559
42    25503
23    25496
45    25206
41    24996
14    24696
16    24526
18    24499
24    24470
10    24381
15    24301
39    24140
11    24113
13    23832
17    23729
12    23650
46    23638
25    23612
37    23523
36    23499
48    23449
40    23273
9     23124
22    22991
51    22809
21    22768
29    22623
6     22326
28    22230
7     22139
47    22090
8     22043
26    21833
30    21367
31    20953
5    

### Comparaison des fares selon zones géographiques (pickup / dropoff)

In [0]:
fare_by_zone = df_sample.groupby('PULocationID')['fare_amount'].agg(
    mean_fare='mean',
    std_fare='std',
    n='count'
)
fare_by_zone.sort_values(by='mean_fare', ascending=False).head(10)



Unnamed: 0_level_0,mean_fare,std_fare,n
PULocationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,87.75,52.362549,214
115,75.1,14.000714,2
172,69.5,,1
265,69.377184,68.635248,1424
5,66.8,0.894427,5
23,61.184091,20.378358,22
44,60.755556,4.096069,9
132,55.649312,32.95469,59236
2,54.954,27.572814,5
8,54.415714,24.6771,7


### Intervalle de confiance 95 %

In [0]:
def ci_95(mean, std, n):
    z = stats.norm.ppf(0.975)
    return (
        mean - z * std / np.sqrt(n),
        mean + z * std / np.sqrt(n)
    )

fare_by_zone['IC_95'] = fare_by_zone.apply(
    lambda x: ci_95(x['mean_fare'], x['std_fare'], x['n']),
    axis=1
)

fare_by_zone.sort_values(by='mean_fare', ascending=False).head(10)

Unnamed: 0_level_0,mean_fare,std_fare,n,IC_95
PULocationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,87.75,52.362549,214,"(80.73444577216253, 94.76555422783747)"
115,75.1,14.000714,2,"(55.69635655305346, 94.50364344694653)"
172,69.5,,1,"(nan, nan)"
265,69.377184,68.635248,1424,"(65.8123418466541, 72.94202613087398)"
5,66.8,0.894427,5,"(66.01601440618397, 67.58398559381602)"
23,61.184091,20.378358,22,"(52.6686735769936, 69.69950824118821)"
44,60.755556,4.096069,9,"(58.079506593287775, 63.431604517823345)"
132,55.649312,32.95469,59236,"(55.383929293926954, 55.91469451591841)"
2,54.954,27.572814,5,"(30.785806892787274, 79.12219310721271)"
8,54.415714,24.6771,7,"(36.13499873201269, 72.69642983941587)"


### Analyse des outliers (courses très longues / très chères)

In [0]:
Q1 = df_sample['fare_amount'].quantile(0.25)
Q3 = df_sample['fare_amount'].quantile(0.75)
IQR = Q3 - Q1

outliers_fare = df[
    (df_sample['fare_amount'] < Q1 - 1.5*IQR) |
    (df_sample['fare_amount'] > Q3 + 1.5*IQR)
]


In [0]:
mean_fare_all = df_sample['fare_amount'].mean()
mean_fare_no_out = df_sample.loc[~df.index.isin(outliers_fare.index), 'fare_amount'].mean()

mean_fare_all, mean_fare_no_out


(np.float64(17.86224662296859), np.float64(13.523906087387003))

In [0]:
def mean_ci(series):
    n = series.count()
    mean = series.mean()
    std = series.std()
    z = stats.norm.ppf(0.975)
    return mean, (mean - z*std/np.sqrt(n), mean + z*std/np.sqrt(n))

mean_ci(df['fare_amount'])
mean_ci(df.loc[~df.index.isin(outliers_fare.index), 'fare_amount'])


(np.float64(13.523906087387003),
 (np.float64(13.50943434117445), np.float64(13.538377833599556)))

### Ratio tip / fare par type de paiement (cash vs card)

In [0]:
payment_map = {
    1: 'Card',
    2: 'Cash'
}

df['payment_label'] = df['payment_type'].map(payment_map)

df['tip_fare_ratio'] = df['tip_amount'] / df['fare_amount']
df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=['tip_fare_ratio'])
ratio_stats = df.groupby('payment_label')['tip_fare_ratio'].agg(['mean','std','count'])

def confidence_interval(mean, std, n, alpha=0.05):
    z = stats.norm.ppf(1 - alpha/2)
    margin = z * (std / np.sqrt(n))
    return (mean - margin, mean + margin)

ratio_stats['IC_95'] = ratio_stats.apply(
    lambda x: confidence_interval(x['mean'], x['std'], x['count']),
    axis=1
)

ratio_stats



Unnamed: 0_level_0,mean,std,count,IC_95
payment_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Card,0.265797,8.517975,904600,"(0.24824334296934775, 0.2833497334255201)"
Cash,-3.3e-05,0.004989,197256,"(-5.496869596676581e-05, -1.0937535165268912e-05)"
