## 1. IMPORTING PYTHON LIBRARIES

In [1]:
# to work with data: dataframes, statistics & regular expressions
import pandas as pd
import numpy as np
import re
import pandas_profiling as pdp # suggested by Dani in Slack

# for data viz
%matplotlib inline
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns

## 2. DATA WRANGLING 

In [2]:
def importing_csv(csv_path):
    df = pd.read_csv(csv_path)
    return df

def raw(df):
    print('shape:',df.shape)
    print('\n columns:',df.shape)
    print('\n variables info:')    
    return df.info(),df.describe()

def valcount(df, var):
    return df[var].value_counts()

def split_datetime(df,datetime):
    year,month,date,time = str(datetime+'_year'),str(datetime+'_month'),str(datetime+'_date'),str(datetime+'_time')
    df[year] = pd.to_datetime(df[datetime]).dt.year
    df[month] = pd.to_datetime(df[datetime]).dt.month
    df[date] = pd.to_datetime(df[datetime]).dt.date
    df[time] = pd.to_datetime(df[datetime]).dt.time
    return df[[datetime,year,month,date,time]]

def second_to_min(df,second):
    minutes = str(second+'_min')
    df[minutes]=df[second]/60
    return df[minutes,second]

def centavos_a_pesos(df,centavos):
    pesos = str(centavos+'_pesos')
    df[pesos] = df[centavos]/100
    return df[pesos,centavos]

In [3]:
csv_path = './db/tableau_project.csv'
data = importing_csv(csv_path)

In [4]:
raw(data)

shape: (3864, 22)

 columns: (3864, 22)

 variables info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3864 entries, 0 to 3863
Data columns (total 22 columns):
Unnamed: 0             3864 non-null int64
journey_id             3864 non-null object
vehicle_type_id        3864 non-null object
start_type             3864 non-null object
start_at               3864 non-null object
start_lat              3864 non-null float64
start_lon              3864 non-null float64
end_at                 3864 non-null object
end_lat                3807 non-null float64
end_lon                3807 non-null float64
end_state              3864 non-null object
price                  3844 non-null object
price_distance         2812 non-null object
price_duration         2812 non-null object
distance               3844 non-null object
duration               3844 non-null object
source                 3864 non-null object
rider_waiting_time     2733 non-null object
driver_waiting_time    2685 non-null obj

(None,
         Unnamed: 0    start_lat    start_lon      end_lat      end_lon  \
 count  3864.000000  3864.000000  3864.000000  3807.000000  3807.000000   
 mean   1931.500000    19.412159   -99.195443    19.410835   -99.197221   
 std    1115.585048     0.028146     0.040832     0.030001     0.044896   
 min       0.000000    19.290513   -99.288274    19.196463   -99.655665   
 25%     965.750000    19.405320   -99.207814    19.379486   -99.253980   
 50%    1931.500000    19.422749   -99.175834    19.422749   -99.178435   
 75%    2897.250000    19.426613   -99.170483    19.429759   -99.171955   
 max    3863.000000    19.513544   -99.051114    19.517451   -99.048817   
 
             rating  
 count  3864.000000  
 mean      8.289596  
 std       1.265744  
 min       4.000000  
 25%       8.000000  
 50%       9.000000  
 75%       9.000000  
 max      10.000000  )

In [5]:
data.isna().sum()

Unnamed: 0                0
journey_id                0
vehicle_type_id           0
start_type                0
start_at                  0
start_lat                 0
start_lon                 0
end_at                    0
end_lat                  57
end_lon                  57
end_state                 0
price                    20
price_distance         1052
price_duration         1052
distance                 20
duration                 20
source                    0
rider_waiting_time     1131
driver_waiting_time    1179
price_supplements         0
discount                 20
rating                    0
dtype: int64

In [6]:
data.nunique()

Unnamed: 0             3864
journey_id             3864
vehicle_type_id           2
start_type                2
start_at               3861
start_lat              1403
start_lon              1356
end_at                 3864
end_lat                 498
end_lon                 493
end_state                 5
price                  1417
price_distance         2171
price_duration          585
distance               2237
duration               1606
source                    3
rider_waiting_time     1076
driver_waiting_time     628
price_supplements        32
discount                  3
rating                    6
dtype: int64

In [11]:
pdp.ProfileReport(data)

0,1
Number of variables,22
Number of observations,3864
Total Missing (%),5.4%
Total size in memory,664.2 KiB
Average record size in memory,176.0 B

0,1
Numeric,6
Categorical,14
Boolean,0
Date,0
Text (Unique),2
Rejected,0
Unsupported,0

0,1
Distinct count,3864
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1931.5
Minimum,0
Maximum,3863
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,193.15
Q1,965.75
Median,1931.5
Q3,2897.2
95-th percentile,3669.8
Maximum,3863.0
Range,3863.0
Interquartile range,1931.5

0,1
Standard deviation,1115.6
Coef of variation,0.57757
Kurtosis,-1.2
Mean,1931.5
MAD,966
Skewness,0
Sum,7463316
Variance,1244500
Memory size,30.3 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
721,1,0.0%,
657,1,0.0%,
2704,1,0.0%,
653,1,0.0%,
2700,1,0.0%,
649,1,0.0%,
2696,1,0.0%,
645,1,0.0%,
2692,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
3859,1,0.0%,
3860,1,0.0%,
3861,1,0.0%,
3862,1,0.0%,
3863,1,0.0%,

0,1
Distinct count,4
Unique (%),0.1%
Missing (%),0.5%
Missing (n),20

0,1
0,3842
4000,1
6317,1
(Missing),20

Value,Count,Frequency (%),Unnamed: 3
0,3842,99.4%,
4000,1,0.0%,
6317,1,0.0%,
(Missing),20,0.5%,

0,1
Distinct count,2238
Unique (%),57.9%
Missing (%),0.5%
Missing (n),20

0,1
0,1141
3241,5
3031,4
Other values (2234),2694
(Missing),20

Value,Count,Frequency (%),Unnamed: 3
0,1141,29.5%,
3241,5,0.1%,
3031,4,0.1%,
3307,4,0.1%,
3206,4,0.1%,
3147,4,0.1%,
3392,4,0.1%,
3295,4,0.1%,
3164,4,0.1%,
2763,4,0.1%,

0,1
Distinct count,629
Unique (%),16.3%
Missing (%),30.5%
Missing (n),1179

0,1
4,95
3,95
11,78
Other values (625),2417
(Missing),1179

Value,Count,Frequency (%),Unnamed: 3
4,95,2.5%,
3,95,2.5%,
11,78,2.0%,
2,71,1.8%,
5,70,1.8%,
7,64,1.7%,
1,54,1.4%,
6,53,1.4%,
12,52,1.3%,
10,47,1.2%,

0,1
Distinct count,1607
Unique (%),41.6%
Missing (%),0.5%
Missing (n),20

0,1
0,1132
614,7
674,7
Other values (1603),2698
(Missing),20

Value,Count,Frequency (%),Unnamed: 3
0,1132,29.3%,
614,7,0.2%,
674,7,0.2%,
1039,7,0.2%,
917,7,0.2%,
841,7,0.2%,
744,6,0.2%,
649,6,0.2%,
1057,6,0.2%,
489,6,0.2%,

First 3 values

Last 3 values

Value,Count,Frequency (%),Unnamed: 3
2017-01-23 14:59:46,1,0.0%,
2017-01-23 15:57:39,1,0.0%,
2017-01-23 19:09:56,1,0.0%,
2017-01-23 20:10:19,1,0.0%,
2017-01-23 21:58:24,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2017-12-21 21:46:16,1,0.0%,
2017-12-21 23:16:54,1,0.0%,
2017-12-21 23:41:15,1,0.0%,
2017-12-22 16:39:39,1,0.0%,
2017-12-22 22:10:50,1,0.0%,

0,1
Distinct count,499
Unique (%),12.9%
Missing (%),1.5%
Missing (n),57
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,19.411
Minimum,19.196
Maximum,19.517
Zeros (%),0.0%

0,1
Minimum,19.196
5-th percentile,19.358
Q1,19.379
Median,19.423
Q3,19.43
95-th percentile,19.441
Maximum,19.517
Range,0.32099
Interquartile range,0.050273

0,1
Standard deviation,0.030001
Coef of variation,0.0015456
Kurtosis,1.4827
Mean,19.411
MAD,0.024438
Skewness,-0.81215
Sum,73897
Variance,0.00090005
Memory size,30.3 KiB

Value,Count,Frequency (%),Unnamed: 3
19.4227491,282,7.3%,
19.3769093949,204,5.3%,
19.4230911,182,4.7%,
19.3591670605,168,4.3%,
19.438666,134,3.5%,
19.3772193,123,3.2%,
19.441434215999998,112,2.9%,
19.379485831300002,103,2.7%,
19.4219595249,102,2.6%,
19.406537533199998,94,2.4%,

Value,Count,Frequency (%),Unnamed: 3
19.1964633,1,0.0%,
19.2117022,1,0.0%,
19.2604216,1,0.0%,
19.2826098,1,0.0%,
19.2901477,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
19.4991068,9,0.2%,
19.5026657,1,0.0%,
19.503963,1,0.0%,
19.510239,1,0.0%,
19.5174513,1,0.0%,

0,1
Distinct count,494
Unique (%),12.8%
Missing (%),1.5%
Missing (n),57
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,-99.197
Minimum,-99.656
Maximum,-99.049
Zeros (%),0.0%

0,1
Minimum,-99.656
5-th percentile,-99.273
Q1,-99.254
Median,-99.178
Q3,-99.172
95-th percentile,-99.16
Maximum,-99.049
Range,0.60685
Interquartile range,0.082024

0,1
Standard deviation,0.044896
Coef of variation,-0.00045259
Kurtosis,4.1208
Mean,-99.197
MAD,0.036002
Skewness,-0.73337
Sum,-377640
Variance,0.0020156
Memory size,30.3 KiB

Value,Count,Frequency (%),Unnamed: 3
-99.1749373,283,7.3%,
-99.2549008504,204,5.3%,
-99.1702686,188,4.9%,
-99.2704596743,168,4.3%,
-99.1808212,134,3.5%,
-99.2545027,123,3.2%,
-99.18350312860001,112,2.9%,
-99.2539795116,103,2.7%,
-99.1745589674,102,2.6%,
-99.16970986870001,94,2.4%,

Value,Count,Frequency (%),Unnamed: 3
-99.6556653,1,0.0%,
-99.5686536,1,0.0%,
-99.4663993,1,0.0%,
-99.312057,1,0.0%,
-99.288139,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
-99.077809,32,0.8%,
-99.0776668,2,0.1%,
-99.077248,2,0.1%,
-99.0719083,13,0.3%,
-99.0488165,2,0.1%,

0,1
Distinct count,5
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
drop off,2685
rider cancel,1146
not found,20
Other values (2),13

Value,Count,Frequency (%),Unnamed: 3
drop off,2685,69.5%,
rider cancel,1146,29.7%,
not found,20,0.5%,
no show,12,0.3%,
driver cancel,1,0.0%,

First 3 values

Last 3 values

Value,Count,Frequency (%),Unnamed: 3
000e7c243bac43ba945ded4e478d0476,1,0.0%,
003b7af063904ee29655ba466c2b0041,1,0.0%,
003d96fc81b54137adb9098beaa58ddb,1,0.0%,
00569003792349ff8fbab325c9e23785,1,0.0%,
0078d0c73f9849d4b12b7a6ea21869a9,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
ffb32d9f9ce04eff96684b579f3cb63b,1,0.0%,
ffd0efef7309dfea2822bd1fa2c14801,1,0.0%,
ffe9da9a4a0b4200b2ad609b308dea57,1,0.0%,
fff304ce0361454ba398d731209dd750,1,0.0%,
fff6ef585bbb47d6b06c5c1e5beec85c,1,0.0%,

0,1
Distinct count,1418
Unique (%),36.7%
Missing (%),0.5%
Missing (n),20

0,1
0,1083
4000,1064
4500,103
Other values (1414),1594
(Missing),20

Value,Count,Frequency (%),Unnamed: 3
0,1083,28.0%,
4000,1064,27.5%,
4500,103,2.7%,
10000,14,0.4%,
4404,5,0.1%,
4225,5,0.1%,
4573,4,0.1%,
4237,4,0.1%,
4547,4,0.1%,
4217,4,0.1%,

0,1
Distinct count,2172
Unique (%),56.2%
Missing (%),27.2%
Missing (n),1052

0,1
0,141
3453,14
3259,6
Other values (2168),2651
(Missing),1052

Value,Count,Frequency (%),Unnamed: 3
0,141,3.6%,
3453,14,0.4%,
3259,6,0.2%,
4225,5,0.1%,
3335,5,0.1%,
4404,5,0.1%,
3361,5,0.1%,
4547,4,0.1%,
3322,4,0.1%,
4393,4,0.1%,

0,1
Distinct count,586
Unique (%),15.2%
Missing (%),27.2%
Missing (n),1052

0,1
0,2087
5,5
24,5
Other values (582),715
(Missing),1052

Value,Count,Frequency (%),Unnamed: 3
0,2087,54.0%,
5,5,0.1%,
24,5,0.1%,
120,4,0.1%,
20,4,0.1%,
142,4,0.1%,
79,4,0.1%,
4,3,0.1%,
7,3,0.1%,
76,3,0.1%,

0,1
Distinct count,32
Unique (%),0.8%
Missing (%),0.0%
Missing (n),0

0,1
0,3781
800,14
1254,13
Other values (29),56

Value,Count,Frequency (%),Unnamed: 3
0,3781,97.9%,
800,14,0.4%,
1254,13,0.3%,
922,10,0.3%,
3252,6,0.2%,
2511,6,0.2%,
662,4,0.1%,
1477,3,0.1%,
1338,2,0.1%,
805,2,0.1%,

0,1
Distinct count,6
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,8.2896
Minimum,4
Maximum,10
Zeros (%),0.0%

0,1
Minimum,4
5-th percentile,6
Q1,8
Median,9
Q3,9
95-th percentile,10
Maximum,10
Range,6
Interquartile range,1

0,1
Standard deviation,1.2657
Coef of variation,0.15269
Kurtosis,-0.7122
Mean,8.2896
MAD,1.0565
Skewness,-0.52975
Sum,32031
Variance,1.6021
Memory size,30.3 KiB

Value,Count,Frequency (%),Unnamed: 3
9.0,1390,36.0%,
8.0,930,24.1%,
10.0,616,15.9%,
6.0,572,14.8%,
7.0,355,9.2%,
4.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
4.0,1,0.0%,
6.0,572,14.8%,
7.0,355,9.2%,
8.0,930,24.1%,
9.0,1390,36.0%,

Value,Count,Frequency (%),Unnamed: 3
6.0,572,14.8%,
7.0,355,9.2%,
8.0,930,24.1%,
9.0,1390,36.0%,
10.0,616,15.9%,

0,1
Distinct count,1077
Unique (%),27.9%
Missing (%),29.3%
Missing (n),1131

0,1
369,12
317,11
503,10
Other values (1073),2700
(Missing),1131

Value,Count,Frequency (%),Unnamed: 3
369,12,0.3%,
317,11,0.3%,
503,10,0.3%,
406,10,0.3%,
178,9,0.2%,
268,9,0.2%,
190,9,0.2%,
275,9,0.2%,
238,8,0.2%,
245,8,0.2%,

0,1
Distinct count,3
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
iPhone,2631
Android,1218
web,15

Value,Count,Frequency (%),Unnamed: 3
iPhone,2631,68.1%,
Android,1218,31.5%,
web,15,0.4%,

0,1
Distinct count,3861
Unique (%),99.9%
Missing (%),0.0%
Missing (n),0

0,1
2017-10-26 12:45:00,2
2017-08-02 12:45:00,2
2017-10-27 05:07:38,2
Other values (3858),3858

Value,Count,Frequency (%),Unnamed: 3
2017-10-26 12:45:00,2,0.1%,
2017-08-02 12:45:00,2,0.1%,
2017-10-27 05:07:38,2,0.1%,
2017-08-07 12:45:00,1,0.0%,
2017-06-22 01:20:14,1,0.0%,
2017-07-11 13:48:30,1,0.0%,
2017-12-04 19:08:41,1,0.0%,
2017-06-24 02:38:28,1,0.0%,
2017-11-15 11:45:00,1,0.0%,
2017-08-16 02:01:05,1,0.0%,

0,1
Distinct count,1403
Unique (%),36.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,19.412
Minimum,19.291
Maximum,19.514
Zeros (%),0.0%

0,1
Minimum,19.291
5-th percentile,19.357
Q1,19.405
Median,19.423
Q3,19.427
95-th percentile,19.441
Maximum,19.514
Range,0.22303
Interquartile range,0.021293

0,1
Standard deviation,0.028146
Coef of variation,0.0014499
Kurtosis,0.39288
Mean,19.412
MAD,0.02254
Skewness,-0.72531
Sum,75009
Variance,0.00079217
Memory size,30.3 KiB

Value,Count,Frequency (%),Unnamed: 3
19.3769093949,231,6.0%,
19.4230911,170,4.4%,
19.3591670605,151,3.9%,
19.406537533199998,116,3.0%,
19.4199617779,97,2.5%,
19.379485831300002,97,2.5%,
19.3772193538,89,2.3%,
19.3560000407,86,2.2%,
19.4227491,75,1.9%,
19.422973,73,1.9%,

Value,Count,Frequency (%),Unnamed: 3
19.2905129169,1,0.0%,
19.2953111437,1,0.0%,
19.2956057,3,0.1%,
19.2957620323,1,0.0%,
19.3045874758,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
19.4993487045,1,0.0%,
19.502273056,1,0.0%,
19.5023141413,1,0.0%,
19.5135346406,1,0.0%,
19.5135444482,1,0.0%,

0,1
Distinct count,1356
Unique (%),35.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,-99.195
Minimum,-99.288
Maximum,-99.051
Zeros (%),0.0%

0,1
Minimum,-99.288
5-th percentile,-99.274
Q1,-99.208
Median,-99.176
Q3,-99.17
95-th percentile,-99.165
Maximum,-99.051
Range,0.23716
Interquartile range,0.037331

0,1
Standard deviation,0.040832
Coef of variation,-0.00041163
Kurtosis,0.057546
Mean,-99.195
MAD,0.033135
Skewness,-0.65951
Sum,-383290
Variance,0.0016672
Memory size,30.3 KiB

Value,Count,Frequency (%),Unnamed: 3
-99.2549008504,231,6.0%,
-99.1702686,172,4.5%,
-99.2704596743,151,3.9%,
-99.16970986870001,116,3.0%,
-99.17195521299999,97,2.5%,
-99.2539795116,97,2.5%,
-99.25450254229999,89,2.3%,
-99.2752685398,86,2.2%,
-99.1749373,76,2.0%,
-99.170483,73,1.9%,

Value,Count,Frequency (%),Unnamed: 3
-99.2882741,2,0.1%,
-99.288245067,1,0.0%,
-99.2851202,2,0.1%,
-99.279825,1,0.0%,
-99.2792588592,2,0.1%,

Value,Count,Frequency (%),Unnamed: 3
-99.0799467,1,0.0%,
-99.077809,19,0.5%,
-99.077248,2,0.1%,
-99.0719083,2,0.1%,
-99.0511143208,1,0.0%,

0,1
Distinct count,2
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
asap,3615
reserved,249

Value,Count,Frequency (%),Unnamed: 3
asap,3615,93.6%,
reserved,249,6.4%,

0,1
Distinct count,2
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
21620ea5749f2e0679a8c72c7fbafb9e,3642
077866c3fd1a75f51ca7f8eae166ae32,222

Value,Count,Frequency (%),Unnamed: 3
21620ea5749f2e0679a8c72c7fbafb9e,3642,94.3%,
077866c3fd1a75f51ca7f8eae166ae32,222,5.7%,

Unnamed: 0.1,Unnamed: 0,journey_id,vehicle_type_id,start_type,start_at,start_lat,start_lon,end_at,end_lat,end_lon,end_state,price,price_distance,price_duration,distance,duration,source,rider_waiting_time,driver_waiting_time,price_supplements,discount,rating
0,0,93e6d216088af74c32183283d4bb2953,077866c3fd1a75f51ca7f8eae166ae32,asap,2017-12-16 11:08:59,19.431363,-99.195362,2017-12-16 11:37:32,19.406438,-99.17526,drop off,5866,5204,0,7943,1058,Android,590.0,48.0,662,0,9.0
1,1,444b084ec64244919afadfba879876a3,077866c3fd1a75f51ca7f8eae166ae32,asap,2017-12-16 11:06:16,19.431567,-99.195368,2017-12-16 11:07:21,,,rider cancel,0,0,0,0,0,iPhone,,,0,0,9.0
2,2,dbcc57f4e7606a71c74c6e4666a977a4,077866c3fd1a75f51ca7f8eae166ae32,asap,2017-11-24 22:58:37,19.422052,-99.174499,2017-11-24 23:27:05,19.438666,-99.180821,drop off,4000,3552,0,3054,689,Android,923.0,4.0,0,0,10.0
3,3,1ffe9a24033847148c683574985df56c,077866c3fd1a75f51ca7f8eae166ae32,asap,2017-12-16 09:17:08,19.433119,-99.154844,2017-12-16 09:17:30,19.290148,-99.144407,rider cancel,0,0,0,0,0,iPhone,,,0,0,9.0
4,4,fb1ef7d070724482a33be41f27737ea5,077866c3fd1a75f51ca7f8eae166ae32,asap,2017-11-24 18:11:00,19.440996,-99.183581,2017-11-24 18:31:31,19.422815,-99.174865,drop off,4000,2974,12,2859,887,iPhone,329.0,36.0,0,0,8.0


In [12]:
data.columns

Index(['Unnamed: 0', 'journey_id', 'vehicle_type_id', 'start_type', 'start_at',
       'start_lat', 'start_lon', 'end_at', 'end_lat', 'end_lon', 'end_state',
       'price', 'price_distance', 'price_duration', 'distance', 'duration',
       'source', 'rider_waiting_time', 'driver_waiting_time',
       'price_supplements', 'discount', 'rating'],
      dtype='object')

In [None]:
valcount(data, 'vehicle_type_id')

In [26]:
data.loc[data.vehicle_type_id == '21620ea5749f2e0679a8c72c7fbafb9e', 'vehicle_type_id'] = 'A'
data.loc[data.vehicle_type_id == '077866c3fd1a75f51ca7f8eae166ae32', 'vehicle_type_id'] = 'B'
valcount(data, 'vehicle_type_id')

A    3642
B     222
Name: vehicle_type_id, dtype: int64

In [28]:
valcount(data, 'start_type')

asap        3615
reserved     249
Name: start_type, dtype: int64

In [29]:
valcount(data, 'start_type')

asap        3615
reserved     249
Name: start_type, dtype: int64

In [None]:
valcount(data, 'vehicle_type_id')

In [33]:
data.dtypes

Unnamed: 0               int64
journey_id              object
vehicle_type_id         object
start_type              object
start_at                object
start_lat              float64
start_lon              float64
end_at                  object
end_lat                float64
end_lon                float64
end_state               object
price                   object
price_distance          object
price_duration          object
distance                object
duration                object
source                  object
rider_waiting_time      object
driver_waiting_time     object
price_supplements       object
discount                object
rating                 float64
dtype: object

In [5]:
data = data.drop(columns="Unnamed: 0")

In [6]:
data[['distance','driver_waiting_time','rider_waiting_time', 'price','price_duration' , 'price_distance', 'price_supplements']].apply(lambda x: x.fillna(0))

Unnamed: 0,distance,driver_waiting_time,rider_waiting_time,price,price_duration,price_distance,price_supplements
0,7943,48,590,5866,0,5204,662
1,0,0,0,0,0,0,0
2,3054,4,923,4000,0,3552,0
3,0,0,0,0,0,0,0
4,2859,36,329,4000,12,2974,0
5,3176,369,73,4000,290,3346,0
6,0,0,0,0,0,0,0
7,3008,799,770,6449,2082,4367,0
8,0,0,0,0,0,0,0
9,3034,1039,1655,8063,0,4063,0


In [None]:
# convert to object format: 'vehicle_type_id', 'start_type', 'source','end_state'
# convert to geoespatial (num.num): 'start_lat', 'start_lon', ' 'end_lat', 'end_lon', 
# convert to float: 'price', 'price_distance', 'price_duration', 'distance', 'duration', 'price_supplements', 'discount', 'rating', rider_waiting_time', 'driver_waiting_time',

In [7]:
float_var = ['price','price_distance','price_duration','distance','duration','price_supplements','discount','rating','rider_waiting_time','driver_waiting_time']
for var in float_var:
    data_num.apply(lambda x: str(x.replace(',','.'))).astype('float')
    data[var]=data[var].astype(str).astype(float)
#float_var.apply(lambda col: col.astype(str).astype(int))

ValueError: could not convert string to float: '5,866'

In [133]:
data_num = data[['price','price_distance','price_duration','distance','duration','price_supplements','discount','rating','rider_waiting_time','driver_waiting_time']]
data_num.apply(lambda x: str(x.replace(',','.'))).astype('float')
pd.to_numeric(data_num)
data_num.head()

ValueError: could not convert string to float: '0        5,866\n1            0\n2        4,000\n3            0\n4        4,000\n5        4,000\n6            0\n7        6,449\n8            0\n9        8,063\n10           0\n11       6,924\n12       4,000\n13           0\n14           0\n15       9,404\n16           0\n17       5,299\n18       4,000\n19           0\n20           0\n21       4,000\n22       4,000\n23       4,000\n24       6,582\n25           0\n26       4,000\n27       5,438\n28           0\n29           0\n         ...  \n3834         0\n3835     4,000\n3836         0\n3837         0\n3838         0\n3839    17,800\n3840    17,503\n3841    29,387\n3842    10,852\n3843         0\n3844     4,000\n3845         0\n3846         0\n3847    18,610\n3848     4,000\n3849     4,421\n3850     4,000\n3851     4,281\n3852     7,544\n3853    17,681\n3854     4,512\n3855         0\n3856    11,052\n3857    29,892\n3858    11,031\n3859     4,134\n3860     5,752\n3861     4,000\n3862     4,000\n3863     4,000\nName: price, Length: 3864, dtype: object'

In [48]:
data[['start_type']].apply(lambda x: x.astype('category'))

Unnamed: 0,start_type
0,asap
1,asap
2,asap
3,asap
4,asap
5,asap
6,asap
7,asap
8,asap
9,reserved


In [106]:
# we convert currency format: from centavos to pesos
centavos_a_pesos(data,'price')
centavos_a_pesos(data,'price_duration')
centavos_a_pesos(data,'price_distance')
centavos_a_pesos(data,'price_supplements')

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [107]:
second_to_min(data,'distance')
second_to_min(data,'driver_waiting_time')
second_to_min(data,'rider_waiting_time')

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [None]:
split_datetime(data, 'start_at')

In [69]:
split_datetime(data, 'end_at')

Unnamed: 0,end_at,end_at_year,end_at_month,end_at_date,end_at_time
0,2017-12-16 11:37:32,2017,12,2017-12-16,11:37:32
1,2017-12-16 11:07:21,2017,12,2017-12-16,11:07:21
2,2017-11-24 23:27:05,2017,11,2017-11-24,23:27:05
3,2017-12-16 09:17:30,2017,12,2017-12-16,09:17:30
4,2017-11-24 18:31:31,2017,11,2017-11-24,18:31:31
5,2017-11-24 15:53:30,2017,11,2017-11-24,15:53:30
6,2017-11-24 15:10:42,2017,11,2017-11-24,15:10:42
7,2017-11-24 15:47:42,2017,11,2017-11-24,15:47:42
8,2017-11-24 14:32:56,2017,11,2017-11-24,14:32:56
9,2017-11-24 14:36:44,2017,11,2017-11-24,14:36:44


### Data wrangling decisions:

* **relevant KPIs without missing values:** 
- duration --> second to minutes format
- price --> float format
- start_type --> object format
- vehicle_type_id --> 2 types --> rename with 'A'/'B'
- start_lon & start_lat --> numeric format
- start_at & end_at --> from categorical to datetime format 
- source
- rating
- end_state
* **relevant KPIs with missing values:** 
- distance: replace with '0'
- driver_waiting_time & rider_waiting_time: replace with '0' --> from categorical to time format
- end_lat & end_long: keep NaN --> can be canceled or rider not found
- price_duration & price_distance & price supplements: replace with '0'
- discount: drop var --> only 2 values
* ** NON relevant KPIs without missing values:** 'journey_id'

