In [1]:
import pandas as pd
import numpy as np

print("Pandas Version: " + pd.__version__)
print("Numpy Version: " + np.__version__)

Pandas Version: 1.2.4
Numpy Version: 1.18.5


In [2]:
trips = pd.read_csv("../data/clean/trips.csv", parse_dates=["date_from", "date_until"])
trips["time_since_last_checkout"] = pd.to_timedelta(trips["time_since_last_checkout"])
trips.head(3)

Unnamed: 0,bike_id,user_id,date_from,date_until,start_station_name,start_station_id,end_station_name,end_station_id,booked_via,duration_in_min,distance_in_km,speed_in_kmh,time_since_last_checkout
0,119911,341973A96CDE0DF9792F6C844622735AE4216EBF,2014-01-01 00:02:51,2014-01-01 00:20:04,Enckeplatz / Hütten,131887,Königstraße / Struenseestraße,131650,Terminal HH_8 (-2624-),18,2.364129,7.880431,
1,118994,665D79F269FA03F84FC61F3A7F7B078D7392EC0E,2014-01-01 00:07:45,2014-01-01 00:10:48,Isestraße / Hoheluftbrücke,140804,Isestraße / Hoheluftbrücke,140804,Terminal HH_63 (-2241-),4,0.0,0.0,
2,143660,B46B52FDC494E46849DB84BF84F0B99C78358E59,2014-01-01 00:09:55,2014-01-01 00:26:20,Schulterblatt/Eifflerstraße,131648,Schulterblatt/Eifflerstraße,131648,Android SRH,17,0.0,0.0,


In [3]:
user_groups = trips.groupby(["user_id"])

In [4]:
last_moment = trips.date_until.max()

In [5]:
user_trip_dates = trips.groupby(["user_id"], as_index=False).agg({"date_from": [np.min, np.max]})
user_trip_dates.columns = pd.Index(["user_id", "date_from_min", "date_from_max"])
user_trip_dates.set_index("user_id", inplace=True)

In [6]:
trips_count = user_groups.size().to_frame()
trips_count.rename(columns = {0: "trips_count"}, inplace=True)

In [7]:
active_period_in_days = user_groups.apply(
    lambda x: (x.date_from.max() - x.date_from.min()).days
).to_frame()
active_period_in_days.rename(columns = {0: "active_period_in_days"}, inplace=True)

In [8]:
days_since_first_activity = user_groups.apply(
    lambda x: (last_moment - x.date_from.min()).days
).to_frame()
days_since_first_activity.rename(columns = {0: "days_since_first_activity"}, inplace=True)

In [9]:
days_since_last_activity = user_groups.apply(
    lambda x: (last_moment - x.date_from.max()).days
).to_frame()
days_since_last_activity.rename(columns = {0: "days_since_last_activity"}, inplace=True)

In [10]:
mean_duration_in_min = user_groups.apply(
    lambda x: x[x.duration_in_min > 0].duration_in_min.mean() # exclude all <= 0 (round trips)
).to_frame()
mean_duration_in_min.rename(columns = {0: "mean_duration_in_min"}, inplace=True)

In [11]:
mean_distance_in_km = user_groups.apply(
    lambda x: x[x.distance_in_km > 0].distance_in_km.mean() # exclude all <= 0 (round trips)
).to_frame()
mean_distance_in_km.rename(columns = {0: "mean_distance_in_km"}, inplace=True)

In [12]:
mean_speed_in_kmh = user_groups.apply(
    lambda x: x[x.speed_in_kmh > 0].speed_in_kmh.mean()  # exclude all <= 0 (round trips)
).to_frame()
mean_speed_in_kmh.rename(columns = {0: "mean_speed_in_kmh"}, inplace=True)

In [13]:
users = pd.merge(user_trip_dates, trips_count, left_index=True, right_index=True)
users = pd.merge(users, active_period_in_days, left_index=True, right_index=True)
users = pd.merge(users, days_since_first_activity, left_index=True, right_index=True)
users = pd.merge(users, days_since_last_activity, left_index=True, right_index=True)
users = pd.merge(users, mean_duration_in_min, left_index=True, right_index=True)
users = pd.merge(users, mean_distance_in_km, left_index=True, right_index=True)
users = pd.merge(users, mean_speed_in_kmh, left_index=True, right_index=True)

In [14]:
users["trips_per_day"] = users["active_period_in_days"] / users["trips_count"]

In [38]:
# Calculate the mean time between trips in hours
# Do only include downtimes that are longer than 3 minutes so that most double bookings are ignored

mean_time_between_trips = trips[["user_id", "time_since_last_checkout"]].dropna()
mean_time_between_trips = mean_time_between_trips[
    mean_time_between_trips.time_since_last_checkout > pd.Timedelta(3, "minutes")
]
mean_time_between_trips = mean_time_between_trips.groupby("user_id").mean(numeric_only=False)

users = pd.merge(
    users, mean_time_between_trips, left_index=True, right_index=True
).rename(columns={"time_since_last_checkout": "mean_time_between_trips"})

In [39]:
users["trips_per_day"] = users["active_period_in_days"] / users["trips_count"]

In [40]:
users.sample(5)

Unnamed: 0_level_0,date_from_min,date_from_max,trips_count,active_period_in_days,days_since_first_activity,days_since_last_activity,mean_duration_in_min,mean_distance_in_km,mean_speed_in_kmh,trips_per_day,mean_time_between_trips
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
D43075764E2F330F244946696D5CD9646EB23885,2015-08-10 11:56:59,2016-09-09 20:52:16,3,396,644,248,22.333333,2.751248,6.910047,132.0,396 days 08:54:19
F3AB12A148C00EFAE166825CB75EDB39D8C440FA,2014-08-25 09:33:40,2016-09-22 17:51:57,38,759,994,235,18.473684,2.283686,7.428907,19.973684,21 days 02:13:49.833333333
923011A158A810DD3F12CA3E5B720DA18E35244A,2014-02-24 14:39:50,2014-07-28 18:42:43,6,154,1176,1022,13.333333,1.545742,6.88648,25.666667,30 days 20:00:34.600000
EC2FA8D87580223E66EA05B3D639016102606F24,2016-06-08 15:00:19,2016-06-09 16:58:15,4,1,341,340,20.25,2.131582,6.248904,0.25,0 days 08:39:18.666666666
8BCBFA6EA5C321D04D0DF9301A38D5D09FF028C1,2014-01-01 16:34:32,2017-04-27 10:02:07,43,1211,1230,18,17.395349,2.322784,8.207407,28.162791,29 days 13:18:12.219512195


In [41]:
users.to_csv("../data/clean/users.csv", index=True)