In [104]:
import pandas as pd
import numpy as np

# Record the library versions this notebook was executed with.
print(f"Pandas Version: {pd.__version__}")
print(f"Numpy Version: {np.__version__}")

Pandas Version: 1.2.4
Numpy Version: 1.18.5


In [105]:
# Load the cleaned trips table; both timestamp columns are parsed as datetimes
# because the per-user features below subtract and compare them.
trips = pd.read_csv(
    "../data/clean/trips.csv",
    parse_dates=["date_from", "date_until"],
)
trips.head(3)

Unnamed: 0,bike_id,user_id,date_from,date_until,start_station_name,start_station_id,end_station_name,end_station_id,booked_via,duration_in_min,distance_in_km,speed_in_kmh
0,143517,A821059B555C7764A2FF801180874A2FCB326222,2014-01-01 00:34:54,2014-01-01 00:50:14,U-Bahn Baumwall,214170,Mönckebergstraße / Rosenstraße,131880,iPhone SRH,16,1.293661,4.851229
1,119830,1EBC930DB407ACEAE2FDE23A6CA40492EA3DFBB2,2014-01-01 01:39:55,2014-01-01 01:57:27,Bahnhof Altona Ost/Max-Brauer-Allee,131646,Schulterblatt/Eifflerstraße,131648,Android SRH,18,2.032271,6.774236
2,143501,7AD2C1B70137479062A6DD73815835986677BB2D,2014-01-01 01:40:20,2014-01-01 01:53:09,Weidestraße/Biedermannplatz,211922,Jarrestraße / Rambatzweg,138376,Techniker HH_119 (-2334-),13,0.954178,4.403899


In [106]:
# One group of trips per user; reused by the per-user feature cells below.
user_groups = trips.groupby(by=["user_id"])

In [107]:
# Latest trip end time in the data; used as the reference point for the
# "days since ..." features.
last_moment = trips["date_until"].max()

In [108]:
# First and last checkout time per user, indexed by user_id.
# Named aggregation sets the output column names directly, so there is no
# positional relabelling of a MultiIndex and no in-place set_index. The string
# aggregators "min"/"max" replace the np.min/np.max callables, which newer
# pandas versions warn about when passed to agg.
user_trip_dates = trips.groupby(["user_id"]).agg(
    date_from_min=("date_from", "min"),
    date_from_max=("date_from", "max"),
)

In [109]:
# Number of trips per user as a one-column frame indexed by user_id.
trips_count = user_groups.size().to_frame(name="trips_count")

In [110]:
# Whole days between each user's first and last checkout.
# Built-in groupby max/min replace the per-user Python lambda passed to apply;
# `.dt.days` is the vectorised counterpart of `Timedelta.days`.
active_period_in_days = (
    (user_groups["date_from"].max() - user_groups["date_from"].min())
    .dt.days
    .to_frame(name="active_period_in_days")
)

In [111]:
# Whole days from each user's first checkout to `last_moment`.
# A built-in groupby min replaces the per-user Python lambda passed to apply.
days_since_first_activity = (
    (last_moment - user_groups["date_from"].min())
    .dt.days
    .to_frame(name="days_since_first_activity")
)

In [112]:
# Whole days from each user's most recent checkout to `last_moment`.
# A built-in groupby max replaces the per-user Python lambda passed to apply.
days_since_last_activity = (
    (last_moment - user_groups["date_from"].max())
    .dt.days
    .to_frame(name="days_since_last_activity")
)

In [113]:
# Mean trip duration per user, from the duration_in_min column.
# The built-in groupby mean replaces the per-user Python lambda passed to apply.
mean_duration_in_min = (
    user_groups["duration_in_min"].mean().to_frame(name="mean_duration_in_min")
)

In [114]:
# Mean trip distance per user, from the distance_in_km column.
# The built-in groupby mean replaces the per-user Python lambda passed to apply.
mean_distance_in_km = (
    user_groups["distance_in_km"].mean().to_frame(name="mean_distance_in_km")
)

In [115]:
# Mean trip speed per user, from the speed_in_kmh column.
# The built-in groupby mean replaces the per-user Python lambda passed to apply.
mean_speed_in_kmh = (
    user_groups["speed_in_kmh"].mean().to_frame(name="mean_speed_in_kmh")
)

In [163]:
# Assemble the per-user feature table by joining every feature frame on its
# user_id index, one merge per feature, in the same order as before.
users = (
    user_trip_dates
    .merge(trips_count, left_index=True, right_index=True)
    .merge(active_period_in_days, left_index=True, right_index=True)
    .merge(days_since_first_activity, left_index=True, right_index=True)
    .merge(days_since_last_activity, left_index=True, right_index=True)
    .merge(mean_duration_in_min, left_index=True, right_index=True)
    .merge(mean_distance_in_km, left_index=True, right_index=True)
    .merge(mean_speed_in_kmh, left_index=True, right_index=True)
)

In [164]:
# Calculating the time since the last checkout of a user for each trip

def time_since_last_checkout(user_group):
    """Add the gap to the previous checkout to one user's trips.

    A ``time_since_last_checkout`` column is written onto ``user_group`` in
    place, holding the difference between each row's ``date_from`` and the
    ``date_from`` of the row before it (the first row gets a missing value).
    Rows are compared in the order given, so the caller must sort by
    ``date_from`` first.

    Returns the same ``user_group`` object that was passed in.
    """
    checkout_gaps = user_group["date_from"].diff()
    user_group["time_since_last_checkout"] = checkout_gaps
    return user_group

# Sort chronologically, then take the per-user difference between consecutive
# checkouts. This is the vectorised equivalent of applying
# `time_since_last_checkout` to every user group. It keeps the original row
# index and never mutates groups inside `groupby.apply`, so the result no
# longer depends on how a given pandas version infers "transform-like" apply
# results (newer versions can prepend the group key to the index).
trips = trips.sort_values(by="date_from", ascending=True)
trips["time_since_last_checkout"] = trips.groupby("user_id")["date_from"].diff()
trips.sample(5)

Unnamed: 0,bike_id,user_id,date_from,date_until,start_station_name,start_station_id,end_station_name,end_station_id,booked_via,duration_in_min,distance_in_km,speed_in_kmh,time_since_last_checkout
5038305,109042,C839C7C737F29856A6260CAACAE98CDF88312E3E,2016-05-24 15:12:16,2016-05-24 15:32:06,Meßberg / Willy-Brandt-Straße,131877,Königstraße / Struenseestraße,131650,iPhone SRH,20,3.736908,11.210723,0 days 05:54:44
6295880,106981,07EE6D753BA9CF5B10D297DB9A9C9B0711AF45AF,2016-10-02 23:48:16,2016-10-02 23:54:59,Sophienallee / Sandweg,243617,Langenfelder Damm/Müggenkampstraße,252485,Terminal HH_171 (-2136-),7,1.435628,12.30538,86 days 20:29:37
7376640,119876,74EFA5D7B32445126219815EC6E3336FFD3109DB,2017-04-15 18:01:20,2017-04-15 18:09:44,Gänsemarkt / Büschstraße,131884,Amsinckstraße/ Nordkanalbrücke,218174,Android SRH,9,1.748007,11.653379,0 days 19:50:31
6578255,108982,92DD068B4BD4944A5DA13D8DC9BE453DBAD6D6C7,2016-11-16 20:35:05,2016-11-16 20:54:06,Lange Reihe / Kirchenallee,138384,Innocentiapark/Oberstraße,131640,iPhone SRH,20,3.001597,9.004792,0 days 00:00:14
5683482,120682,D90CAC11D0512970B2AD1ADA8A26DD3B73B5C92E,2016-07-30 23:08:03,2016-07-30 23:26:00,Sternschanze / Eingang Dänenweg,139501,Emil-Andresen-Straße / Lohkoppelweg,243619,Android SRH,18,3.383133,11.277109,0 days 12:11:28


In [165]:
# Calculating each user's mean time between consecutive trips.
# The result is a Timedelta per user (not a number of hours).

# Gaps at or below this cut-off are left out of the mean.
# NOTE(review): presumably such short gaps are re-rentals rather than separate
# trips -- confirm the intended threshold.
MIN_TIME_BETWEEN_TRIPS = pd.Timedelta(3, "minutes")

mean_time_between_trips = (
    trips[["user_id", "time_since_last_checkout"]]
    .dropna()
    .loc[lambda gaps: gaps.time_since_last_checkout > MIN_TIME_BETWEEN_TRIPS]
    .groupby("user_id")
    .mean(numeric_only=False)
)

# Dropping any earlier copy of the feature keeps this cell re-runnable; without
# it a second run would leave two columns labelled "mean_time_between_trips".
# NOTE(review): the inner join (the previous implicit default, now explicit)
# removes every user that has no gap above the cut-off, e.g. users with a
# single trip. Switch to how="left" if those users should stay in `users`.
users = users.drop(columns="mean_time_between_trips", errors="ignore").merge(
    mean_time_between_trips.rename(
        columns={"time_since_last_checkout": "mean_time_between_trips"}
    ),
    how="inner",
    left_index=True,
    right_index=True,
)

In [166]:
# Summary statistics of the per-user mean time between trips computed above.
mean_time_between_trips.describe()

Unnamed: 0,time_since_last_checkout
count,192206
mean,43 days 20:03:54.292798739
std,83 days 01:30:23.224960791
min,0 days 00:03:01
25%,4 days 03:54:00.384615384
50%,17 days 00:44:17.947619047
75%,47 days 00:11:53.083333333
max,1220 days 01:31:06


5.0

In [167]:
# Inspect trips that started more than two but less than three minutes after
# the same user's previous checkout (both bounds exclusive).
trips.loc[
    lambda t: t.time_since_last_checkout.gt(pd.Timedelta(2, "minutes"))
    & t.time_since_last_checkout.lt(pd.Timedelta(3, "minutes"))
]

Unnamed: 0,bike_id,user_id,date_from,date_until,start_station_name,start_station_id,end_station_name,end_station_id,booked_via,duration_in_min,distance_in_km,speed_in_kmh,time_since_last_checkout
580,120167,929B8036595B2C92ECCB515D3E7B11829F1EB1B9,2014-01-01 01:33:50,2014-01-01 01:45:27,Lohmühlenstraße / Steindamm,140791,Wandsbeker Chaussee/Ritterstraße,204030,Terminal HH_4 (-2646-),12,1.968253,9.841267,0 days 00:02:02
1028,120286,1196A6C0217C5A151F7616519DE4CDC0414A3E60,2014-01-01 01:50:03,2014-01-01 02:09:02,Kümmellstraße / Robert-Koch-Straße,140794,Allende-Platz/Grindelhof,198077,IVR,19,2.506223,7.914389,0 days 00:02:59
472,119977,DEE63A21F241C42CDFF7021C59114E5474F47130,2014-01-01 02:51:24,2014-01-01 03:09:21,Bahnhof Dammtor Süd / Marseiller Straße,138382,Eimsbütteler Straße/Waterloostraße,131644,iPhone SRH,18,2.491760,8.305867,0 days 00:02:05
176,117568,5E1F90EA9C7DC1BF48F0F52813164C6C1B08352F,2014-01-01 02:55:56,2014-01-01 03:11:20,Schulterblatt/Eifflerstraße,131648,Millerntorplatz/St.Pauli,131905,Terminal HH_5 (-2132-),16,1.442698,5.410117,0 days 00:02:36
257,119836,18DCDD921D3D1082AB4DC92F93B76E135459C47A,2014-01-01 03:08:23,2014-01-01 03:24:25,Alsenstraße/Düppelstraße,211706,Fischersallee/Bleickenallee,211711,iPhone SRH,17,2.189323,7.727022,0 days 00:02:44
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7603679,143579,CE63C4DE4B79D5F71968CC16C312F6AB1FF6223F,2017-05-15 20:33:36,2017-05-15 20:40:21,Ottenser Marktplatz/Museumsstraße,131645,Neumühlen/Övelgönne,213856,iPhone SRH,7,1.392231,11.933410,0 days 00:02:09
7604137,143694,FD69D4563EC9EE036F637DB76D4EE656617288C3,2017-05-15 21:05:02,2017-05-15 21:12:44,Thadenstraße/Holstenstraße,252492,Bahnhof Altona Ost/Max-Brauer-Allee,131646,iPhone SRH,8,1.058138,7.936036,0 days 00:02:53
7605914,143772,29DF9BC7C5582F28A33B959778D3532738593728,2017-05-15 21:36:41,2017-05-15 21:49:46,Thadenstraße/Holstenstraße,252492,St.Petersburger Straße/Bei den Kirchhöfen,213235,Unknown,14,2.108987,9.038516,0 days 00:02:07
7604838,143736,EB321D3A4F96305BDB4235B46B0BC87F7B420DBA,2017-05-15 23:58:25,2017-05-16 00:11:50,Hauptbahnhof Ost / Hachmannplatz,131873,Feldstraße / Marktstraße,244093,Terminal HH_34 (-2541-),14,2.573083,11.027499,0 days 00:02:06


In [168]:
# Labelling users as 'casual' or 'regular' based on activity patterns.

# A user counts as "regular" only if their active period is at least this many days ...
REGULAR_USER_ACTIVE_DAYS_MIN = 28
# ... and they average at most this many active-period days per trip.
REGULAR_USER_THRESHOLD = 4


def label_user_type(user):
    """Classify one user row as ``"regular"`` or ``"casual"``.

    ``user`` must expose ``active_period_in_days`` and ``trips_count`` as
    attributes (for example a row of the ``users`` frame).

    Returns ``"regular"`` when the active period divided by the trip count is
    at most ``REGULAR_USER_THRESHOLD`` and the active period is at least
    ``REGULAR_USER_ACTIVE_DAYS_MIN`` days; otherwise ``"casual"``.
    """
    days_per_trip = user.active_period_in_days / user.trips_count
    rides_often_enough = days_per_trip <= REGULAR_USER_THRESHOLD
    active_long_enough = user.active_period_in_days >= REGULAR_USER_ACTIVE_DAYS_MIN
    return "regular" if (rides_often_enough & active_long_enough) else "casual"

# Label every user row-wise and store the result in the "type" column.
users = users.assign(type=users.apply(label_user_type, axis=1))

In [169]:
# Preview five randomly chosen rows of the per-user table (no seed is set, so
# the rows differ between runs).
users.sample(5)

Unnamed: 0_level_0,date_from_min,date_from_max,trips_count,active_period_in_days,days_since_first_activity,days_since_last_activity,mean_duration_in_min,mean_distance_in_km,mean_speed_in_kmh,mean_time_between_trips,type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
23EE85EE0C9A591E71110DA6599BD0C4D655211E,2014-04-11 20:47:41,2015-06-26 07:17:48,9,440,1130,690,13.222222,1.994505,8.917648,55 days 01:18:45.875000,casual
57E1553A31C740D1E3A32692232799EA6389CD62,2014-03-14 11:50:27,2014-06-04 00:30:03,4,81,1158,1077,10.25,2.154657,12.900318,27 days 04:13:12,casual
E291E84431AACD8E38D85FD02A6ABD26CF4F17E8,2014-04-05 12:01:59,2016-02-23 13:11:56,9,689,1136,447,7.888889,0.944375,7.401147,86 days 03:08:44.625000,casual
69155EA866BD332DF9093DB3B6CE29A70D97541E,2016-09-14 22:34:11,2017-05-12 13:56:47,131,239,243,3,7.48855,1.019565,8.509161,1 days 20:55:57.257812500,regular
F898DBC7AE31C30CADA3E5D83F4AAF1A3A9D9A12,2014-05-14 12:13:43,2014-08-25 23:49:50,13,103,1097,994,12.769231,2.269519,11.053081,8 days 14:58:00.583333333,casual


In [170]:
# Write the per-user table to CSV; index=True keeps the user_id index as the
# first column of the file.
users.to_csv("../data/clean/users.csv", index=True)