In [1]:
import pandas as pd
import numpy as np

# Record the library versions this notebook was executed with.
print(f"Pandas Version: {pd.__version__}")
print(f"Numpy Version: {np.__version__}")

Pandas Version: 1.2.0
Numpy Version: 1.18.5


In [2]:
# Load the cleaned trips table; both timestamp columns are parsed as datetimes
# so that min/max and differences below operate on real timestamps.
trips = pd.read_csv(
    "../data/clean/trips.csv",
    parse_dates=["date_from", "date_until"],
)
trips.head(3)

Unnamed: 0,bike_id,user_id,date_from,date_until,start_station_name,start_station_id,end_station_name,end_station_id,booked_via,duration_in_min,distance_in_km,speed_in_kmh
0,143517,A821059B555C7764A2FF801180874A2FCB326222,2014-01-01 00:34:54,2014-01-01 00:50:14,U-Bahn Baumwall,214170,Mönckebergstraße / Rosenstraße,131880,iPhone SRH,16,1.293661,4.851229
1,119830,1EBC930DB407ACEAE2FDE23A6CA40492EA3DFBB2,2014-01-01 01:39:55,2014-01-01 01:57:27,Bahnhof Altona Ost/Max-Brauer-Allee,131646,Schulterblatt/Eifflerstraße,131648,Android SRH,18,2.032271,6.774236
2,143501,7AD2C1B70137479062A6DD73815835986677BB2D,2014-01-01 01:40:20,2014-01-01 01:53:09,Weidestraße/Biedermannplatz,211922,Jarrestraße / Rambatzweg,138376,Techniker HH_119 (-2334-),13,0.954178,4.403899


In [8]:
# One group of trips per user; reused by the per-user feature cells.
user_groups = trips.groupby(by=["user_id"])

In [14]:
# Latest trip end in the data set; used as the reference point for the
# "days since ..." features.
last_moment = trips["date_until"].max()

In [15]:
# First and last trip start per user, indexed by user_id.
# Named aggregation produces flat, explicitly named columns in one step, so
# there is no MultiIndex to relabel by hand and no in-place re-indexing, and
# the string aggregators avoid passing NumPy callables to agg().
user_trip_dates = trips.groupby("user_id").agg(
    date_from_min=("date_from", "min"),
    date_from_max=("date_from", "max"),
)

In [16]:
# Number of trips per user as a one-column frame indexed by user_id.
trips_count = user_groups.size().to_frame(name="trips_count")

In [19]:
# Whole days between each user's first and last trip start.
# The built-in groupby max/min run once over all groups instead of calling a
# Python lambda per user; .dt.days takes the whole-day component of each
# difference, as Timedelta.days did in the per-group version.
active_period_in_days = (
    (user_groups["date_from"].max() - user_groups["date_from"].min())
    .dt.days
    .to_frame(name="active_period_in_days")
)

In [20]:
# Whole days from each user's first trip start to the latest trip end in the
# data set (last_moment).
# The built-in groupby min replaces a per-user Python lambda; subtracting the
# resulting Series from the scalar timestamp is vectorised.
days_since_first_activity = (
    (last_moment - user_groups["date_from"].min())
    .dt.days
    .to_frame(name="days_since_first_activity")
)

In [21]:
# Whole days from each user's most recent trip start to the latest trip end in
# the data set (last_moment).
# The built-in groupby max replaces a per-user Python lambda; subtracting the
# resulting Series from the scalar timestamp is vectorised.
days_since_last_activity = (
    (last_moment - user_groups["date_from"].max())
    .dt.days
    .to_frame(name="days_since_last_activity")
)

In [22]:
# Average trip duration per user, as a one-column frame indexed by user_id.
# Uses the built-in groupby mean rather than a per-user Python lambda.
mean_duration_in_min = (
    user_groups["duration_in_min"].mean().to_frame(name="mean_duration_in_min")
)

In [23]:
# Average trip distance per user, as a one-column frame indexed by user_id.
# Uses the built-in groupby mean rather than a per-user Python lambda.
mean_distance_in_km = (
    user_groups["distance_in_km"].mean().to_frame(name="mean_distance_in_km")
)

In [24]:
# Average trip speed per user, as a one-column frame indexed by user_id.
# Uses the built-in groupby mean rather than a per-user Python lambda.
mean_speed_in_kmh = (
    user_groups["speed_in_kmh"].mean().to_frame(name="mean_speed_in_kmh")
)

In [25]:
def calculate_mean_time_between_trips_in_hours(group):
    """Return the mean gap, in hours, between consecutive trip starts.

    Parameters
    ----------
    group : pandas.DataFrame
        Trips to evaluate (typically those of a single user). Must contain a
        datetime ``date_from`` column; row order does not matter.

    Returns
    -------
    float
        Average of the ``n - 1`` gaps between the ``n`` chronologically
        ordered ``date_from`` values, expressed in hours. ``NaN`` when the
        group holds fewer than two trips, because no gap exists.
    """
    # diff() yields NaT for the earliest trip; mean() skips it, so the average
    # runs over the n - 1 real gaps rather than over all n trips.
    gaps = group["date_from"].sort_values().diff()
    # total_seconds() keeps the whole-day part of each gap; Timedelta.seconds
    # only carries the sub-day remainder and would under-count long gaps.
    return gaps.dt.total_seconds().mean() / 3600

# TODO can this be faster?
#mean_time_between_trips_in_hours = user_groups.apply(calculate_mean_time_between_trips_in_hours).to_frame()
#mean_time_between_trips_in_hours.rename(columns = {0: "mean_time_between_trips_in_hours"}, inplace=True)

In [26]:
# Assemble the per-user feature table. Every input frame is indexed by
# user_id, so each step is an index-on-index merge (inner join by default).
users = (
    user_trip_dates
    .merge(trips_count, left_index=True, right_index=True)
    .merge(active_period_in_days, left_index=True, right_index=True)
    .merge(days_since_first_activity, left_index=True, right_index=True)
    .merge(days_since_last_activity, left_index=True, right_index=True)
    .merge(mean_duration_in_min, left_index=True, right_index=True)
    .merge(mean_distance_in_km, left_index=True, right_index=True)
    .merge(mean_speed_in_kmh, left_index=True, right_index=True)
)
# mean_time_between_trips_in_hours is not merged here because the statements
# that would compute it are commented out.
users

Unnamed: 0_level_0,date_from_min,date_from_max,trips_count,active_period_in_days,days_since_first_activity,days_since_last_activity,mean_duration_in_min,mean_distance_in_km,mean_speed_in_kmh
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0000020B772C8D71F4F9AE4835D6C9F032FDBE21,2014-08-03 00:43:54,2017-05-12 19:42:34,66,1013,1017,3,12.393939,1.659699,8.258637
000004354587D5BAC99A2171DE2F4ACD30AA8635,2014-03-10 08:42:55,2016-11-17 12:11:25,8,983,1162,179,15.625000,0.953377,3.693612
00005721E978458AD348C24219138429BC84574A,2016-06-19 21:02:32,2017-05-15 16:05:15,99,329,330,0,10.252525,1.439436,8.760295
0000746235E82AD76B690086FFEB6D55A32C61C8,2014-06-25 21:40:19,2017-05-15 15:46:03,10,1054,1055,0,14.500000,1.792505,7.695143
000080D6DF2166894262ADFBADD581AB6A20E481,2015-03-08 14:10:26,2015-03-08 14:54:02,3,0,799,799,22.333333,1.047813,2.977546
...,...,...,...,...,...,...,...,...,...
FFFEEF2473DD8F9E327D8054332B3D127C835CCD,2014-03-03 20:22:26,2017-05-15 17:13:59,103,1168,1169,0,15.407767,2.607010,9.976492
FFFF0AF80E4B20974E4F0FEDB0C1AC36BC145469,2014-06-22 01:35:38,2014-08-02 00:03:10,2,40,1059,1018,21.000000,3.077671,9.323773
FFFF6F2B370F6D7BFB02D4C05B333622358735A2,2014-04-24 14:10:05,2014-04-30 18:37:55,4,6,1117,1111,17.250000,2.866037,9.992602
FFFFB52F01C697F11E9245BFC790A803EC1F9156,2016-09-13 18:09:19,2017-05-04 18:44:01,28,233,244,11,16.000000,1.410046,5.553044


In [27]:
# Persist the per-user features; the index is written so user_id is kept.
users.to_csv(path_or_buf="../data/clean/users.csv", index=True)