In [1]:
import pandas as pd
import numpy as np

# Report the library versions this notebook was executed with.
print(f"Pandas Version: {pd.__version__}")
print(f"Numpy Version: {np.__version__}")

Pandas Version: 1.2.4
Numpy Version: 1.18.5


In [16]:
# Load the cleaned trips table and parse both trip timestamps as datetimes.
# The recorded output of this cell shows an "Unnamed: 0" column, i.e. the CSV's
# first column has no header (presumably a saved row index). Reading it as the
# index keeps it out of the data columns.
trips = pd.read_csv(
    "../data/clean/trips.csv",
    index_col=0,
    parse_dates=["date_from", "date_until"],
)
trips.head(3)

Unnamed: 0,bike_id,user_id,date_from,date_until,start_station_name,start_station_id,end_station_name,end_station_id,booked_via,duration_in_min,distance_in_km,speed_in_kmh
0,143517,A821059B555C7764A2FF801180874A2FCB326222,2014-01-01 00:34:54,2014-01-01 00:50:14,U-Bahn Baumwall,214170,Mönckebergstraße / Rosenstraße,131880,iPhone SRH,16,1.293661,4.851229
1,119830,1EBC930DB407ACEAE2FDE23A6CA40492EA3DFBB2,2014-01-01 01:39:55,2014-01-01 01:57:27,Bahnhof Altona Ost/Max-Brauer-Allee,131646,Schulterblatt/Eifflerstraße,131648,Android SRH,18,2.032271,6.774236
2,143501,7AD2C1B70137479062A6DD73815835986677BB2D,2014-01-01 01:40:20,2014-01-01 01:53:09,Weidestraße/Biedermannplatz,211922,Jarrestraße / Rambatzweg,138376,Techniker HH_119 (-2334-),13,0.954178,4.403899


In [3]:
# Group all trips by user; the per-user feature cells below reuse this grouping.
user_groups = trips.groupby(by=["user_id"])

In [4]:
# Latest trip end in the data set; the "days since ..." features are measured
# relative to this timestamp.
last_moment = trips["date_until"].max()

In [5]:
# Earliest and latest trip start per user, indexed by user_id.
user_trip_dates = trips.groupby("user_id").agg(
    date_from_min=("date_from", "min"),
    date_from_max=("date_from", "max"),
)

In [6]:
# Number of trips per user, as a one-column frame indexed by user_id.
trips_count = user_groups.size().to_frame(name="trips_count")

In [7]:
# Whole days between each user's first and last trip start.
# Computed with the built-in grouped min/max instead of a Python lambda per
# user, and the column is named directly rather than renaming an implicit 0.
active_period_in_days = (
    (user_groups["date_from"].max() - user_groups["date_from"].min())
    .dt.days
    .to_frame(name="active_period_in_days")
)

In [8]:
# Whole days between each user's first trip start and the last moment in the data.
# Computed with the built-in grouped min instead of a Python lambda per user,
# and the column is named directly rather than renaming an implicit 0.
days_since_first_activity = (
    (last_moment - user_groups["date_from"].min())
    .dt.days
    .to_frame(name="days_since_first_activity")
)

In [9]:
# Whole days between each user's most recent trip start and the last moment in the data.
# Computed with the built-in grouped max instead of a Python lambda per user,
# and the column is named directly rather than renaming an implicit 0.
days_since_last_activity = (
    (last_moment - user_groups["date_from"].max())
    .dt.days
    .to_frame(name="days_since_last_activity")
)

In [10]:
# Mean trip duration (minutes) per user.
# Uses the built-in grouped mean instead of a Python lambda per user, and the
# column is named directly rather than renaming an implicit 0.
mean_duration_in_min = (
    user_groups["duration_in_min"].mean().to_frame(name="mean_duration_in_min")
)

In [11]:
# Mean trip distance (km) per user.
# Uses the built-in grouped mean instead of a Python lambda per user, and the
# column is named directly rather than renaming an implicit 0.
mean_distance_in_km = (
    user_groups["distance_in_km"].mean().to_frame(name="mean_distance_in_km")
)

In [12]:
# Mean trip speed (km/h) per user.
# Uses the built-in grouped mean instead of a Python lambda per user, and the
# column is named directly rather than renaming an implicit 0.
mean_speed_in_kmh = (
    user_groups["speed_in_kmh"].mean().to_frame(name="mean_speed_in_kmh")
)

In [13]:
# Assemble the per-user feature table by merging every feature frame on the
# shared user_id index, one after another (default inner join).
users = user_trip_dates
for feature_frame in (
    trips_count,
    active_period_in_days,
    days_since_first_activity,
    days_since_last_activity,
    mean_duration_in_min,
    mean_distance_in_km,
    mean_speed_in_kmh,
):
    users = pd.merge(users, feature_frame, left_index=True, right_index=True)

In [None]:
# Calculate downtime between two consecutive trips of a user

def time_since_last_checkout(user_group):
    """Return one user's trips with the gap to that user's previous trip start.

    Parameters
    ----------
    user_group : pandas.DataFrame
        Trips of a single user, expected to be ordered by ``date_from``.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added ``time_since_last_checkout`` column holding
        ``date_from.diff()``; the first row of the group is therefore missing.
        The input frame is not modified (mutating the group object handed to
        ``groupby.apply`` is discouraged by pandas).
    """
    return user_group.assign(time_since_last_checkout=user_group.date_from.diff())

# group_keys=False keeps the original row index on every pandas version;
# otherwise newer versions prepend user_id as an index level, which would make
# "user_id" ambiguous (index level and column) in later groupby calls.
trips = (
    trips.sort_values(by="date_from", ascending=True)
    .groupby("user_id", group_keys=False)
    .apply(time_since_last_checkout)
)

In [14]:
# Mean time between consecutive trips per user.
# Only downtimes strictly longer than 3 minutes are included.
# NOTE(review): an earlier comment said "in hours", but no conversion to hours
# happens here; the mean is taken over the raw date_from differences.

mean_time_between_trips = (
    trips[["user_id", "time_since_last_checkout"]]
    .dropna()
    .loc[lambda gaps: gaps["time_since_last_checkout"] > pd.Timedelta(3, "minutes")]
    .groupby("user_id")
    .mean(numeric_only=False)
)

# NOTE(review): this is the default inner join, so users without any downtime
# above the cut-off are dropped from `users` here. Confirm that is intended.
users = users.merge(
    mean_time_between_trips, left_index=True, right_index=True
).rename(columns={"time_since_last_checkout": "mean_time_between_trips"})

KeyError: "['time_since_last_checkout'] not in index"

In [None]:
# Summary statistics of the per-user mean time between trips.
mean_time_between_trips.describe()

In [None]:
# NOTE(review): despite the column name, this ratio is active days divided by
# the number of trips (i.e. days per trip), not trips per day. Confirm intent
# before changing it: label_user_type compares this column to its threshold.
users["trips_per_day"] = users["active_period_in_days"].div(users["trips_count"])

In [None]:
# Label users as 'casual' or 'regular' based on their activity pattern.

REGULAR_USER_ACTIVE_DAYS_MIN = 28
REGULAR_USER_THRESHOLD = 4

def label_user_type(user):
    """Classify a single user record as "regular" or "casual".

    Parameters
    ----------
    user : object with ``trips_per_day`` and ``active_period_in_days`` attributes
        Typically one row of the ``users`` frame.

    Returns
    -------
    str
        "regular" when ``trips_per_day`` is at most REGULAR_USER_THRESHOLD and
        ``active_period_in_days`` is at least REGULAR_USER_ACTIVE_DAYS_MIN,
        otherwise "casual".
    """
    within_threshold = user.trips_per_day <= REGULAR_USER_THRESHOLD
    active_long_enough = user.active_period_in_days >= REGULAR_USER_ACTIVE_DAYS_MIN
    return "regular" if (within_threshold and active_long_enough) else "casual"

# Run the labelling function once per user row and store the result as "type".
users["type"] = users.apply(label_user_type, axis="columns")

In [None]:
# Preview a few users. The fixed seed makes the preview reproducible across
# runs, and capping the size avoids a ValueError when fewer than five users exist.
users.sample(n=min(5, len(users)), random_state=0)

In [None]:
# Persist the per-user feature table; index=True writes the frame's index as
# the first CSV column.
users.to_csv("../data/clean/users.csv", index=True)