In [None]:
from datetime import datetime, timedelta
import os
from pathlib import Path
import pandas as pd


def duration_str(duration: timedelta) -> str:
    """
    Format a duration as a zero-padded hours:minutes string e.g. 01:30

    The value is computed arithmetically from the duration's total seconds
    rather than by treating it as an epoch timestamp, so the result does not
    depend on the machine's local timezone.

    Args:
        duration: the length of time to format. Anything exposing
            ``total_seconds()`` works (``datetime.timedelta``, ``pandas.Timedelta``).

    Returns:
        The duration as ``HH:MM``. Leftover seconds are truncated, not rounded.
        Durations of 24 hours or more keep counting hours (e.g. ``25:00``)
        instead of wrapping around, and negative durations get a leading ``-``.
    """
    # Round to whole microseconds first so sub-microsecond float noise
    # (e.g. 359.999999999s for a nominal 6 minutes) cannot drop a minute.
    microseconds = round(duration.total_seconds() * 1_000_000)
    total_minutes = abs(microseconds) // 60_000_000
    hours, minutes = divmod(total_minutes, 60)
    # Only show a sign when there is something non-zero to negate (avoid "-00:00").
    sign = "-" if microseconds < 0 and total_minutes else ""
    return f"{sign}{hours:02d}:{minutes:02d}"


# Directory holding the data files
DATA_DIR = Path("./data")
# CSV file to read; the HARVEST_DATA environment variable overrides the default sample path
DATA_SOURCE = Path(os.getenv("HARVEST_DATA", "./data/harvest-sample.csv"))

In [None]:
# repeating text columns are read as categoricals for efficiency
dtypes = dict.fromkeys(["Client", "Project", "First Name", "Last Name"], "category")
# the only columns needed for Toggl; everything else in the file is skipped
cols = [*dtypes, "Date", "Notes", "Hours"]
# load the CSV, parsing the Date column as dates
source = pd.read_csv(
    DATA_SOURCE,
    usecols=cols,
    dtype=dtypes,
    parse_dates=["Date"],
    cache_dates=True,
)
source.dtypes

In [None]:
# rename columns that can be imported as-is
# (reassign the result instead of using inplace=True, which pandas discourages
# and which offers no performance benefit)
source = source.rename(
    columns={"Project": "Task", "Notes": "Description", "Date": "Start Date"}
)
source.dtypes

In [None]:
# columns that hold one fixed value on every row, each stored as a categorical
for column, value in {"Client": "Xentrans", "Project": "Xentrans", "Billable": "Yes"}.items():
    source[column] = value
    source[column] = source[column].astype("category")
source.dtypes

In [None]:
# add the Email column: lowercased first name followed by the compiler.la domain
# (the vectorized .str accessor keeps a missing first name as a missing email
# rather than raising on a non-string value)
source["Email"] = (source["First Name"].str.lower() + "@compiler.la").astype("category")
# drop individual name columns (reassign rather than mutate in place)
source = source.drop(columns=["First Name", "Last Name"])
source.dtypes

In [None]:
# Build the string Duration column from the numeric Hours column
source["Duration"] = source["Hours"].map(
    # numeric hours (e.g. 1.5) -> timedelta -> formatted duration string
    lambda hours: duration_str(pd.to_timedelta(hours, unit="hours"))
)