In [1]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import pytz

In [2]:
# Load the spreadsheet into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where that file exists.
df = pd.read_excel(io="/mnt/c/Users/riguy/Downloads/AmITooSweet.xlsx")

In [3]:
# Restrict the frame to these five columns, in this order.
df = df.loc[:, ['Date', 'Time', 'Type', 'Units', 'Notes']]

In [4]:
def fix_timezones(s):
    """Extract a timezone label from a free-text note.

    A note counts as a timezone marker when it contains the phrase
    ``time zone`` (case-insensitive). The phrase is removed and whatever is
    left is returned stripped and upper-cased, e.g. ``"Time zone EST"``
    becomes ``"EST"``.

    Parameters
    ----------
    s : object
        One note cell: a string, a missing value, or any other scalar.

    Returns
    -------
    object
        The upper-cased label for a marker note; the input itself when it is
        missing; ``np.nan`` for every other value.
    """
    if pd.isna(s):
        return s
    if not isinstance(s, str):
        # A cell can hold a non-text value (a bare number, say). It has no
        # ``.lower()`` and cannot be a timezone marker, so treat it like any
        # other note that does not mention a timezone.
        return np.nan
    s = s.lower()
    if 'time zone' in s:
        return s.replace('time zone', '').strip().upper()
    return np.nan

# Blank Date cells inherit the most recent date above them.
# ffill() replaces interpolate(method='pad'), which pandas has deprecated.
df['Date'] = df['Date'].ffill()

# Pull a timezone label out of the notes wherever one was written down.
df["timezone"] = df.Notes.apply(fix_timezones)
df.at[0, 'timezone'] = 'PST' # fix first value

# Record which rows have a timezone of their own (including the row-0 fix
# above) before the gaps are filled in.
df['had_timezone'] = df['timezone'].notna()

# Every remaining row inherits the most recently seen timezone.
df['timezone'] = df['timezone'].ffill()

In [5]:
# Preview the first five rows, including the derived timezone columns.
df.head(5)

Unnamed: 0,Date,Time,Type,Units,Notes,timezone,had_timezone
0,2022-11-07,19:57:00,Lantis,10.0,New pen,PST,True
1,2022-11-08,19:52:00,Lantis,10.0,,PST,False
2,2022-11-09,19:26:00,Lantis,10.0,Hurt. 4mm needle. Pulled out early?,PST,False
3,2022-11-10,19:57:00,Lantis,10.0,,PST,False
4,2022-11-11,19:12:00,Lantis,11.0,5mm needle,PST,False


In [7]:
# Labels from the notes that need translating before they are handed to pytz.
# Any label not listed here is passed to pytz.timezone() unchanged.
# NOTE(review): unlisted abbreviations such as 'EST' presumably resolve to
# pytz's fixed-offset zones (no daylight saving) — confirm that is intended.
tz_correct = {
    'PST': 'America/Los_Angeles',
}


def build_timestamp(row):
    """Combine a row's date, time and timezone into a timezone-aware datetime.

    Parameters
    ----------
    row : mapping
        Must provide ``'Date'`` (a date or datetime; only the calendar date
        is used), ``'Time'`` (a naive ``datetime.time``) and ``'timezone'``
        (a key of ``tz_correct`` or a name ``pytz.timezone`` accepts).

    Returns
    -------
    datetime.datetime
        The wall-clock date and time localized to the row's timezone.
    """
    tz = pytz.timezone(tz_correct.get(row['timezone'], row['timezone']))
    # Combine the parts directly instead of formatting and re-parsing a
    # string: a '%H:%M:00' pattern rejects any time whose seconds are not
    # exactly 00 (and any time carrying microseconds).
    d = datetime.combine(row['Date'], row['Time'])
    return tz.localize(d)


df['timestamp'] = df.apply(build_timestamp, axis=1)

In [8]:
# Rows that carry a timezone of their own, reduced to the label and the
# timestamp built for that row.
df_tz_points = df.loc[df.had_timezone, ['timezone', 'timestamp']]
df_tz_points

Unnamed: 0,timezone,timestamp
0,PST,2022-11-07 19:57:00-08:00
20,EST,2022-11-18 20:48:00-05:00


In [9]:
# The helper flag column is no longer needed; drop it.
df = df.drop(columns=['had_timezone'])

In [10]:
# This is a sanity check that our values are in strictly increasing order.
# I've caught a few data errors this way:
#  * wrong TZ info
#  * bad date format
#  * forgot to use 24-hour time.
# Converting to UTC gives one uniform datetime column to diff, whatever mix of
# timezones the rows carry. The offending rows are included in the failure
# message so the bad entry can be found without further digging.
timestamps_utc = pd.to_datetime(df.timestamp, utc=True)
gaps = timestamps_utc.diff()  # first entry is NaT, which never counts as out of order
out_of_order = df[gaps <= timedelta(0)]
assert out_of_order.empty, (
    f"{len(out_of_order)} row(s) are not later than the row before them:\n{out_of_order}"
)

AssertionError: 