# Parsing datetime strings

In [1]:
import polars as pl

In [2]:
# Relative path to the NYC trip-data sample CSV that the cells below read
csv_file = "data/nyc_trip_data_1k.csv"

## Reading datetime strings from a CSV
Polars does not try to parse datetimes from strings by default

In [3]:
# Plain read with no date parsing requested: pickup/dropoff come back as strings
df = pl.read_csv(csv_file)
df.head(2)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,str,str,f64,f64,f64,f64
"""id1""","""2022-01-01T00:04:14.000000""","""2022-01-01T00:26:12.000000""",1.0,10.83,31.0,0.0
"""id2""","""2022-01-01T00:32:17.000000""","""2022-01-01T00:49:23.000000""",1.0,3.97,14.5,3.66


Set `try_parse_dates=True` to get the correct datetime dtype

In [4]:
# Ask the CSV reader to attempt datetime parsing of the string columns
df = pl.read_csv(csv_file, try_parse_dates=True)
df.head(2)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66


`schema_overrides` has the same effect

In [5]:
# Declare the dtype of the two timestamp columns explicitly instead of inferring it
df = pl.read_csv(
    csv_file,
    schema_overrides={name: pl.Datetime for name in ("pickup", "dropoff")},
)
df.head(2)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66


## Reading datetime strings from a CSV in lazy mode

In [6]:
# Build the lazy CSV scan and print the textual query plan returned by explain()
print(pl.scan_csv(csv_file, try_parse_dates=True).explain())

Csv SCAN [data/nyc_trip_data_1k.csv]
PROJECT */7 COLUMNS
ESTIMATED ROWS: 984


### Other file types
CSV files store all data as strings. 

However, `IPC (Arrow)` and `Parquet` files store the datetime dtypes. 

The JSON readers don't have a `try_parse_dates` argument, so the conversion must be done manually after the JSON read.

## Parsing dates manually

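We can parse datetime strings ourselves with the `.str.strptime` expression, passing the target dtype and the format of the strings.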

In [7]:
# Re-read without date parsing so pickup/dropoff are strings again
df = pl.read_csv(csv_file)
df.head(2)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,str,str,f64,f64,f64,f64
"""id1""","""2022-01-01T00:04:14.000000""","""2022-01-01T00:26:12.000000""",1.0,10.83,31.0,0.0
"""id2""","""2022-01-01T00:32:17.000000""","""2022-01-01T00:49:23.000000""",1.0,3.97,14.5,3.66


In [8]:
# Both columns share one layout, so a single multi-column expression parses them
df.with_columns(
    pl.col("pickup", "dropoff").str.strptime(
        pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.6f"
    )
).head(2)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66


There are also some short-cut formats e.g. `%F` for `%Y-%m-%d` and `%T` for `%H:%M:%S`

In [9]:
# Same layout written with the %F / %T shortcuts; only pickup is converted here
df.with_columns(
    pl.col("pickup").str.strptime(dtype=pl.Datetime, format="%FT%T%.6f")
).head(2)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],str,f64,f64,f64,f64
"""id1""",2022-01-01 00:04:14,"""2022-01-01T00:26:12.000000""",1.0,10.83,31.0,0.0
"""id2""",2022-01-01 00:32:17,"""2022-01-01T00:49:23.000000""",1.0,3.97,14.5,3.66


## Saving datetimes
If we write a datetime column to an `IPC` or `Parquet` file, the datetime dtype is preserved.

If we write to a `CSV` then the datetime is converted back to a string

In [10]:
# Start again from the raw string columns, then parse both timestamps at once
df = pl.read_csv(csv_file)

df_formatted = df.with_columns(
    pl.col("pickup", "dropoff").str.strptime(
        pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.6f"
    )
)
df_formatted.head(2)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66


Before writing the datetime data, we can set the format used for the datetime strings

In [11]:
# Write the datetime columns as "YYYY-MM-DDTHH:MM:SS".
# The date/time separator must be a literal "T": "%T" is the shortcut for
# "%H:%M:%S", so "%d%T%H:%M:%S" would emit the time twice with no separator.
df_formatted.write_csv(
    "test.csv",
    datetime_format="%Y-%m-%dT%H:%M:%S",
)

## Duration dtype

`pl.Duration` type is not allowed in CSV file.

Instead, we convert the duration to an integer number of seconds with `.dt.total_seconds()`.

In [12]:
# dropoff - pickup gives the trip duration; total_seconds() exposes it as an integer
df_formatted.with_columns(
    trip_length_seconds=(pl.col("dropoff") - pl.col("pickup")).dt.total_seconds()
).head(2)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount,trip_length_seconds
str,datetime[μs],datetime[μs],f64,f64,f64,f64,i64
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0,1318
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66,1026


## Exercises

### Exercise 1

Parse the dates

Convert the `date` strings to `pl.Date` dtype

In [15]:
# Day-first strings separated by dashes; the format mirrors the text layout
df = pl.DataFrame({"date": ["31-01-2020", "28-02-2020", "31-03-2020"]})

df.with_columns(pl.col("date").str.strptime(pl.Date, format="%d-%m-%Y"))

date
date
2020-01-31
2020-02-28
2020-03-31


With YMD format

In [None]:
# Year-first strings separated by dashes
df = pl.DataFrame(
    {"date": ["2020-01-31", "2020-02-28", "2020-03-31"]}
)

df.with_columns(
    pl.col("date").str.strptime(pl.Date, format="%Y-%m-%d")
)

date
date
2020-01-31
2020-02-28
2020-03-31


With forward-slashes

In [None]:
# Day-first strings again, but the separator in the format is now "/"
df = pl.DataFrame(
    {"date": ["31/01/2020", "28/02/2020", "31/03/2020"]}
)

df.with_columns(
    pl.col("date").str.strptime(pl.Date, format="%d/%m/%Y")
)

date
date
2020-01-31
2020-02-28
2020-03-31


With month names.

Recall the [formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)

In [None]:
# %B matches the full month name ("July", "December")
df = pl.DataFrame(
    {"date": ["27 July 2020", "31 December 2020"]}
)

df.with_columns(
    pl.col("date").str.strptime(dtype=pl.Date, format="%d %B %Y")
)

date
date
2020-07-27
2020-12-31


### Exercise 2 

Parse the datetimes

Convert the `date` column from string to `pl.Datetime` dtype

In [19]:
# Day-first date followed by a 24-hour time, separated by a space
df = pl.DataFrame({
    "date": ["31-01-2020 00:00:00", "28-02-2020 00:00:00", "31-03-2020 00:00:00"],
})

df.with_columns(
    pl.col("date").str.strptime(dtype=pl.Datetime, format="%d-%m-%Y %H:%M:%S")
)

date
datetime[μs]
2020-01-31 00:00:00
2020-02-28 00:00:00
2020-03-31 00:00:00


Convert to `pl.Datetime` preserving the milliseconds

In [21]:
# "%.3f" consumes the dot plus three fractional digits, so the milliseconds survive
df = pl.DataFrame({
    "date": [
        "31-01-2020 00:00:00.500",
        "31-01-2020 00:00:00.600",
        "31-01-2020 00:00:00.700",
    ],
})

df.with_columns(
    pl.col("date").str.strptime(dtype=pl.Datetime, format="%d-%m-%Y %H:%M:%S%.3f")
)

date
datetime[ms]
2020-01-31 00:00:00.500
2020-01-31 00:00:00.600
2020-01-31 00:00:00.700


Convert strings with AM/PM to `pl.Datetime` dtype

In [22]:
# %I is the 12-hour clock hour and %p the AM/PM marker
df = pl.DataFrame({
    "date": ["01-01-2020 01:00 AM", "01-02-2020 01:00 AM", "01-03-2020 02:00 AM"],
})

df.with_columns(
    pl.col("date").str.strptime(dtype=pl.Datetime, format="%d-%m-%Y %I:%M %p")
)

date
datetime[μs]
2020-01-01 01:00:00
2020-02-01 01:00:00
2020-03-01 02:00:00


### Exercise 3 

Parse datetimes from a CSV.

Read in the NYC taxi dataset from the CSV file. 

Use `read_csv` to parse the dates automatically

In [25]:
# Let the CSV reader parse pickup/dropoff into datetimes while loading
dfNYC = pl.read_csv(
    csv_file,
    try_parse_dates=True,
)
dfNYC.head(2)

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,datetime[μs],datetime[μs],f64,f64,f64,f64
"""id1""",2022-01-01 00:04:14,2022-01-01 00:26:12,1.0,10.83,31.0,0.0
"""id2""",2022-01-01 00:32:17,2022-01-01 00:49:23,1.0,3.97,14.5,3.66


Change the pickup and dropoff columns to be `pl.Date`

Challenge: do this in a single expression using `with_columns`

In [27]:
# One chained expression: parse datetimes on read, then cast both columns to dates
dfNYC = (
    pl.read_csv(csv_file, try_parse_dates=True)
    .with_columns(pl.col("pickup", "dropoff").cast(pl.Date))
)

dfNYC

VendorID,pickup,dropoff,passenger_count,trip_distance,fare_amount,tip_amount
str,date,date,f64,f64,f64,f64
"""id1""",2022-01-01,2022-01-01,1.0,10.83,31.0,0.0
"""id2""",2022-01-01,2022-01-01,1.0,3.97,14.5,3.66
"""id8""",2022-01-01,2022-01-01,4.0,8.44,25.5,0.0
"""id0""",2022-01-01,2022-01-01,1.0,12.61,37.5,12.39
"""id1""",2022-01-01,2022-01-01,1.0,1.49,6.5,0.0
…,…,…,…,…,…,…
"""id0""",2022-01-14,2022-01-14,1.0,1.07,9.0,0.0
"""id4""",2022-01-14,2022-01-14,2.0,5.57,18.0,5.58
"""id2""",2022-01-14,2022-01-14,3.0,0.92,5.5,2.45
"""id0""",2022-01-14,2022-01-14,0.0,0.8,5.0,2.3
