In [None]:
import os
from pathlib import Path

import pandas as pd

In [None]:
data_dir = Path.cwd().parent / "data" / "raw"
fname = "train_labels.csv"
fpath = data_dir / fname

# Disabled preview of the labels file:
# df = pd.read_csv(fpath, nrows=10)
# print(df)


def _list_images(label, directory):
  """Return the sorted file names in `directory` and print a short summary.

  Prints the number of entries and, when the directory is not empty, the
  path of the first entry as a sample.

  Args:
    label: Human-readable name used in the printed summary ("Test", "Train").
    directory: Path of the directory to list.

  Returns:
    List of entry names, sorted so that the sample shown is the same on
    every run (os.listdir itself returns entries in arbitrary order).
  """
  names = sorted(os.listdir(directory))
  print(f"{label} images: {len(names):,}")
  # Guard the sample print: indexing [0] on an empty listing would raise.
  if names:
    print(directory / names[0])
  return names


test_set_path = data_dir / "test"
test_set_image_names = _list_images("Test", test_set_path)
n_test = len(test_set_image_names)

train_set_path = data_dir / "train"
train_set_image_names = _list_images("Train", train_set_path)
n_train = len(train_set_image_names)

n_total = n_test + n_train
print(f"Total images: {n_total:,}")

In [None]:
# Adding a single timedelta to a DatetimeIndex shifts every timestamp by the
# same amount (here: 10 days).
from datetime import timedelta
pd.date_range("06/15/2022", periods=5)  + timedelta(days=10)

In [None]:

# Element-wise variant: each of the 5 dates gets its own offset in days.
pd.date_range("06/15/2022", periods=5) + pd.to_timedelta([12, 13, 1, 2, 4], unit="d")

In [None]:
# Parse one date-time string; with pandas' default (dayfirst=False) the
# leading "1/3" is read as month/day.
pd.to_datetime("1/3/2012 16:00:00")

In [None]:
# Small frame with two parallel string columns for trying out string methods.
df = pd.DataFrame(dict(str1=["hi", "bye", "guy"], str2=["h1", "by3", "9uy"]))
df

In [None]:
# Row-wise concatenation of the two string columns, joined by a single space.
df["str1"].str.cat(df["str2"], sep=" ")

In [None]:
# Whitespace-separated sample rows: date, time, value. The first row carries
# a non-numeric placeholder ("Missing_1") instead of a number.
string = """
1/3/2012 16:00:00   Missing_1
1/4/2012 16:00:00   27.47
1/5/2012 16:00:00   27.728
1/6/2012 16:00:00   28.19
"""

In [None]:
# Splitting on a literal single space: runs of spaces yield empty strings and
# newlines stay embedded in the pieces.
string.split(" ")

In [None]:
# One element per line; the leading and trailing newline of the literal
# produce empty strings at both ends.
string.split("\n")

In [None]:
# Show each raw value next to the result of str.isdecimal().
# NOTE(review): the `df` defined in the cells above only has "str1"/"str2";
# a "val" column is only assigned in the later read_csv/pipeline cell, so this
# cell appears to depend on out-of-order execution — confirm and reorder.
for i in df["val"]:
  display(i, i.isdecimal())

In [None]:
# Vectorised check after trimming whitespace. str.isdecimal is only True for
# strings made up entirely of decimal digits, so a value with a decimal point
# such as "27.47" is reported as False.
# NOTE(review): needs a `df` with a "val" column, which the cells above do not
# create — confirm the intended cell order.
df["val"].str.strip().str.isdecimal()

In [None]:
import io

# Parse the in-memory sample as whitespace-separated columns without a header
# row. sep=r"\s+" is the documented replacement for the deprecated
# delim_whitespace=True and splits on any run of whitespace.
df = pd.read_csv(io.StringIO(string), sep=r"\s+", header=None)

def append_raw_cols(df):
  """Label the columns of `df` as "date", "time" and "val".

  The labels are assigned in place via `df.columns`, and the same frame is
  returned so the function can be used with `DataFrame.pipe`.
  """
  df.columns = ["date", "time", "val"]
  return df

def concat_cols(df):
  """Add a "datetime" column holding "<date> <time>" for every row.

  The input frame is modified in place and returned.
  """
  stamp = df["date"].str.cat(df["time"], sep=" ")
  df.loc[:, "datetime"] = stamp
  return df

def cast_to_date(df):
  """Convert the "datetime" column of `df` from strings to timestamps.

  The frame is modified in place and returned.

  The column is replaced with plain column assignment rather than
  `df.loc[:, "datetime"] = ...`: since pandas 2.0 the `.loc[:, col]` form
  writes into the existing column and keeps its original (object) dtype, so
  the result would not actually be a datetime64 column.
  """
  df["datetime"] = pd.to_datetime(df["datetime"])
  return df

def is_float(val):
  """Return True when `float(val)` succeeds, False otherwise.

  Only the errors that signal "not convertible" are treated as False:
  TypeError (e.g. None), ValueError (e.g. "Missing_1") and OverflowError
  (an integer too large for a float). Anything else, such as
  KeyboardInterrupt, is allowed to propagate instead of being swallowed.
  """
  try:
    float(val)
  except (TypeError, ValueError, OverflowError):
    return False
  return True

def remove_missing(df):
  """Replace every "val" entry that is_float rejects with None.

  A helper column "is_val_float" is written to `df`, "val" is overwritten in
  place, and a copy of the frame without the helper column is returned.
  """
  def _keep_if_float(row):
    # Keep the original value only when it was flagged as float-convertible.
    if row["is_val_float"]:
      return row["val"]
    return None

  df.loc[:, "is_val_float"] = df["val"].apply(is_float)
  df.loc[:, "val"] = df.apply(_keep_if_float, axis="columns")

  return df.drop(columns="is_val_float")

def reindex_to_time(df):
  """Use the "datetime" column as the index and drop the raw time columns.

  The index of `df` itself is replaced; the returned frame is a copy without
  the "datetime", "date" and "time" columns.
  """
  df.index = df["datetime"]
  no_longer_needed = ["datetime", "date", "time"]
  return df.drop(columns=no_longer_needed)

def fill_val(df):
  """Fill the gaps in the "val" column and return the frame.

  Steps:
    1. Convert "val" to numbers; anything that cannot be parsed becomes NaN
       (the column may still hold strings or None at this point).
    2. Linearly interpolate interior gaps.
    3. Back-fill, then forward-fill, whatever is still missing at the edges.

  The frame is modified in place and returned. The column is replaced with
  plain column assignment so that it ends up with a float dtype instead of
  keeping the previous object dtype.
  """
  values = pd.to_numeric(df["val"], errors="coerce")
  # Series.bfill()/ffill() replace interpolate(method="bfill"/"ffill"), which
  # is deprecated; the previous method="bffill" was not a valid method at all.
  df["val"] = values.interpolate().bfill().ffill()

  return df

# Run the cleaning stages in order; the result of the chain is the cell's
# displayed output (it is not assigned to a name).
(
  df.pipe(append_raw_cols)  # name the raw columns
  .pipe(concat_cols)  # build the combined "datetime" string column
  .pipe(cast_to_date)  # parse "datetime" with pd.to_datetime
  .pipe(remove_missing)  # blank out values rejected by is_float
  .pipe(reindex_to_time)  # index by "datetime", drop date/time columns
  .pipe(fill_val)  # fill the gaps in "val"
)

In [None]:
# Interactive lookup of the pd.read_table documentation.
help(pd.read_table)

In [None]:
# Names in the pandas namespace that start with "read_".
[func for func in dir(pd) if func.startswith("read_")]