# Combine district CSVs into a timeseries file

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import glob
import os
import json

### Get all the district CSV files

In [3]:
path = ""
# glob returns matches in arbitrary, filesystem-dependent order. Sort by file
# name so the snapshots are always concatenated in the same order and the
# exported row order is reproducible from run to run.
files = sorted(glob.glob(os.path.join(path, "data/districts-2021*.csv")))

### Read them and create a date field

In [4]:
# Read every snapshot into a list (not a one-shot generator) so the
# concatenation step can be re-run without first re-running this cell; an
# exhausted generator would make pd.concat fail with "No objects to concatenate".
# Each frame is tagged with its source file name in a "date" column.
file_df = [
    pd.read_csv(f, low_memory=False).assign(date=os.path.basename(f)) for f in files
]

### Concatenate all the files

In [5]:
# Stack the per-file frames into a single table, discarding each frame's own
# index in favour of one continuous index.
concatenated_df = pd.concat(objs=file_df, ignore_index=True)

### Clean up the date field and column names

In [6]:
# The "date" column currently holds source file names; remove the literal
# "districts-" and ".csv" text so only the remainder of the name is left.
file_names = concatenated_df["date"]
without_prefix = file_names.str.replace("districts-", "", regex=False)
concatenated_df["date"] = without_prefix.str.replace(".csv", "", regex=False)

In [7]:
# Normalise the column names in three steps: drop literal dots, lowercase,
# then remove the literal text "attributes".
undotted_names = concatenated_df.columns.str.replace(".", "", regex=False)
lowercase_names = undotted_names.str.lower()
concatenated_df.columns = lowercase_names.str.replace("attributes", "", regex=False)

### Clean up the Unix timestamp columns

In [8]:
# Names of the columns that get converted from Unix timestamps further down.
_reopening_dates = [
    "elem_reopening_date",
    "middle_reopening_date",
    "high_reopening_date",
]
dates = ["creationdate", "near_term_opening_date", *_reopening_dates]

In [9]:
def parse_dates(df, dates):
    """Convert epoch-millisecond columns to datetimes and add date-only copies.

    Args:
        df: DataFrame containing every column named in ``dates``. It is
            modified in place.
        dates: Iterable of column names whose values are Unix timestamps
            expressed in milliseconds.

    Returns:
        The same ``df`` object that was passed in (not a copy). Each listed
        column is replaced by its datetime conversion, and a companion
        ``<column>_clean`` column holding only the calendar date is added.

    Raises:
        KeyError: If a name in ``dates`` is not a column of ``df``.
    """
    for column in dates:
        # Parse once and reuse the result for both the datetime column and
        # its date-only companion.
        parsed = pd.to_datetime(df[column], unit="ms")
        df[column] = parsed
        df[f"{column}_clean"] = parsed.dt.date
    return df

In [10]:
# parse_dates modifies the frame it is given and returns it, so `df` refers to
# the same object as `concatenated_df`.
df = parse_dates(df=concatenated_df, dates=dates)

---

## Export

In [12]:
# Write the combined timeseries next to the input snapshots. The output is
# resolved against the same `path` base used for the input glob, so changing
# `path` moves both; with the default empty `path` the location is unchanged.
# The row index is omitted from the file.
df.to_csv(os.path.join(path, "data/districts-all-timeseries.csv"), index=False)