# Run this notebook before the ML notebook
* This notebook requires down_up_lat_tests_by_quarter_part[1-5].csv in the current directory
* This notebook reads the raw data in chunks and builds a large feature table, which it saves as `feature_table_by_quarter.csv` (over 1 GB)
* Runtime about 2 minutes

In [1]:
import pandas as pd
import numpy as np
import geopandas as gp
import matplotlib.pyplot as plt

In [2]:
# Load every raw-data chunk first and concatenate once at the end.
# Concatenating inside the loop would re-copy all previously loaded rows on
# every iteration, and seeding the concat with an empty DataFrame is
# unnecessary.
number_of_chunks = 5
chunk_frames = [
    pd.read_csv(f"down_up_lat_tests_by_quarter_part{idx}.csv")
    for idx in range(1, number_of_chunks + 1)
]
# Each chunk keeps its own row index here, exactly as before.
time_data = pd.concat(chunk_frames)

In [3]:
# Show the raw schema and size before trimming.
print(time_data.columns)
print(time_data.shape)

# Columns removed from the raw table before feature building.
cols_to_drop = [
    'geometry',
    'DAUID', 'PRUID', 'CDUID',
    'CCSUID', 'CCSNAME', 'CSDUID',
    'ERUID', 'SACCODE',
    'CMAUID', 'CMAPUID', 'CMANAME', 'CMATYPE',
    'CTUID', 'CTNAME',
    'ADAUID',
    'PCUID', 'PCPUID',
]

# Drop them and renumber rows 0..N-1 (the concatenated chunks carry their
# own per-chunk indices).
trimmed = time_data.drop(columns=cols_to_drop)
time_data = trimmed.reset_index(drop=True)
print(time_data.shape)

Index(['quadkey', 'geometry', 'd2019Q1', 'u2019Q1', 'l2019Q1', 't2019Q1',
       'd2019Q2', 'u2019Q2', 'l2019Q2', 't2019Q2',
       ...
       'tile_area', 'tile_frac', 'das_frac', 'DAPOP', 'POP_DENSITY', 'PCUID',
       'PCNAME', 'PCTYPE', 'PCPUID', 'PCCLASS'],
      dtype='object', length=103)
(581765, 103)
(581765, 85)


In [4]:
# Group the per-quarter columns by their one-letter prefix followed by the
# year ("d20..", "u20..", "l20..", "t20..").
down_cols, up_cols, lat_cols, tests_cols = (
    [name for name in time_data.columns if name.startswith(prefix)]
    for prefix in ('d20', 'u20', 'l20', 't20')
)

In [5]:
# Lists n consecutive quarters, newest first, starting at (and including) the
# given year (y) and quarter (q)
def training_quarters(y, q, n):
    """Return ``n`` quarter labels walking backwards in time from ``y``/``q``.

    The first label is the given quarter itself; every following label is the
    quarter immediately before the previous one, rolling over from Q1 to Q4
    of the preceding year. Labels look like ``"2021Q1"``.

    Args:
        y: Year of the starting (most recent) quarter.
        q: Starting quarter, 1 to 4.
        n: Number of labels to return, including the starting quarter.

    Returns:
        A list of ``n`` label strings, newest first. Empty when ``n`` is 0.

    Raises:
        ValueError: If ``q`` is not in 1..4 or ``n`` is negative.
    """
    if q not in (1, 2, 3, 4):
        raise ValueError(f"q must be between 1 and 4, got {q!r}")
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n!r}")

    # Number quarters on a single axis (4 per year) so stepping back one
    # quarter is a plain subtraction and year roll-over comes from divmod.
    newest = y * 4 + (q - 1)
    quarters = []
    for offset in range(n):
        year, quarter_index = divmod(newest - offset, 4)
        quarters.append(str(year) + "Q" + str(quarter_index + 1))
    return quarters

In [6]:
# Select a quarter as target and add corresponding features using def training_quarters()
#
# For every target quarter one block of rows is produced: the non-quarterly
# columns of time_data plus d/u/l/t_quarter<i> columns, where i=0 is the
# target quarter itself and i=1..n_train_quarter are the quarters before it.
# (d/u/l/t presumably stand for download, upload, latency and tests, going by
# the input file name.)
feature_table = pd.DataFrame()
print(feature_table.shape)

n_train_quarter = 8
n_quarter = n_train_quarter+1  # the target quarter plus its training quarters

# Columns that do not vary by quarter; shared by every block.
static_cols = time_data.drop(columns=down_cols+up_cols+lat_cols+tests_cols)


def _features_for_target(year, quarter):
    """Return the static columns plus the relative-quarter columns for one target."""
    quarters = training_quarters(year, quarter, n_quarter)
    # Order matters: d, u, l, t for quarter 0, then for quarter 1, and so on.
    relative_cols = {
        prefix + "_quarter" + str(i): time_data[prefix + quarters[i]]
        for i in range(n_quarter)
        for prefix in ("d", "u", "l", "t")
    }
    return static_cols.assign(**relative_cols)


# Every quarter of 2021 and 2022, followed by 2023 Q1.
target_quarters = [(year, quarter) for year in range(2021, 2023) for quarter in range(1, 5)]
target_quarters.append((2023, 1))

# Collect the blocks and concatenate once at the end; re-concatenating the
# growing table after every quarter copies all earlier rows each time.
quarter_frames = []
rows_so_far = 0
for year, quarter in target_quarters:
    new_table = _features_for_target(year, quarter)
    quarter_frames.append(new_table)
    rows_so_far += len(new_table)
    print((rows_so_far, new_table.shape[1]))  # Watch the feature table become larger with each new quarter as target

feature_table = pd.concat(quarter_frames, ignore_index=True)

(0, 0)
(581765, 53)
(1163530, 53)
(1745295, 53)
(2327060, 53)
(2908825, 53)
(3490590, 53)
(4072355, 53)
(4654120, 53)
(5235885, 53)


In [7]:
# Write the full feature table to disk without the row index.
# The file is over 1 GB and takes about 90 seconds to save.
output_csv = "feature_table_by_quarter.csv"
feature_table.to_csv(output_csv, index=False)