In [9]:
import os
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import matplotlib.pyplot as plt
from glob import glob

In [3]:
# Input folder: the monthly CBG_population_hourly_*.csv files are globbed from here.
hourly_data_dir = r'D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative'

# Output root for the per-county splits: a "split_population" subfolder of the
# input folder. Created up front so later cells can write into it.
save_dir = hourly_data_dir + r'\split_population'
os.makedirs(save_dir, exist_ok=True)


In [12]:
# Gather every 2022 monthly CSV in the input folder. Sorting the paths puts the
# months in calendar order because the file names end in a zero-padded YYYYMM.
csv_pattern = os.path.join(hourly_data_dir, 'CBG_population_hourly_2022*.csv')
all_files = sorted(glob(csv_pattern))
all_files

['D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202201.csv',
 'D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202202.csv',
 'D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202203.csv',
 'D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202204.csv',
 'D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202205.csv',
 'D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202206.csv',
 'D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202207.csv',
 'D:\\

In [33]:
# Split each monthly file by county FIPS and write one gzip-compressed CSV per
# county (without the DataFrame index) to <save_dir>/<YYYY>/<MM>/<county_fips>.csv.gz.
for file in tqdm(all_files, total=len(all_files), desc='Processing files'):
    # File names end in YYYYMM (e.g. CBG_population_hourly_202201.csv).
    file_stem = os.path.basename(file).replace('.csv', '')
    year_str = file_stem[-6:-2]
    month_str = file_stem[-2:]
    print(f'Processing  {year_str}-{month_str} file: {file}')
    month_dir = os.path.join(save_dir, year_str, month_str)
    os.makedirs(month_dir, exist_ok=True)
    print("Loading data...")
    # Read CBG as text and left-pad to 12 characters so leading zeros survive.
    df = pd.read_csv(file, dtype={'CBG': str})
    df['CBG'] = df['CBG'].str.zfill(12)
    # The county FIPS code is the first 5 characters of the padded CBG id.
    df['county_fips'] = df['CBG'].str[:5]
    # dropna() keeps this count in line with groupby, which ignores missing keys.
    county_fips_list = df['county_fips'].dropna().unique().tolist()
    print(f'Splitting data into {len(county_fips_list)} counties...')
    # One groupby pass partitions the frame instead of re-scanning every row with
    # a boolean mask per county. sort=False keeps counties in first-appearance
    # order, and rows inside each group keep their original order. Rows with a
    # missing CBG are dropped by groupby, so no spurious 'nan' file is written.
    county_groups = df.groupby('county_fips', sort=False)
    for county_fips, df_county in tqdm(county_groups, total=len(county_fips_list)):
        # The helper column is only needed for grouping; keep it out of the output.
        df_county = df_county.drop(columns=['county_fips'])
        save_path = os.path.join(month_dir, f'{county_fips}.csv.gz')
        df_county.to_csv(save_path, index=False, compression='gzip')

Processing files:   0%|          | 0/12 [00:00<?, ?it/s]

Processing  2022-01 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202201.csv
Loading data...
Splitting data into 3142 counties...


100%|██████████| 3142/3142 [03:05<00:00, 16.95it/s]
Processing files:   8%|▊         | 1/12 [03:15<35:52, 195.66s/it]

Processing  2022-02 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202202.csv
Loading data...
Splitting data into 3141 counties...


100%|██████████| 3141/3141 [02:42<00:00, 19.30it/s]
Processing files:  17%|█▋        | 2/12 [06:06<30:10, 181.05s/it]

Processing  2022-03 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202203.csv
Loading data...
Splitting data into 3141 counties...


100%|██████████| 3141/3141 [03:00<00:00, 17.42it/s]
Processing files:  25%|██▌       | 3/12 [09:15<27:43, 184.82s/it]

Processing  2022-04 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202204.csv
Loading data...
Splitting data into 3142 counties...


100%|██████████| 3142/3142 [02:49<00:00, 18.49it/s]
Processing files:  33%|███▎      | 4/12 [12:14<24:19, 182.45s/it]

Processing  2022-05 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202205.csv
Loading data...
Splitting data into 3142 counties...


100%|██████████| 3142/3142 [02:53<00:00, 18.09it/s]
Processing files:  42%|████▏     | 5/12 [15:17<21:17, 182.57s/it]

Processing  2022-06 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202206.csv
Loading data...
Splitting data into 3142 counties...


100%|██████████| 3142/3142 [03:02<00:00, 17.25it/s]
Processing files:  50%|█████     | 6/12 [18:28<18:33, 185.57s/it]

Processing  2022-07 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202207.csv
Loading data...
Splitting data into 3129 counties...


100%|██████████| 3129/3129 [03:05<00:00, 16.86it/s]
Processing files:  58%|█████▊    | 7/12 [21:43<15:43, 188.72s/it]

Processing  2022-08 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202208.csv
Loading data...
Splitting data into 3142 counties...


100%|██████████| 3142/3142 [03:08<00:00, 16.69it/s]
Processing files:  67%|██████▋   | 8/12 [25:04<12:49, 192.33s/it]

Processing  2022-09 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202209.csv
Loading data...
Splitting data into 3142 counties...


100%|██████████| 3142/3142 [02:47<00:00, 18.76it/s]
Processing files:  75%|███████▌  | 9/12 [28:00<09:21, 187.31s/it]

Processing  2022-10 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202210.csv
Loading data...
Splitting data into 3142 counties...


100%|██████████| 3142/3142 [02:50<00:00, 18.40it/s]
Processing files:  83%|████████▎ | 10/12 [31:00<06:09, 184.99s/it]

Processing  2022-11 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202211.csv
Loading data...
Splitting data into 3142 counties...


100%|██████████| 3142/3142 [02:47<00:00, 18.80it/s]
Processing files:  92%|█████████▏| 11/12 [33:56<03:02, 182.25s/it]

Processing  2022-12 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202212.csv
Loading data...
Splitting data into 3142 counties...


100%|██████████| 3142/3142 [02:51<00:00, 18.29it/s]
Processing files: 100%|██████████| 12/12 [36:57<00:00, 184.76s/it]


In [35]:
# Spot-check the most recently written county file (save_path is left over from
# the split loop). CBG is read as text, matching how the monthly files are
# loaded; default type inference would parse it as an integer and strip the
# leading zero from ids whose state FIPS is below 10.
pd.read_csv(save_path, dtype={'CBG': str}).head()

Unnamed: 0,CBG,2022-12-01 00:00:00,2022-12-01 01:00:00,2022-12-01 02:00:00,2022-12-01 03:00:00,2022-12-01 04:00:00,2022-12-01 05:00:00,2022-12-01 06:00:00,2022-12-01 07:00:00,2022-12-01 08:00:00,...,2022-12-31 14:00:00,2022-12-31 15:00:00,2022-12-31 16:00:00,2022-12-31 17:00:00,2022-12-31 18:00:00,2022-12-31 19:00:00,2022-12-31 20:00:00,2022-12-31 21:00:00,2022-12-31 22:00:00,2022-12-31 23:00:00
0,560459511001,1161,1312,1440,1471,1755,1781,1271,1259,1778,...,910,1324,1633,1249,1154,1215,1026,831,848,1023
1,560459511002,1944,1861,1850,1848,1819,1635,1560,599,515,...,1464,1680,2401,2112,1135,966,1162,1161,1251,1767
2,560459513001,1086,1172,1231,1329,1235,1268,1552,497,637,...,793,1020,540,568,557,815,659,693,1181,1075
3,560459513002,1172,1124,1208,1188,1211,1328,1747,2795,2981,...,1854,1702,1575,1673,1211,1361,1595,1090,1642,965
4,560459513003,999,1246,1190,1251,1233,1223,1108,914,742,...,506,499,137,555,778,692,932,1206,1040,1213
