In [1]:
import os
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import matplotlib.pyplot as plt
from glob import glob

In [2]:
hourly_data_dir = r'D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative'

save_dir = r'D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\github\split_population'
os.makedirs(save_dir, exist_ok=True)


In [3]:
all_files = glob(os.path.join(hourly_data_dir, 'CBG_population_hourly_2022*.csv'))
all_files.sort()
all_files

['D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202201.csv',
 'D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202202.csv',
 'D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202203.csv',
 'D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202204.csv',
 'D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202205.csv',
 'D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202206.csv',
 'D:\\OneDrive_Emory\\OneDrive - Emory\\Research_doc\\hourly_population\\hourly_results\\removed_negative\\CBG_population_hourly_202207.csv',
 'D:\\

In [6]:
# split files by county fips, save to separate compressed csv files, without column index
for file in tqdm(all_files[1:2], total=len(all_files), desc='Processing files'):
    year_str = os.path.basename(file).replace('.csv', '')[-6:-2]
    month_str = os.path.basename(file).replace('.csv', '')[-2:]
    print(f'Processing  {year_str}-{month_str} file: {file}')
    month_dir = os.path.join(save_dir, year_str, month_str)
    os.makedirs(month_dir, exist_ok=True)
    print("Loading data...")
    df = pd.read_csv(file, dtype={'CBG': str})
    df['CBG'] = df['CBG'].str.zfill(12)
    df['county_fips'] = df['CBG'].str[:5]
    county_fips_list = df['county_fips'].unique().tolist()
    print(f'Splitting data into {len(county_fips_list)} counties...')
    for county_fips in tqdm(county_fips_list, total=len(county_fips_list)):
        df_county = df[df['county_fips'] == county_fips].drop(columns=['county_fips'])
        save_path = os.path.join(month_dir, f'{county_fips}.csv.gz')
        df_county.to_csv(save_path,  index=False, compression='gzip')

Processing files:   0%|          | 0/12 [00:00<?, ?it/s]

Processing  2022-02 file: D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\hourly_results\removed_negative\CBG_population_hourly_202202.csv
Loading data...
Splitting data into 3141 counties...


100%|██████████| 3141/3141 [02:53<00:00, 18.08it/s]
Processing files:   8%|▊         | 1/12 [03:02<33:23, 182.11s/it]


In [35]:
pd.read_csv(save_path ).head()

Unnamed: 0,CBG,2022-12-01 00:00:00,2022-12-01 01:00:00,2022-12-01 02:00:00,2022-12-01 03:00:00,2022-12-01 04:00:00,2022-12-01 05:00:00,2022-12-01 06:00:00,2022-12-01 07:00:00,2022-12-01 08:00:00,...,2022-12-31 14:00:00,2022-12-31 15:00:00,2022-12-31 16:00:00,2022-12-31 17:00:00,2022-12-31 18:00:00,2022-12-31 19:00:00,2022-12-31 20:00:00,2022-12-31 21:00:00,2022-12-31 22:00:00,2022-12-31 23:00:00
0,560459511001,1161,1312,1440,1471,1755,1781,1271,1259,1778,...,910,1324,1633,1249,1154,1215,1026,831,848,1023
1,560459511002,1944,1861,1850,1848,1819,1635,1560,599,515,...,1464,1680,2401,2112,1135,966,1162,1161,1251,1767
2,560459513001,1086,1172,1231,1329,1235,1268,1552,497,637,...,793,1020,540,568,557,815,659,693,1181,1075
3,560459513002,1172,1124,1208,1188,1211,1328,1747,2795,2981,...,1854,1702,1575,1673,1211,1361,1595,1090,1642,965
4,560459513003,999,1246,1190,1251,1233,1223,1108,914,742,...,506,499,137,555,778,692,932,1206,1040,1213


# Upload to huggingface

In [8]:
# ! pip install huggingface_hub 

In [1]:
import pandas as pd

pd.read_csv(r"https://huggingface.co/datasets/gladcolor/hourly_population_US2022/resolve/main/2022/01/01001.csv.gz").head()

Unnamed: 0,CBG,2022-01-01 00:00:00,2022-01-01 01:00:00,2022-01-01 02:00:00,2022-01-01 03:00:00,2022-01-01 04:00:00,2022-01-01 05:00:00,2022-01-01 06:00:00,2022-01-01 07:00:00,2022-01-01 08:00:00,...,2022-01-31 14:00:00,2022-01-31 15:00:00,2022-01-31 16:00:00,2022-01-31 17:00:00,2022-01-31 18:00:00,2022-01-31 19:00:00,2022-01-31 20:00:00,2022-01-31 21:00:00,2022-01-31 22:00:00,2022-01-31 23:00:00
0,10010201001,832,437,517,592,648,675,641,677,590,...,148,453,73,86,64,31,240,293,417,564
1,10010201002,2197,1840,2120,2063,2205,2199,1742,1544,1189,...,490,917,1275,2059,3202,4031,4187,3611,3504,2842
2,10010202001,983,615,684,750,768,822,758,767,748,...,684,1944,600,156,160,148,416,411,571,729
3,10010202002,1417,1115,1123,1003,981,1139,1041,1196,1403,...,2072,2876,2385,2375,1280,959,987,814,864,866
4,10010203001,2927,2979,3243,3676,3151,2916,2870,2811,2691,...,1195,2101,3080,4673,5802,4996,4824,3784,3644,3367
