Convert the .csv.gz files into Parquet files.

In [None]:
import os
import duckdb
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
import random

# 1. Parquet processing

In [None]:
# Directory of Parquet files to query.
parquet_dir = r'D:\Data\Advan\dewey-downloads\neighborhood-patterns_parquets'
# Alternative input directory:
# parquet_dir = r'D:\Data\Advan\dewey-downloads\Neighborhood-Patterns-Plus'

# DuckDB connection shared by the following cells (no database file is given).
con = duckdb.connect()

# Allow DuckDB to use up to 8 threads.
con.execute("PRAGMA threads=8;")

# Expose every *.parquet file in parquet_dir through a single view.
con.execute(f"""
    CREATE OR REPLACE VIEW neighborhood_patterns AS
    SELECT *
    FROM parquet_scan('{parquet_dir}/*.parquet')
""")

# Number of rows visible through the view.
(total_row_cnt,) = con.execute("SELECT COUNT(*) FROM neighborhood_patterns").fetchone()


 

In [None]:
# Pull a random contiguous window of `select_cnt` rows to inspect the data.
select_cnt = 200
# Clamp the upper bound at 0: when the view holds fewer than `select_cnt` rows,
# `total_row_cnt - select_cnt` is negative and random.randint would raise
# ValueError. With offset 0 the query simply returns every available row.
max_offset = max(0, total_row_cnt - select_cnt)
offset = random.randint(0, max_offset)
con.execute(f"""SELECT * FROM neighborhood_patterns LIMIT {select_cnt} OFFSET {offset};""")
df = con.fetchdf()
df

In [None]:
# Preview ten random rows, restricted to the hourly, dwell and home-area columns.
preview_cols = [
    'POPULARITY_BY_EACH_HOUR',
    'STOPS_BY_EACH_HOUR',
    'MEDIAN_DWELL',
    'AREA',
    "DEVICE_HOME_AREAS",
    "WEEKDAY_DEVICE_HOME_AREAS",
    'WEEKEND_DEVICE_HOME_AREAS',
]
df[preview_cols].sample(n=10)

In [None]:
import geopandas as gpd

# Local data folder and the input files referenced in this section.
data_dir = r'D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\data'    
ACS_file = os.path.join(data_dir, "cbg_acs_2019_county_tract_new20230929_cleaned.csv")
CBG_2019_fname = os.path.join(data_dir, "blockgroups2019.zip") 

# Restated home panel summary -- flagged as a large change by the original author.
# see issues: https://community.deweydata.io/t/residing-device-count-in-2023-07-and-2023-08-surged-abnomorally/26675/15
home_panel_fname = os.path.join("..", "home_panel_summary_2019_2023_restated_20240119.csv.gz")

# Read the block-group layer, then derive two identifiers from GEOID:
# the GEOID left-padded with zeros to 12 characters, and its first five characters.
CBG2019_gdf = gpd.read_file(CBG_2019_fname)
padded_geoid = CBG2019_gdf['GEOID'].astype(str).str.zfill(12)
CBG2019_gdf['County_FIPS'] = padded_geoid.str[:5]
CBG2019_gdf['CBG'] = padded_geoid
CBG2019_gdf

In [None]:
# ACS_file is already a full path (data_dir was joined in when it was defined),
# so pass it straight to read_csv. Joining it with data_dir a second time only
# worked because data_dir is absolute; with a relative data_dir the folder
# would have been duplicated in the path.
ACS_df = pd.read_csv(ACS_file)
# Store the identifier as a string, left-padded with zeros to 12 characters.
ACS_df['fips'] = ACS_df['fips'].astype(str).str.zfill(12)
ACS_df

In [None]:
# Plot POPULARITY_BY_EACH_HOUR and STOPS_BY_EACH_HOUR for one randomly chosen
# row of `df`, then report the ACS total population of that row's AREA.
import matplotlib.pyplot as plt
import json

fig, ax = plt.subplots(figsize=(20, 5))

a_select_row = df.sample(1)
area = a_select_row['AREA'].iloc[0]
print(area)

# Both columns are decoded the same way: remove the double quotes from the
# stored text, then parse the remainder as JSON.
for col in ('POPULARITY_BY_EACH_HOUR', 'STOPS_BY_EACH_HOUR'):
    stops = json.loads(a_select_row[col].iloc[0].replace("\"", ""))
    ax.plot(stops, label=col)
ax.legend()
ax.set_title(area)

# ACS_df['fips'] holds strings zero-padded to 12 characters, so AREA must be
# normalised the same way before comparing; a raw integer or an unpadded string
# would never match.
area_fips = str(area).zfill(12)
matched_population = ACS_df.loc[ACS_df['fips'] == area_fips, 'totalpopulation']
if matched_population.empty:
    # Report the miss instead of failing with IndexError on an empty selection.
    print(f"No ACS record found for {area}")
else:
    population = matched_population.iloc[0]
    print(f"Total population in {area}: {population}")

In [None]:
# Display the block group whose CBG id equals the selected AREA
# (AREA is zero-padded to 12 characters first) via GeoDataFrame.explore().
is_selected_cbg = CBG2019_gdf['CBG'] == str(area).zfill(12)
CBG2019_gdf[is_selected_cbg].explore()

In [None]:
# Show the Monday, Friday and Saturday hourly popularity columns of the selected row.
day_popularity_cols = [
    'POPULARITY_BY_HOUR_MONDAY',
    'POPULARITY_BY_HOUR_FRIDAY',
    'POPULARITY_BY_HOUR_SATURDAY',
]
a_select_row[day_popularity_cols].sample(n=1)

In [None]:
# Show the four device-home-area columns of the selected row.
home_area_cols = [
    'WEEKDAY_DEVICE_HOME_AREAS',
    'WEEKEND_DEVICE_HOME_AREAS',
    'DEVICE_HOME_AREAS',
    'LUNCH_DEVICE_HOME_AREAS',
]
a_select_row[home_area_cols].sample(n=1)

In [None]:
# Transpose the selected row so each column is listed on its own line.
a_select_row.transpose()

In [None]:
# NOTE(review): `STOP` is not defined anywhere in the visible notebook, so
# running this cell raises NameError -- presumably a deliberate guard that
# halts "Run All" before the conversion cells below. Confirm before removing.
STOP

# 0. CSV to Parquet

In [None]:
import os
import duckdb
from glob import glob
import pandas as pd

In [None]:
# Merge the gzip-compressed CSV downloads of each month into one Parquet file.
csv_dir = r'D:\Data\Advan\dewey-downloads\neighborhood-patterns'
# file name format: 2018-01--data_01bd9f51-0105-f61f-0042-fa0702f754ce_304_7_0.csv.gz
save_dir = r'D:\Data\Advan\dewey-downloads\neighborhood-patterns_parquets'
os.makedirs(save_dir, exist_ok=True)

# process 2022 year files only for now
months = [f"{i:02d}" for i in range(1, 13)]
year = "2022"

for month in months:
    print(f"Processing {year}-{month}...")
    # Sort the matches so the row order of the merged file is reproducible;
    # glob returns files in arbitrary order.
    csv_files = sorted(glob(os.path.join(csv_dir, f"{year}-{month}--data_*.csv.gz")))
    if not csv_files:
        # pd.concat raises ValueError on an empty list, which would abort the
        # remaining months; skip a month that has no downloaded files instead.
        print(f"  No CSV files found for {year}-{month}, skipping.\n")
        continue
    print(f"  Converting {len(csv_files)} files to parquet...")

    df_list = [pd.read_csv(csv_file) for csv_file in csv_files]

    df_all = pd.concat(df_list, ignore_index=True)
    parquet_file = os.path.join(save_dir, f"{year}-{month}.parquet")
    df_all.to_parquet(parquet_file, index=False)

    print(f"Completed processing for {year}-{month}.\n")


# Merge home panel CSV files

In [None]:
import os
import pandas as pd
from glob import glob

In [None]:
# Concatenate every home panel summary CSV into one gzip-compressed CSV.
# NOTE: this rebinds `data_dir`, which the earlier ACS / block-group cells also use.
data_dir = r'D:\Data\Advan\dewey-downloads\neighborhood-patterns-us-home-panel-summary'
save_fname = r'D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\data\neighborhood-patterns-us-home-panel-summary_all.csv.gz'
# file name format: neighborhood-patterns-us-home-panel-summary_0_2_0.csv.gz, 60 files in total, July. 2015
# Sorted so the merged row order is reproducible (glob order is arbitrary).
csv_files = sorted(glob(os.path.join(data_dir, "neighborhood-patterns-us-home-panel-summary*.csv.gz")))
print("Total CSV files found:", len(csv_files))
if not csv_files:
    # Fail with an explicit message rather than pd.concat's generic
    # "No objects to concatenate" ValueError.
    raise FileNotFoundError(f"No home panel summary CSV files found in {data_dir}")

df_list = []
for csv_file in csv_files:
    print("Processing:", csv_file)
    df = pd.read_csv(csv_file)
    df_list.append(df)
df_all = pd.concat(df_list, ignore_index=True)
df_all.to_csv(save_fname, index=False, compression='gzip')

In [None]:
# Keep the rows with YEAR 2022 and ISO_COUNTRY_CODE 'US'; work on a copy so
# later edits leave df_all untouched.
is_2022_us = (df_all["YEAR"] == 2022) & (df_all["ISO_COUNTRY_CODE"] == 'US')
df2022 = df_all[is_2022_us].copy()
df2022

In [None]:
# Drop rows missing the block-group id or either device count, cast both
# counts to int, and order the rows by year, month and block group.
required_cols = ['CENSUS_BLOCK_GROUP', 'NUMBER_DEVICES_RESIDING', 'NUMBER_DEVICES_PRIMARY_DAYTIME']
df2022 = df2022.dropna(subset=required_cols)
df2022 = df2022.astype({
    'NUMBER_DEVICES_RESIDING': int,
    'NUMBER_DEVICES_PRIMARY_DAYTIME': int,
})

df2022 = df2022.sort_values(by=["YEAR", "MONTH", 'CENSUS_BLOCK_GROUP'], ascending=True)
df2022

In [None]:
# Spot-check ten random rows of the cleaned 2022 table.
df2022.sample(n=10)

In [None]:
# Write the 2022 subset as a plain (uncompressed) CSV without the index column.
save_fname_2022 = r'D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\data\neighborhood-patterns-us-home-panel-summary_2022.csv'
df2022.to_csv(path_or_buf=save_fname_2022, index=False)

In [None]:
# Write the 2022 subset again, this time as a gzip-compressed CSV (.csv.gz).
save_fname_2022 = r'D:\OneDrive_Emory\OneDrive - Emory\Research_doc\hourly_population\data\neighborhood-patterns-us-home-panel-summary_2022.csv.gz'
df2022.to_csv(path_or_buf=save_fname_2022, index=False, compression='gzip')