## 1. Extract Monthly Patterns


In [89]:
import sys
from pathlib import Path
import pyarrow.parquet as pq
import pyarrow.dataset as ds

sys.path.append(str(Path.cwd().parent))
from config import PATH_KIOSK_USER_PATTERNS_FOLDER, PATH_KIOSK_USER_PATTERNS_REPO, PATH_SSD_ADVAN_FOLDER

In [None]:
import pandas as pd

execfile(Path(PATH_KIOSK_USER_PATTERNS_REPO, "functions/safe_parse_json.py"))

cols = [
    "PLACEKEY", "LOCATION_NAME", "NAICS_CODE", "LATITUDE", "LONGITUDE",
    "STREET_ADDRESS", "CITY", "REGION", "DATE_RANGE_START", "DATE_RANGE_END",
    "RAW_VISITOR_COUNTS", "RAW_VISIT_COUNTS", "VISITOR_HOME_CBGS"
]
monthly_patterns_files = list((PATH_SSD_ADVAN_FOLDER / "Monthly Patterns" / "Foot Traffic").rglob('**/*.gz'))
f = monthly_patterns_files[0]
for f in monthly_patterns_files:
    # Read the gzip-compressed CSV file
    print(f"Processing file: {f}")
    df_table = pd.read_csv(f, compression='gzip', usecols=cols)  # 'f' is the path to the .gz file

    # Filter rows where LOCATION_NAME contains 'walmart'
    walmart_mask = df_table['LOCATION_NAME'].str.contains('walmart', case=False, na=False)
    df_walmart = df_table[walmart_mask]

    # Convert DATE_RANGE_START to datetime
    df_walmart['DATE_RANGE_START'] = pd.to_datetime(df_walmart['DATE_RANGE_START'])
    
    # Add columns for year and month
    df_walmart['YEAR'] = df_walmart['DATE_RANGE_START'].dt.year
    df_walmart['MONTH'] = df_walmart['DATE_RANGE_START'].dt.month
    df_walmart['FILE_NAME'] = f.name
    # Expand VISITOR_HOME_CBGS into long format
    df_cbgs = df_walmart.dropna(subset=['VISITOR_HOME_CBGS']).copy()
    df_cbgs = df_cbgs.assign(
        VISITOR_HOME_CBGS=df_cbgs['VISITOR_HOME_CBGS'].apply(lambda x: json.loads(x) if isinstance(x, str) else {})
    )

    # Apply safe_parse_json to VISITOR_HOME_CBGS
    df_cbgs = df_cbgs.assign(
    VISITOR_HOME_CBGS=df_cbgs['VISITOR_HOME_CBGS'].apply(safe_parse_json)
    )

    # First convert dictionaries to a format explode can handle
    df_cbgs = df_cbgs.assign(
    VISITOR_HOME_CBGS=df_cbgs['VISITOR_HOME_CBGS'].apply(
        lambda x: list(x.items()) if isinstance(x, dict) else None
    )
    )

    # Then explode
    df_long = df_cbgs.explode('VISITOR_HOME_CBGS')
    df_long = df_long.reset_index(drop=True)

    # Extract key and value from each pair
    df_long['HOME_CBG'] = df_long['VISITOR_HOME_CBGS'].apply(lambda x: x[0] if isinstance(x, tuple) else None)
    df_long['VISITOR_COUNT'] = df_long['VISITOR_HOME_CBGS'].apply(lambda x: x[1] if isinstance(x, tuple) else None)
    

    # Drop the original VISITOR_HOME_CBGS column
    df_long = df_long.drop(columns=['VISITOR_HOME_CBGS'])
    output_base = PATH_KIOSK_USER_PATTERNS_FOLDER / "working/processed/kupdat03_advan research monthly patterns"
    df_long.to_parquet(
        output_base,
        index=False,
        partition_cols=["YEAR", "MONTH"]
    )








Processing file: E:\Advan Research\Monthly Patterns\Foot Traffic\2023\data_01bd7440-0105-dcc9-0042-fa0702ed2712_13_3_27.csv.gz


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_walmart['DATE_RANGE_START'] = pd.to_datetime(df_walmart['DATE_RANGE_START'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_walmart['YEAR'] = df_walmart['DATE_RANGE_START'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_walmart['MONTH'] = df_walmart['DATE_RANGE_START'].dt.month
A v

In [None]:
# Convert dictionaries to lists of key-value pairs
df_cbgs['VISITOR_HOME_CBGS'] = df_cbgs['VISITOR_HOME_CBGS'].apply(
    lambda x: list(x.items()) if isinstance(x, dict) else None
)


In [96]:
dataset = ds.dataset(output_base, format="parquet")

df = next(dataset.to_batches(batch_size=10)).to_pandas()
df.head(n = 100)

Unnamed: 0,CITY,DATE_RANGE_END,DATE_RANGE_START,LATITUDE,LOCATION_NAME,LONGITUDE,NAICS_CODE,PLACEKEY,RAW_VISITOR_COUNTS,RAW_VISIT_COUNTS,REGION,STREET_ADDRESS,FILE_NAME,HOME_CBG,VISITOR_COUNT
0,Lakeville,2023-07-01 00:00:00.000,2023-06-01,44.650353,Walmart Pharmacy,-93.297135,446110,zzw-223@5pb-kgb-pn5,3679.0,6097.0,MN,20710 Keokuk Ave,data_01bd7440-0105-dcc9-0042-fa0702ed2712_13_3...,271390811004,197
1,Lakeville,2023-07-01 00:00:00.000,2023-06-01,44.650353,Walmart Pharmacy,-93.297135,446110,zzw-223@5pb-kgb-pn5,3679.0,6097.0,MN,20710 Keokuk Ave,data_01bd7440-0105-dcc9-0042-fa0702ed2712_13_3...,270370608203,132
2,Lakeville,2023-07-01 00:00:00.000,2023-06-01,44.650353,Walmart Pharmacy,-93.297135,446110,zzw-223@5pb-kgb-pn5,3679.0,6097.0,MN,20710 Keokuk Ave,data_01bd7440-0105-dcc9-0042-fa0702ed2712_13_3...,271390810001,104
3,Lakeville,2023-07-01 00:00:00.000,2023-06-01,44.650353,Walmart Pharmacy,-93.297135,446110,zzw-223@5pb-kgb-pn5,3679.0,6097.0,MN,20710 Keokuk Ave,data_01bd7440-0105-dcc9-0042-fa0702ed2712_13_3...,270370608192,66
4,Lakeville,2023-07-01 00:00:00.000,2023-06-01,44.650353,Walmart Pharmacy,-93.297135,446110,zzw-223@5pb-kgb-pn5,3679.0,6097.0,MN,20710 Keokuk Ave,data_01bd7440-0105-dcc9-0042-fa0702ed2712_13_3...,271310701003,66
5,Lakeville,2023-07-01 00:00:00.000,2023-06-01,44.650353,Walmart Pharmacy,-93.297135,446110,zzw-223@5pb-kgb-pn5,3679.0,6097.0,MN,20710 Keokuk Ave,data_01bd7440-0105-dcc9-0042-fa0702ed2712_13_3...,270370608202,66
6,Lakeville,2023-07-01 00:00:00.000,2023-06-01,44.650353,Walmart Pharmacy,-93.297135,446110,zzw-223@5pb-kgb-pn5,3679.0,6097.0,MN,20710 Keokuk Ave,data_01bd7440-0105-dcc9-0042-fa0702ed2712_13_3...,270370608201,66
7,Lakeville,2023-07-01 00:00:00.000,2023-06-01,44.650353,Walmart Pharmacy,-93.297135,446110,zzw-223@5pb-kgb-pn5,3679.0,6097.0,MN,20710 Keokuk Ave,data_01bd7440-0105-dcc9-0042-fa0702ed2712_13_3...,270370608152,57
8,Lakeville,2023-07-01 00:00:00.000,2023-06-01,44.650353,Walmart Pharmacy,-93.297135,446110,zzw-223@5pb-kgb-pn5,3679.0,6097.0,MN,20710 Keokuk Ave,data_01bd7440-0105-dcc9-0042-fa0702ed2712_13_3...,271390811003,57
9,Lakeville,2023-07-01 00:00:00.000,2023-06-01,44.650353,Walmart Pharmacy,-93.297135,446110,zzw-223@5pb-kgb-pn5,3679.0,6097.0,MN,20710 Keokuk Ave,data_01bd7440-0105-dcc9-0042-fa0702ed2712_13_3...,270370608213,57


Unnamed: 0,CITY,DATE_RANGE_END,DATE_RANGE_START,LATITUDE,LOCATION_NAME,LONGITUDE,NAICS_CODE,PLACEKEY,RAW_VISITOR_COUNTS,RAW_VISIT_COUNTS,REGION,STREET_ADDRESS,VISITOR_HOME_CBGS
9725,San Antonio,2023-07-01 00:00:00.000,2023-06-01 00:00:00.000,29.357773,VIA Metropolitan Transit QUANTUM LOOP AT WALMA...,-98.633198,485113,zzy-222@8sz-tww-p7q,,,TX,Quantum Loop,
14798,Sept-Iles,2023-07-01 00:00:00.000,2023-06-01 00:00:00.000,50.227223,Walmart Photo Center,-66.394663,812921,zzw-222@3hj-hfh-f2k,,,QC,1005 Boul Laure Unit 500,
20154,Minneapolis,2023-07-01 00:00:00.000,2023-06-01 00:00:00.000,45.099793,Metro Transit Walmart & Main Entrances,-93.387754,485113,zzy-222@5s8-cgk-hh5,,,MN,Walmart @ Main,
28412,Amherst,2023-07-01 00:00:00.000,2023-06-01 00:00:00.000,45.816499,Walmart Pharmacy,-64.198816,446110,222-227@64m-nym-8sq,,,NS,46 Robert Angus Dr,
28932,Guilford,2023-07-01 00:00:00.000,2023-06-01 00:00:00.000,41.287194,Walmart,-72.678876,452311,zzw-222@629-cyv-5xq,,,CT,900 Boston Post Rd,


## 2. Extract Home Panel Summary