In [31]:
import pandas as pd
import boto3
import json
import os
import gzip
import datetime
import pytz

In [18]:
# Bucket this notebook lists and downloads from.
# NOTE(review): 'buisness' looks misspelled; presumably it matches the real
# bucket name -- confirm before changing it.
bucket_name = 'colorado-catholic-buisness-directory'

# Shared S3 client used by the cells below
s3 = boto3.client("s3")

In [19]:
# Get list of all CSV files in data folder.
# Sorted so the concatenation order (and therefore `df`) is the same on every
# run; os.listdir returns entries in arbitrary order.
data_folder = '../data'  # Go up one directory level from notebooks/ to find data/
data_files = sorted(
    os.path.join(data_folder, file)
    for file in os.listdir(data_folder)
    if file.endswith('.csv')
)

# Read and concatenate all CSV files
dfs = [pd.read_csv(file) for file in data_files]

# Combine all dataframes
if dfs:
    df = pd.concat(dfs, ignore_index=True)

    # Get the latest date.
    # originalTimestamp is deliberately left as text here because the download
    # cell splits this value as a string; max() is therefore a string
    # comparison (assumes every CSV uses the same timestamp format -- TODO confirm).
    latest_date = df['originalTimestamp'].max()
    print(f"Latest date in data: {latest_date}")
else:
    # First run / empty data folder: bind latest_date anyway so later cells do
    # not fail with a NameError (or silently reuse a stale kernel value).
    # An epoch cutoff, in the same string format as the CSV timestamps, means
    # every S3 log file counts as "new".
    latest_date = '1970-01-01 00:00:00+00:00'
    print("No CSV files found in data folder")


Latest date in data: 2025-08-08 19:07:52.544000+00:00


In [20]:
s3_folder = '/segment-logs/bdy6BRcrFwgvPGEtNFNwSA/'
paginator = s3.get_paginator('list_objects_v2')
# The leading '/' is stripped before the folder is used as the key prefix
pages = paginator.paginate(Bucket=bucket_name, Prefix=s3_folder.lstrip('/'))

# Flatten the paginated listing; a page without 'Contents' contributes nothing
listed_objects = [obj for page in pages for obj in page.get('Contents', [])]
files = [obj['Key'] for obj in listed_objects]
file_dates = [obj['LastModified'] for obj in listed_objects]

# Map each object key to its LastModified timestamp
file_info = dict(zip(files, file_dates))

In [38]:
def _expand_dict_column(df, column, suffix):
    """Flatten a column of dicts into top-level columns; no-op if the column is absent."""
    if column not in df.columns:
        return df
    # Rows that lack this field hold NaN/None. Treat them as empty dicts so one
    # sparse row cannot abort the expansion for the whole file.
    records = [value if isinstance(value, dict) else {} for value in df[column]]
    expanded = pd.json_normalize(records)
    # rsuffix only kicks in when an expanded name collides with an existing column
    return df.drop(columns=column).join(expanded, rsuffix=suffix)


def download_import_data(file_name, file_date):
    """Download one gzipped NDJSON log file from S3 and return it as a DataFrame.

    Reads the notebook globals ``s3``, ``bucket_name`` and ``latest_date``.

    Args:
        file_name: S3 object key of the log file.
        file_date: timezone-aware datetime of the object (its LastModified value).

    Returns:
        An empty DataFrame when ``file_date`` is not newer than the cutoff or the
        file holds no records; otherwise one row per JSON line, with the
        ``context`` and ``properties`` dict columns expanded into flat columns
        (colliding names get a ``_context`` / ``_properties`` suffix).
    """
    # The cutoff is midnight UTC of latest_date's calendar day: only the date
    # part is parsed, so files modified later that same day are still imported.
    cutoff_day = str(latest_date).split()[0]
    cutoff = datetime.datetime.strptime(cutoff_day, '%Y-%m-%d').replace(
        tzinfo=datetime.timezone.utc
    )

    # Skip if file date is not newer than the cutoff
    if file_date <= cutoff:
        return pd.DataFrame()

    local_file_name = os.path.basename(file_name)
    try:
        with open(local_file_name, 'wb') as f:
            s3.download_fileobj(bucket_name, file_name, f)

        # Each non-blank line is a separate JSON object (newline-delimited JSON).
        # The encoding is explicit so parsing does not depend on the OS locale.
        with gzip.open(local_file_name, 'rt', encoding='utf-8') as f:
            records = [json.loads(line) for line in f if line.strip()]
    finally:
        # Always clean up the temporary download, even if download/parse failed
        if os.path.exists(local_file_name):
            os.remove(local_file_name)

    if not records:
        return pd.DataFrame()

    df = pd.DataFrame(records)
    df = _expand_dict_column(df, 'context', '_context')
    # Not every file contains events with 'properties'; the helper skips it then
    df = _expand_dict_column(df, 'properties', '_properties')
    return df

In [39]:
# Download every listed log file, keeping only the frames that contain rows.
# Frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies the accumulated data on every iteration.
new_frames = []
for file, file_date in file_info.items():
    df = download_import_data(file, file_date)
    if not df.empty:
        new_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# nothing new was downloaded.
full_df = pd.concat(new_frames) if new_frames else pd.DataFrame()

print(full_df.shape)

(125, 44)


In [None]:
# Build 'event_type': take 'event' where it is set, otherwise fall back to 'type'
event_names = full_df['event']
record_types = full_df['type']
full_df['event_type'] = event_names.fillna(record_types)

In [None]:
# Parse originalTimestamp into datetimes, then preview the first rows
parsed_timestamps = pd.to_datetime(full_df['originalTimestamp'])
full_df['originalTimestamp'] = parsed_timestamps
full_df['originalTimestamp'].head()

0   2025-04-28 16:54:26.475000+00:00
0   2025-04-28 16:54:25.303000+00:00
0   2025-04-28 16:54:25.295000+00:00
0   2025-04-28 17:19:35.776000+00:00
0   2025-04-28 17:19:35.813000+00:00
Name: originalTimestamp, dtype: datetime64[ns, UTC]

In [None]:
# remove columns that are not useful for analysis
columns_to_drop = [
    'projectId', '_metadata', 'sentAt', 'version', '__segment_internal',
    'locale', 'timezone', 'ip', 'library.version', 'timestamp', 'channel',
    'integrations', 'event', 'type', 'library.name', 'receivedAt',
]

# Drop only the columns that are actually present. A plain in-place drop of the
# full list raises KeyError when this cell is re-run, or when a batch of logs
# happens to lack one of these fields.
absent_columns = [col for col in columns_to_drop if col not in full_df.columns]
if absent_columns:
    print(f"Columns not present (nothing to drop): {absent_columns}")

full_df = full_df.drop(columns=[col for col in columns_to_drop if col in full_df.columns])
full_df.head()

Unnamed: 0,anonymousId,messageId,userId,originalTimestamp,userAgent,page.path,page.referrer,page.search,page.title,page.url,...,address,other_category,description,display_email,business_name,main_category,phone,sub_category,website,event_type
0,e484246f-8684-4f92-ab30-1d093a3f5f60,ajs-next-1745859266475-8684ff92-2b30-4d09-ba3f...,,2025-04-28 16:54:26.475000+00:00,Mozilla/5.0 (Linux; Android 12; Pixel 6 Build/...,/,https://www.facebook.com/,?fbclid=IwZXh0bgNhZW0CMTEAAR58oxCxVDihSoLzwMP_...,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/...,...,,,,,,,,,,impression
0,e484246f-8684-4f92-ab30-1d093a3f5f60,ajs-next-1745859265303-246f8684-ff92-4b30-9d09...,,2025-04-28 16:54:25.303000+00:00,Mozilla/5.0 (Linux; Android 12; Pixel 6 Build/...,/,https://www.facebook.com/,?fbclid=IwZXh0bgNhZW0CMTEAAR58oxCxVDihSoLzwMP_...,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/...,...,,,,,,,,,,impression
0,e484246f-8684-4f92-ab30-1d093a3f5f60,ajs-next-1745859265295-13e48424-6f86-44ff-922b...,,2025-04-28 16:54:25.295000+00:00,Mozilla/5.0 (Linux; Android 12; Pixel 6 Build/...,/,https://www.facebook.com/,?fbclid=IwZXh0bgNhZW0CMTEAAR58oxCxVDihSoLzwMP_...,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/...,...,,,,,,,,,,page
0,39b9e148-9871-4dae-8fe9-b1f5845d3932,ajs-next-1745860775776-e2d11c37-72f4-449e-bbf8...,1743547621069x279645472884266780,2025-04-28 17:19:35.776000+00:00,Mozilla/5.0 (iPhone; CPU iPhone OS 18_4_1 like...,/,https://bit.ly/,,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/,...,,,,,,,,,,impression
0,39b9e148-9871-4dae-8fe9-b1f5845d3932,ajs-next-1745860775813-1c3772f4-d49e-4bf8-b69d...,1743547621069x279645472884266780,2025-04-28 17:19:35.813000+00:00,Mozilla/5.0 (iPhone; CPU iPhone OS 18_4_1 like...,/,https://bit.ly/,,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/,...,,,,,,,,,,impression


In [None]:
# Display the column labels currently in full_df
full_df.columns

Index(['anonymousId', 'messageId', 'userId', 'originalTimestamp', 'userAgent',
       'page.path', 'page.referrer', 'page.search', 'page.title', 'page.url',
       'userAgentData.brands', 'userAgentData.mobile',
       'userAgentData.platform', 'business_id', 'sponsored_listing',
       'category', 'name', 'path', 'referrer', 'search', 'title', 'url',
       'category_properties', 'name_properties', 'button_name', 'location',
       'city', 'search_text', 'parish', 'traits', 'input_num', 'input_name',
       'email', 'input_value', 'address', 'other_category', 'description',
       'display_email', 'business_name', 'main_category', 'phone',
       'sub_category', 'website', 'event_type'],
      dtype='object')

In [43]:
# Give the combined frame a clean 0..n-1 index before exporting
full_df = full_df.reset_index(drop=True)
print(full_df.shape)

# Get current date for filename
current_time = pd.Timestamp.now().strftime('%Y-%m-%d')
filename = f'../data/ccbd-{current_time}.csv'

if full_df.empty:
    # An empty export would break the CSV-loading cell on the next run
    # (no columns to parse / no timestamp to take the max of), so skip it.
    print("No new rows to save; nothing written")
else:
    # Never overwrite an earlier export from the same day: its rows are older
    # than the new cutoff and would not be downloaded again. Add a time suffix
    # to get a fresh file name instead.
    if os.path.exists(filename):
        time_suffix = pd.Timestamp.now().strftime('%H%M%S')
        filename = f'../data/ccbd-{current_time}-{time_suffix}.csv'
        print(f"Export for today already exists; writing to {filename}")

    full_df.to_csv(filename, index=False)

(125, 44)
