In [37]:
import pandas as pd
import boto3
import json
import os
import gzip

In [38]:
s3 = boto3.client('s3')
bucket_name = 'colorado-catholic-buisness-directory'

In [39]:
s3_folder = '/segment-logs/bdy6BRcrFwgvPGEtNFNwSA/'
files = []
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket_name, Prefix=s3_folder.lstrip('/'))

for page in pages:
    files.extend([obj['Key'] for obj in page.get('Contents', [])])
print(files)

['segment-logs/bdy6BRcrFwgvPGEtNFNwSA/1745798400000/1745862317770.events-archiver-480-20..a060ad2f-9402-46a0-a8b0-fd49782e98af.gz', 'segment-logs/bdy6BRcrFwgvPGEtNFNwSA/1745798400000/1745862406014.events-archiver-480-30..fa34ca49-e972-4470-8e46-368465ea3d7a.gz', 'segment-logs/bdy6BRcrFwgvPGEtNFNwSA/1745798400000/1745862864557.events-archiver-480-11..6b532479-d867-4f33-80c7-0c7c4126401f.gz', 'segment-logs/bdy6BRcrFwgvPGEtNFNwSA/1745798400000/1745863916581.events-archiver-480-13..79268d46-fc8c-4920-8326-0897d9da6c4f.gz', 'segment-logs/bdy6BRcrFwgvPGEtNFNwSA/1745798400000/1745863966865.events-archiver-480-5..a8a0b38a-1622-4619-a709-c7146fc32791.gz', 'segment-logs/bdy6BRcrFwgvPGEtNFNwSA/1745798400000/1745864254353.events-archiver-480-9..6827bc38-d8d3-4d8b-9039-cf9b54ea3a55.gz', 'segment-logs/bdy6BRcrFwgvPGEtNFNwSA/1745884800000/1745891771908.events-archiver-480-38..0d0a6ce2-92ab-4f9e-b1c2-b30f4185875d.gz', 'segment-logs/bdy6BRcrFwgvPGEtNFNwSA/1745884800000/1745891831936.events-archiver-480

In [40]:
len(files)

5568

In [41]:
def download_import_data(file_name):
    local_file_name = os.path.basename(file_name)
    with open(local_file_name, 'wb') as f:
        s3.download_fileobj(bucket_name, file_name, f)

    # Read the gzipped JSON file into a DataFrame
    # Segment typically stores data as newline-delimited JSON (NDJSON)
    df = pd.DataFrame()

    with gzip.open(local_file_name, 'rt') as f:
        # Read file line by line since each line is a separate JSON object
        data = [json.loads(line) for line in f]
        df = pd.DataFrame(data)

    # Expand the context column into separate columns
    context_df = pd.json_normalize(df['context'])
    # Drop the original context column and join with expanded columns
    df = df.drop('context', axis=1).join(context_df, rsuffix='_context')
    try:
        properties_df = pd.json_normalize(df['properties'])
        df = df.drop('properties', axis=1).join(properties_df, rsuffix='_properties')
    except:
        pass

    os.remove(local_file_name)

    return df

In [42]:
full_df = pd.DataFrame()

for file in files:
    df = download_import_data(file)
    full_df = pd.concat([full_df, df])

print(full_df.shape)

(9771, 59)


In [43]:
# Fill missing values in 'event' column with values from 'type' column
full_df['event_type'] = full_df['event'].fillna(full_df['type'])

In [44]:
# Convert originalTimestamp to datetime
full_df['originalTimestamp'] = pd.to_datetime(full_df['originalTimestamp'])
full_df['originalTimestamp'].head()

0   2025-04-28 16:54:26.475000+00:00
0   2025-04-28 16:54:25.303000+00:00
0   2025-04-28 16:54:25.295000+00:00
0   2025-04-28 17:19:35.776000+00:00
0   2025-04-28 17:19:35.813000+00:00
Name: originalTimestamp, dtype: datetime64[ns, UTC]

In [45]:
# remove columns that are not useful for analysis
full_df.drop(columns=['projectId', '_metadata', 'sentAt', 'version', '__segment_internal', 'locale', 'timezone', 'ip', 
                      'library.version', 'timestamp', 'channel', 'integrations', 'event', 'type', 'library.name', 'receivedAt'], inplace=True)
full_df.head()

Unnamed: 0,anonymousId,messageId,userId,originalTimestamp,userAgent,page.path,page.referrer,page.search,page.title,page.url,...,address,other_category,description,display_email,business_name,main_category,phone,sub_category,website,event_type
0,e484246f-8684-4f92-ab30-1d093a3f5f60,ajs-next-1745859266475-8684ff92-2b30-4d09-ba3f...,,2025-04-28 16:54:26.475000+00:00,Mozilla/5.0 (Linux; Android 12; Pixel 6 Build/...,/,https://www.facebook.com/,?fbclid=IwZXh0bgNhZW0CMTEAAR58oxCxVDihSoLzwMP_...,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/...,...,,,,,,,,,,impression
0,e484246f-8684-4f92-ab30-1d093a3f5f60,ajs-next-1745859265303-246f8684-ff92-4b30-9d09...,,2025-04-28 16:54:25.303000+00:00,Mozilla/5.0 (Linux; Android 12; Pixel 6 Build/...,/,https://www.facebook.com/,?fbclid=IwZXh0bgNhZW0CMTEAAR58oxCxVDihSoLzwMP_...,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/...,...,,,,,,,,,,impression
0,e484246f-8684-4f92-ab30-1d093a3f5f60,ajs-next-1745859265295-13e48424-6f86-44ff-922b...,,2025-04-28 16:54:25.295000+00:00,Mozilla/5.0 (Linux; Android 12; Pixel 6 Build/...,/,https://www.facebook.com/,?fbclid=IwZXh0bgNhZW0CMTEAAR58oxCxVDihSoLzwMP_...,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/...,...,,,,,,,,,,page
0,39b9e148-9871-4dae-8fe9-b1f5845d3932,ajs-next-1745860775776-e2d11c37-72f4-449e-bbf8...,1743547621069x279645472884266780,2025-04-28 17:19:35.776000+00:00,Mozilla/5.0 (iPhone; CPU iPhone OS 18_4_1 like...,/,https://bit.ly/,,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/,...,,,,,,,,,,impression
0,39b9e148-9871-4dae-8fe9-b1f5845d3932,ajs-next-1745860775813-1c3772f4-d49e-4bf8-b69d...,1743547621069x279645472884266780,2025-04-28 17:19:35.813000+00:00,Mozilla/5.0 (iPhone; CPU iPhone OS 18_4_1 like...,/,https://bit.ly/,,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/,...,,,,,,,,,,impression


In [46]:
full_df.columns

Index(['anonymousId', 'messageId', 'userId', 'originalTimestamp', 'userAgent',
       'page.path', 'page.referrer', 'page.search', 'page.title', 'page.url',
       'userAgentData.brands', 'userAgentData.mobile',
       'userAgentData.platform', 'business_id', 'sponsored_listing',
       'category', 'name', 'path', 'referrer', 'search', 'title', 'url',
       'category_properties', 'name_properties', 'button_name', 'location',
       'city', 'search_text', 'parish', 'traits', 'input_num', 'input_name',
       'email', 'input_value', 'address', 'other_category', 'description',
       'display_email', 'business_name', 'main_category', 'phone',
       'sub_category', 'website', 'event_type'],
      dtype='object')

In [47]:
full_df.reset_index(drop=True, inplace=True)
print(full_df.shape)
full_df.to_csv('ccbd-data.csv', index=False)

(9771, 44)
