In [1]:
%pip install pyarrow
%pip install fastparquet
%pip install awswrangler
%pip install s3fs


Note: you may need to restart the kernel to use updated packages.
Collecting fastparquet
  Using cached fastparquet-2024.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Using cached cramjam-2.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Using cached fastparquet-2024.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
Using cached cramjam-2.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.8.3 fastparquet-2024.5.0
Note: you may need to restart the kernel to use updated packages.
Collecting awswrangler
  Using cached awswrangler-3.8.0-py3-none-any.whl.metadata (17 kB)
Using cached awswrangler-3.8.0-py3-none-any.whl (380 kB)
Installing collected packages: awswrangler
Successfully installed awswrangler-3.8.0
Note: you may need to restart the kernel to u

In [2]:
import boto3
import pandas as pd
import s3fs
from botocore.exceptions import ClientError
from fastparquet import ParquetFile

In [3]:
def read_from_s3(path: str) -> "pd.DataFrame | None":
    """Read the parquet files under an S3 prefix into a single DataFrame.

    Globs ``{path}*.parquet`` with s3fs. A single match is read directly with
    ``pd.read_parquet``; several matches are opened together as one
    fastparquet ``ParquetFile`` and converted to pandas.

    Args:
        path: S3 prefix to which ``*.parquet`` is appended for the glob.

    Returns:
        The loaded DataFrame, or ``None`` (after printing a notice) when no
        file matches the pattern.
    """
    # One filesystem instance serves both the listing and the reads.
    fs = s3fs.S3FileSystem()
    pattern = f"{path}*.parquet"
    all_paths_from_s3 = fs.glob(path=pattern)

    if not all_paths_from_s3:
        print("Nothing found")
        # Report the pattern that was searched; the match list is empty here.
        print(f"No paths matched {pattern}")
        return None

    # The single-file case has to be tested before the general one: checking
    # `len(...) > 0` first made this branch unreachable.
    if len(all_paths_from_s3) == 1:
        only_path = all_paths_from_s3[0]
        # Glob results can come back without the URL scheme; without it pandas
        # would treat the string as a local path.
        if not only_path.startswith("s3://"):
            only_path = f"s3://{only_path}"
        return pd.read_parquet(only_path)

    # Several files: let fastparquet open them through s3fs and merge them.
    fp_obj = ParquetFile(all_paths_from_s3, open_with=fs.open)
    return fp_obj.to_pandas()
    
def read_csv_s3(bucket: str, key: str) -> "pd.DataFrame | None":
    """Download one CSV object from S3 and parse it into a DataFrame.

    Args:
        bucket: Name of the S3 bucket.
        key: Object key of the CSV file inside the bucket.

    Returns:
        The parsed DataFrame, or ``None`` (after printing a notice) when the
        key does not exist.

    Raises:
        ClientError: For any S3 client error other than ``NoSuchKey``.
    """
    s3 = boto3.client('s3')
    try:
        obj = s3.get_object(Bucket=bucket, Key=key)
    except ClientError as ex:
        if ex.response['Error']['Code'] == 'NoSuchKey':
            print("Key doesn't match. Please check the key value entered.")
            return None
        # Every other client error used to be swallowed here and surfaced as a
        # silent None; let it propagate so the real cause is visible.
        raise
    return pd.read_csv(obj['Body'])

def list_s3_flies(base_path):
    """Return the S3 paths that match ``{base_path}*.parquet``."""
    pattern = f"{base_path}*.parquet"
    return s3fs.core.S3FileSystem().glob(path=pattern)


In [16]:
# Inclusive window of days to process, written as ISO dates.
start_date_s, end_date_s = '2024-05-14', '2024-05-16'

# One timestamp per calendar day between the two bounds (both included).
date_range = pd.date_range(start=start_date_s, end=end_date_s)

# Render each day as a 'YYYY-MM-DD' string; the bare name below displays the list.
date_list = [day.strftime('%Y-%m-%d') for day in date_range]
date_list

['2024-05-14', '2024-05-15', '2024-05-16']

In [19]:
max_rows_per_type = 10000  # cap on sampled rows per merchant, per day
sample_seed = 42  # fixed seed so re-running the cell draws the same sample
base_path = "s3://cleo-data-science/transaction_enrichment/experimental_data/caste/processed"

# Collect each day's sample and concatenate once after the loop; concatenating
# onto a growing frame inside the loop re-copies all earlier rows every pass.
sampled_frames = []
for one_date_s in date_list:
    path_file = f"{base_path}/trans_{one_date_s}_{one_date_s}"
    print(f"Load {path_file}")
    df_trans = pd.read_parquet(path_file, engine='pyarrow')
    print(f" Loaded shape {df_trans.shape}")
    # Merchants at or under the cap keep every row; larger ones are randomly
    # subsampled down to the cap.
    sampled_df = (
        df_trans
        .groupby('merchant_name_combined')
        .apply(lambda x: x.sample(min(len(x), max_rows_per_type), random_state=sample_seed))
        .reset_index(drop=True)
    )
    print(f" sampled shape {sampled_df.shape}")
    sampled_frames.append(sampled_df)

# ignore_index gives df_total one unique row index instead of repeating each
# day's 0..n-1 labels. NOTE(review): the recorded output shows 10 columns for
# two days and 9 for the third; concat fills the missing column with NaN.
df_total = (
    pd.concat(sampled_frames, axis=0, ignore_index=True)
    if sampled_frames
    else pd.DataFrame()  # empty date_list: same result the loop produced before
)


Load s3://cleo-data-science/transaction_enrichment/experimental_data/caste/processed/trans_2024-05-14_2024-05-14
 Loaded shape (6941468, 10)
 Loaded shape (4132226, 10)
Load s3://cleo-data-science/transaction_enrichment/experimental_data/caste/processed/trans_2024-05-15_2024-05-15
 Loaded shape (3836546, 10)
 Loaded shape (2501087, 10)
Load s3://cleo-data-science/transaction_enrichment/experimental_data/caste/processed/trans_2024-05-16_2024-05-16
 Loaded shape (4203874, 9)
 Loaded shape (2688002, 9)


In [21]:
# Show the counts for the 100 most frequent merchants in the combined sample.
df_total['merchant_name_combined'].value_counts().head(100)

merchant_name_combined
QuikTrip              30000
Burger King           30000
Speedway              30000
Chevron               30000
Wells Fargo           30000
                      ...  
Spotify               17337
Cash App Transfer     17156
365 Retail Markets    16533
Credit One Bank       16268
Coinbase              16245
Name: count, Length: 100, dtype: int64

: 

In [12]:
# Preview the first five rows of the loaded transactions frame.
df_trans.head(n=5)

Unnamed: 0,transaction_id,corrected_made_on,amount,description_combined,merchant_name_combined,description_combined_processed,sentence,sentence2,payment_channel,currency_code
0,9857349457,2024-05-15,-372.1,Transfer to Credit Builder,Builder,Transfer to Credit Builder,Transfer to Credit Builder. Channel: None. Amo...,Transfer to Credit Builder. Type: financial_in...,other,USD
1,9857347115,2024-05-15,-32.9,DD *DOORDASH MCDONALDS,McDonald''s,DD *DOORDASH MCDONALDS,DD *DOORDASH MCDONALDS. Channel: online. Amoun...,DD *DOORDASH MCDONALDS. Type: merchant. Channe...,online,USD
2,9857383483,2024-05-15,-11.8,CIRCLE K 41641 SURPRISE AZ,Circle K,CIRCLE K SURPRISE AZ,CIRCLE K SURPRISE AZ. Channel: in store. Amoun...,CIRCLE K SURPRISE AZ. Type: merchant. Channel:...,in store,USD
3,9857383484,2024-05-15,-131.4,PPD STATE FARM RO 27 SFPP,State Farm,PPD STATE FARM RO 27 SFPP,PPD STATE FARM RO 27 SFPP. Channel: online. Am...,PPD STATE FARM RO 27 SFPP. Type: merchant. Cha...,online,USD
4,9857383485,2024-05-15,-144.9,WINCO FOODS 127 SURPRISE AZ,Winco Foods,WINCO FOODS 127 SURPRISE AZ,WINCO FOODS 127 SURPRISE AZ. Channel: in store...,WINCO FOODS 127 SURPRISE AZ. Type: merchant. C...,in store,USD


In [13]:
# Number of rows per merchant name in df_trans.
merchant_counts = df_trans['merchant_name_combined'].value_counts()
merchant_counts

merchant_name_combined
Earnin                  181549
Cash App                176072
Zelle                   102288
Amazon                   77719
McDonald''s              70846
                         ...  
Busy Exchange LLC            1
Jb''s Country Store          1
Eagle''s Tire Shop 1         1
Jersey gyros                 1
A&t Burgers                  1
Name: count, Length: 324466, dtype: int64

In [14]:
# Cap on the number of rows kept per merchant.
max_rows_per_type = 10000
# Fixed seed so re-running the cell draws the same sample.
sample_seed = 42

# Group by merchant name and keep at most max_rows_per_type rows from each
# group: small groups are kept whole, larger ones are randomly subsampled.
sampled_df = (
    df_trans
    .groupby('merchant_name_combined')
    .apply(lambda x: x.sample(min(len(x), max_rows_per_type), random_state=sample_seed))
    .reset_index(drop=True)
)


In [15]:
# (rows, columns) of the per-merchant sample.
sampled_df.shape

(2501087, 10)