In [6]:
import pandas as pd
from datetime import datetime
import boto3
from botocore.exceptions import ClientError
from io import StringIO
import s3fs
from fastparquet import ParquetFile
from cleodata.utils.secrets import get_secret
from cleodata.sources.sync.sync import SyncDataSource
# Make the named AWS profile the default for boto3 sessions created after this point.
boto3.setup_default_session(profile_name='DataScientist-878877078763')

# Data source handle configured with use_redshift=True; one keyword per line
# so the cluster and database are easy to spot and change.
redshift_source = SyncDataSource(
    "data_exploration",
    use_redshift=True,
    redshift_cluster="cleo-production-redshift",
    redshift_db="cleo",
)

[2m2024-05-20 13:01:45[0m [[32m[1mdebug    [0m] [1mfetching credentials          [0m
[2m2024-05-20 13:01:46[0m [[32m[1minfo     [0m] [1mCredentials acquired          [0m
[2m2024-05-20 13:01:46[0m [[32m[1minfo     [0m] [1mBuilt connection pool         [0m


In [7]:
def read_from_s3(path):
    """Read parquet files and combine them into a single dataframe.

    Parameters
    ----------
    path : str
        S3 prefix. Files are discovered with the glob ``f"{path}*.parquet"``,
        so everything whose key starts with ``path`` and ends in ``.parquet``
        is included.

    Returns
    -------
    The object produced by ``ParquetFile(...).to_pandas()`` over all matched
    files, or ``None`` when the glob matches nothing (a ``UserWarning`` is
    emitted in that case so the empty result is not silent).
    """
    import warnings

    pattern = f"{path}*.parquet"

    # A single filesystem handle serves both the listing and the reads.
    fs = s3fs.S3FileSystem()
    matched_paths = fs.glob(pattern)

    if not matched_paths:
        # Keep the historical ``None`` return for existing callers, but make the
        # cause visible instead of failing later with an AttributeError on None.
        warnings.warn(f"No parquet files matched {pattern!r}", stacklevel=2)
        return None

    # fastparquet opens each matched key through s3fs.
    return ParquetFile(matched_paths, open_with=fs.open).to_pandas()

In [30]:



# Load the labelled merchant-name pairs stored under this S3 prefix.
path = "s3://cleo-data-science/merchant-deduplication/data/test_pairs_20221118"
df_data_raw = read_from_s3(path)

In [31]:
# Peek at the first 20 rows to check the columns and the label values.
df_data_raw.head(n=20)

Unnamed: 0,merchant_id,merchant_name,matched_merchant_name,source,label
0,1081362,sprint,sprint wireless,dedup-tool,1
1,1081362,sprint,sprint *wireles,dedup-tool,1
2,1081362,sprint,sprint prepaid,dedup-tool,1
3,1081362,sprint,sprint wir,dedup-tool,1
4,1081362,sprint,sprint phone,dedup-tool,1
5,1308542,price chopper,price chop,dedup-tool,1
6,1308542,price chopper,price choper,dedup-tool,1
7,1308542,price chopper,price chop per 36,dedup-tool,1
8,1308542,price chopper,price choper 250,dedup-tool,1
9,1308542,price chopper,price chop per 24,dedup-tool,1


In [32]:
# Row count per label value.
df_data_raw["label"].value_counts()

label
1    34534
0    22172
Name: count, dtype: int64

In [20]:
# All rows whose label is 0, selected with an explicit boolean mask.
df_data_raw.loc[df_data_raw["label"] == 0]

Unnamed: 0,merchant_id,merchant_name,matched_merchant_name,source,label
579,1338235,shop n go,shop n go conveniesmithfield,dedup-tool,0
580,1338235,shop n go,shop n go convenience,dedup-tool,0
581,1338235,shop n go,shop n go grocery,dedup-tool,0
582,1338235,shop n go,shop n go corner,dedup-tool,0
583,1338235,shop n go,shop n go marketupland,dedup-tool,0
...,...,...,...,...,...
47503,10945207,albert park visa direct,albert,dedup-tool,0
47504,10945207,oak park albert,albert,dedup-tool,0
47505,10945207,e carl albert,albert,dedup-tool,0
47506,10945207,cash app albert rod,albert,dedup-tool,0


In [22]:
# Rows 30-59 (by position) of the label == 0 subset.
df_data_raw.loc[df_data_raw["label"] == 0].iloc[30:60]

Unnamed: 0,merchant_id,merchant_name,matched_merchant_name,source,label
609,3315607,chumba gold,globalpok gold coita xbi,dedup-tool,0
610,3315607,chumba gold,coin coin,dedup-tool,0
611,3315607,chumba gold,coin,dedup-tool,0
612,3315607,chumba gold,chuckchansi gold,dedup-tool,0
613,3315607,chumba gold,gold cash gold detroit,dedup-tool,0
614,3315607,chumba gold,mmorpg gold,dedup-tool,0
615,3315607,chumba gold,aribagold gold,dedup-tool,0
616,3315607,chumba gold,globalpok gold co igp@vgw,dedup-tool,0
617,3315607,chumba gold,gold coast hotel gold,dedup-tool,0
618,3315607,chumba gold,solid gold,dedup-tool,0


In [28]:



# Load the candidate pairs stored under the batch-run input prefix.
path = "s3://cleo-data-science/merchant-deduplication/batch_run/input/221108_candidates"
df_data_batch_run = read_from_s3(path)

In [29]:
# Peek at the first 30 candidate rows.
df_data_batch_run.head(n=30)

Unnamed: 0,merchant_id,merchant_name,matched_merchant_id,matched_merchant_name
0,7148738,rollin fresh ha,14628471,rollin fresh madiso
1,7148738,rollin fresh ha,7948049,rollin' fresh
2,7148738,rollin fresh ha,7979416,rollin'
3,7148738,rollin fresh ha,6886987,sq *rollin
4,7148738,rollin fresh ha,7101437,rollin o
5,7148738,rollin fresh ha,10530599,rollin in the franklin
6,7148738,rollin fresh ha,4964440,rollin 253 fircrest
7,7148738,rollin fresh ha,7105680,rollin oats mar
8,7148738,rollin fresh ha,15365609,rollin n flava
9,7148738,rollin fresh ha,6908168,rollin oats m


In [33]:
# Load the parquet files stored under the batch_run_data input prefix.
path = (
    "s3://cleo-data-science/merchant-deduplication/batch_run/input/batch_run_data"
)
df_data_batch_run_data = read_from_s3(path)

In [34]:
# Display the frame; the notebook shows a truncated view (first and last rows).
df_data_batch_run_data

Unnamed: 0,merchant_id,merchant_name,matched_merchant_id,matched_merchant_name
0,1076618,ulta,7786726,ulta 323
1,1076618,ulta,7731665,ulta 225
2,1076618,ulta,7837915,ulta 757
3,1076618,ulta,7749578,ulta 582
4,1076618,ulta,7737771,ulta 495
...,...,...,...,...
71828,1040542,sonic,8350137,return sonic
71829,1040542,sonic,9068227,favor sonic dri
71830,1040542,sonic,7897381,76 sonic food m
71831,1040542,sonic,2504938,tx3206 sonic dri


In [35]:
# First 30 rows by position.
df_data_batch_run_data.head(30)

Unnamed: 0,merchant_id,merchant_name,matched_merchant_id,matched_merchant_name
0,1076618,ulta,7786726,ulta 323
1,1076618,ulta,7731665,ulta 225
2,1076618,ulta,7837915,ulta 757
3,1076618,ulta,7749578,ulta 582
4,1076618,ulta,7737771,ulta 495
5,1076618,ulta,7743049,ulta 390
6,1076618,ulta,7790975,ulta 358
7,1076618,ulta,7762925,ulta 697
8,1076618,ulta,7799840,ulta 239
9,1076618,ulta,7791626,ulta 154
