# Load your data

The code below downloads all the provided data from Google Drive. Sit back and relax while the notebook (for example, running on Google Colab) takes care of it.

In [1]:
"""%%capture"""

# Install and load the necessary libraries.
# NOTE: the install commands below sit inside a bare string literal, so they
# are never executed as magics; gdown (imported in this cell) has to be
# installed in the environment already, and presumably so does a parquet
# engine such as pyarrow for the later `pd.read_parquet` calls - TODO confirm.
"""%%pip install pyarrow
%pip install gdown"""

import gdown
import pandas as pd

In [2]:
# Google Drive file IDs for each dataset, keyed by dataset name
data = dict(
    bank="1dzL_SWBkBs5xrUxuGQTm04oe3USgkL9u",     # banking data
    sales="1QK-VgSU3AxXUw330KjYFUj8S9hzKJsG6",    # sales data
    mcc="1JN0bR84sgZ_o4wjKPBUmz45NeEEkVgt7",      # mcc description
)

In [3]:
# Fetch every file from Google Drive and save it locally as '<name>.parquet'
for name, file_id in data.items():
    url = f'https://drive.google.com/uc?id={file_id}'
    output_path = name + '.parquet'
    gdown.download(url, output_path, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1dzL_SWBkBs5xrUxuGQTm04oe3USgkL9u
To: c:\Users\felipe\Documents\Hackaton\C-shark-solution\bank.parquet
100%|██████████| 1.57M/1.57M [00:00<00:00, 3.81MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QK-VgSU3AxXUw330KjYFUj8S9hzKJsG6
To: c:\Users\felipe\Documents\Hackaton\C-shark-solution\sales.parquet
100%|██████████| 6.37M/6.37M [00:01<00:00, 5.97MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JN0bR84sgZ_o4wjKPBUmz45NeEEkVgt7
To: c:\Users\felipe\Documents\Hackaton\C-shark-solution\mcc.parquet
100%|██████████| 57.3k/57.3k [00:00<00:00, 783kB/s]


In [4]:
# Load each downloaded parquet file into a dictionary of pandas dataframes,
# keyed by dataset name
df = {name: pd.read_parquet(name + '.parquet') for name in data}

# View your data

You can access each dataframe inside the `df` dictionary by its key (`'bank'`, `'sales'`, or `'mcc'`), as shown below.

In [19]:
# Show the sales rows that have a non-empty card number, ordered by card number.
sales = df['sales']
card_present = sales["card_number"] != ""
# The mask is built on the unsorted frame; .loc aligns it to the sorted rows by index label.
sales.sort_values(by="card_number").loc[card_present, :]

Unnamed: 0,document_id,date_time,value,card_number,type,mcc,state
250724,8973801048960393711,2023-02-22 02:58:43.896700,607.20,1000164842220266112,Débito,5812,RS
109613,6125423161866271849,2023-04-14 23:05:29.130900,175.56,1000164842220266112,Débito,5812,PE
243475,5119714663190205160,2023-04-21 01:10:05.666500,275.96,1000184940349350528,Crédito,5812,ES
129669,3015231668776975996,2023-04-26 00:33:49.952200,40.00,1000216457854768000,Débito,5912,PB
77703,4619042854388500300,2023-05-21 05:44:51.224200,155.56,1000297234729290496,Débito,5921,RJ
...,...,...,...,...,...,...,...
151127,8648043700133268622,2022-12-25 16:00:49.663100,226.60,999604941954856320,Débito,5651,BA
13980,7267264743176775191,2022-12-09 20:16:18.602000,176.00,999681003681560704,Crédito,5812,RJ
14052,7267264743176775191,2022-12-05 19:13:23.868800,146.00,999688527568146048,Crédito,5812,RJ
236699,2277375648452898776,2023-04-12 19:31:49.705600,108.00,999979815675401344,Débito,5999,SP


In [6]:
# Display the banking dataframe stored under the 'bank' key.
df['bank']

Unnamed: 0,document_id,date_time,value,counterparty_document,type
0,5615027685943047372,2023-02-08 19:02:36.289545,400000.00,2701672467485454263,pix_in
1,6321206883189082161,2023-05-21 17:45:10.407340,330000.00,5674766186099233601,pix_in
2,6204525363384429949,2023-05-19 14:53:21.567099,200000.00,2193750750108086695,pix_out
3,6347736874608223396,2023-04-05 12:13:38.056087,200000.00,904790816053028747,pix_out
4,6347736874608223396,2023-04-07 23:44:04.727672,200000.00,904790816053028747,pix_out
...,...,...,...,...,...
66184,4176676849454697385,2023-01-17 12:27:44.012951,0.88,6015991367796145791,pix_in
66185,4176676849454697385,2022-12-23 16:38:35.784940,0.80,6015991367796145791,pix_out
66186,6367852746770439350,2023-01-09 05:49:23.584379,0.80,5566228780935135215,pix_in
66187,5952522905932206715,2022-12-21 22:14:41.099309,0.60,3467101161978958464,pix_in


In [7]:
# Display the MCC description dataframe stored under the 'mcc' key.
df['mcc']

Unnamed: 0,mcc,edited_description,combined_description,usda_description,irs_description,irs_reportable
0,742,Veterinary Services,Veterinary Services,Veterinary Services,Veterinary Services,Yes
1,763,Agricultural Co-operatives,Agricultural Co-operatives,Agricultural Co-operatives,Agricultural Cooperative,Yes
2,780,"Horticultural Services, Landscaping Services","Horticultural Services, Landscaping Services",Horticultural Services,Landscaping Services,Yes
3,1520,General Contractors-Residential and Commercial,General Contractors-Residential and Commercial,General Contractors-Residential and Commercial,General Contractors,Yes
4,1711,Air Conditioning Contractors – Sales and Insta...,Air Conditioning Contractors – Sales and Insta...,Air Conditioning Contractors – Sales and Insta...,"Heating, Plumbing, A/C",Yes
...,...,...,...,...,...,...
976,9405,Intra – Government Transactions,Intra – Government Transactions,Intra – Government Transactions,U.S. Federal Government Agencies or Departments,No1.6041-3(p)(3)
977,9700,Automated Referral Service ( For Visa Only),Automated Referral Service ( For Visa Only),Automated Referral Service ( For Visa Only),,
978,9701,Visa Credential Service ( For Visa Only),Visa Credential Service ( For Visa Only),Visa Credential Service ( For Visa Only),,
979,9702,GCAS Emergency Services ( For Visa Only),GCAS Emergency Services ( For Visa Only),GCAS Emergency Services ( For Visa Only),,


# Explore your data

You can experiment with your data to gain insights into customer behavior.

In [8]:
# Largest and average banking transaction value for each document_id
summary = (
    df['bank']
    .groupby('document_id')['value']
    .agg(['max', 'mean'])
)

summary

Unnamed: 0_level_0,max,mean
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1898616423937443,36640.00,853.016772
10825678225686420,5600.00,439.836197
40944759436046065,1436.00,266.282569
150845804022381629,16234.56,478.768867
272221220503584164,8240.00,650.473315
...,...,...
8836929473261798698,21125.44,1053.238347
8973801048960393711,40000.00,3789.038075
8997352755884024136,12000.00,902.479651
9132021237731236867,18898.64,702.663692


In [9]:
# Largest and average sales value for each document_id
# (this rebinds `summary`, replacing the banking aggregation)
summary = (
    df['sales']
    .groupby('document_id')['value']
    .agg(['max', 'mean'])
)
summary

Unnamed: 0_level_0,max,mean
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1898616423937443,451.40,28.742321
10825678225686420,200000.00,458.767231
40944759436046065,16000.00,272.798879
150845804022381629,30964453.32,16133.606750
272221220503584164,3680.00,165.718302
...,...,...
8836929473261798698,16246.64,212.363954
8973801048960393711,2228.60,434.943573
8997352755884024136,1850.24,243.359745
9132021237731236867,5880.00,104.118340


In [None]:
# Re-display `summary`, which holds whichever aggregation was assigned last.
summary