### Data source: https://www.kaggle.com/datasets/computingvictor/transactions-fraud-datasets/data

## Importing .csv files into 1,000-row DataFrames

In [1]:
import pandas as pd

# All three CSV files are sampled with the same reader settings: at most 1000 rows
# are read, bytes are decoded as latin1, and malformed lines are skipped
# (on_bad_lines='skip') instead of raising.
csv_sample_options = {'nrows': 1000, 'encoding': 'latin1', 'on_bad_lines': 'skip'}

df_cards_data = pd.read_csv('cards_data.csv', **csv_sample_options)
df_users_data = pd.read_csv('users_data.csv', **csv_sample_options)
df_transactions_data = pd.read_csv('transactions_data.csv', **csv_sample_options)

In [2]:
# Preview the first rows of the cards sample.
df_cards_data.head()

Unnamed: 0,id,client_id,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web
0,4524,825,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,2731,825,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,3701,825,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,42,825,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,4659,825,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No


In [3]:
# Take the first rows of the users sample, then order only those rows by id.
# NOTE(review): because head() runs before sort_values(), this is not the rows with
# the smallest ids in the whole frame — confirm that is the intended preview.
df_users_data.head().sort_values(by="id")

Unnamed: 0,id,current_age,retirement_age,birth_year,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards
3,708,63,63,1957,1,Female,3 Madison Street,40.71,-73.99,$163145,$249925,$202328,722,4
0,825,53,66,1966,11,Female,462 Rose Lane,34.15,-117.76,$29278,$59696,$127613,787,5
4,1164,43,70,1976,9,Male,9620 Valley Stream Drive,37.76,-122.44,$53797,$109687,$183855,675,1
2,1718,81,67,1938,11,Female,766 Third Drive,34.02,-117.89,$22681,$33483,$196,698,5
1,1746,53,68,1966,12,Female,3606 Federal Boulevard,40.76,-73.74,$37891,$77254,$191349,701,5


In [4]:
# Preview the first rows of the transactions sample.
df_transactions_data.head()

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors
0,7475327,2010-01-01 00:01:00,1556,2972,$-77.00,Swipe Transaction,59935,Beulah,ND,58523.0,5499,
1,7475328,2010-01-01 00:02:00,561,4575,$14.57,Swipe Transaction,67570,Bettendorf,IA,52722.0,5311,
2,7475329,2010-01-01 00:02:00,1129,102,$80.00,Swipe Transaction,27092,Vista,CA,92084.0,4829,
3,7475331,2010-01-01 00:05:00,430,2860,$200.00,Swipe Transaction,27092,Crown Point,IN,46307.0,4829,
4,7475332,2010-01-01 00:06:00,848,3915,$46.41,Swipe Transaction,13051,Harwood,MD,20776.0,5813,


## Importing .json files into 1,000-item DataFrames

### mcc_codes.json

In [5]:
import pandas as pd
import json

# 1. Read the JSON file into a Python dict.
with open('mcc_codes.json', 'r', encoding='utf-8') as f:
    mcc_dict = json.load(f)

# 2. Convert the dict into a DataFrame: one row per key/value pair, with the key in
#    'MCC' and the value in 'Description'.
# NOTE(review): JSON object keys are strings, so 'MCC' holds strings, while the
# transactions preview shows `mcc` as numbers — presumably a cast is needed before
# joining the two; confirm against the actual dtypes.
df_mcc_codes = pd.DataFrame(list(mcc_dict.items()), columns=['MCC', 'Description'])

# 3. Check the result. Ending the cell with the expression (rather than print())
#    lets the notebook render the frame with its rich table display.
df_mcc_codes.head()

    MCC                           Description
0  5812         Eating Places and Restaurants
1  5541                      Service Stations
2  7996  Amusement Parks, Carnivals, Circuses
3  5411          Grocery Stores, Supermarkets
4  4784                 Tolls and Bridge Fees


In [6]:
# (number of rows, number of columns) of the MCC lookup frame.
df_mcc_codes.shape

(109, 2)

### train_fraud_labels.json / train_fraud_labels.jsonl

In [7]:
import os

# Report the on-disk size, converting bytes to units of 1024*1024 bytes.
labels_size_bytes = os.path.getsize('train_fraud_labels.jsonl')
print("Fájl mérete MB-ban:", labels_size_bytes / (1024*1024))

# Read only the first 1000 raw bytes (binary mode, no decoding) to see how the
# file is laid out.
with open('train_fraud_labels.jsonl', 'rb') as f:
    chunk = f.read(1000)

print("Első 1000 byte:")
print(chunk)

Fájl mérete MB-ban: 287.7449836730957
Első 1000 byte:
b'{"id": "10649266", "label": "No"}\n{"id": "23410063", "label": "No"}\n{"id": "9316588", "label": "No"}\n{"id": "12478022", "label": "No"}\n{"id": "9558530", "label": "No"}\n{"id": "12532830", "label": "No"}\n{"id": "19526714", "label": "No"}\n{"id": "9906964", "label": "No"}\n{"id": "13224888", "label": "No"}\n{"id": "13749094", "label": "No"}\n{"id": "12303776", "label": "No"}\n{"id": "19480376", "label": "No"}\n{"id": "11716050", "label": "No"}\n{"id": "20025400", "label": "No"}\n{"id": "7661688", "label": "No"}\n{"id": "16662807", "label": "No"}\n{"id": "21419778", "label": "No"}\n{"id": "18011186", "label": "No"}\n{"id": "23289598", "label": "No"}\n{"id": "11644547", "label": "No"}\n{"id": "23235120", "label": "No"}\n{"id": "19748218", "label": "No"}\n{"id": "8720720", "label": "No"}\n{"id": "18335831", "label": "No"}\n{"id": "18936727", "label": "No"}\n{"id": "15223870", "label": "No"}\n{"id": "12370203", "label": "No"}\n{"id

In [8]:
# Inspect the file structure
import ijson

with open('train_fraud_labels.json', 'rb') as f:
    events = ijson.parse(f)

    # Each parse event is a (prefix, event, value) triple; print them one by one.
    for prefix, event, value in events:
        print(prefix, event, value)
        # Stop right after printing the first event whose prefix contains a '.',
        # which is enough to reveal the nesting of the file.
        if '.' in prefix:
            break


 start_map None
 map_key target
target start_map None
target map_key 10649266
target.10649266 string No


In [13]:
import pandas as pd

# Parse only the first five records of the line-delimited label file.
df_head = pd.read_json(
    "train_fraud_labels.jsonl",
    lines=True,  # treat each line as a separate JSON object
    nrows=5,     # stop after five lines
)
df_head

Unnamed: 0,id,label
0,10649266,No
1,23410063,No
2,9316588,No
3,12478022,No
4,9558530,No


In [1]:
import pandas as pd

# Load the first 100000 label records from the line-delimited file.
# NOTE(review): this cell reads from "data/", whereas the other cells in this
# notebook open their files without a directory prefix — confirm which location is
# correct so the notebook runs top to bottom from a fresh kernel.
df = pd.read_json(
    "data/train_fraud_labels.jsonl",
    lines=True,
    nrows=100000,
)

# Count missing values per column.
df.isna().sum()

id       0
label    0
dtype: int64