### Imports
---

In [1]:
import pandas as pd
from faker import Faker
import random
import pandas as pd
from tqdm import tqdm

### Load data
---

In [None]:
dataframe = pd.read_csv("src/data/HI-Small_Trans.csv")

In [None]:
dataframe.head(2)

In [None]:
dataframe.shape

### Transformations
---

In [None]:
filtered_dataframe = dataframe[dataframe["Amount Paid"] == dataframe["Amount Received"]]

In [None]:
filtered_dataframe.columns = filtered_dataframe.columns.str.lower()
filtered_dataframe.columns = filtered_dataframe.columns.str.replace(' ', "_")

In [None]:
filtered_dataframe[["from_bank", "to_bank"]] = filtered_dataframe[["from_bank", "to_bank"]].astype(str)

In [None]:
filtered_dataframe.head(2)

In [None]:
filtered_dataframe["sender"] = filtered_dataframe["from_bank"] + "_" + filtered_dataframe["account"]
filtered_dataframe["receiver"] = filtered_dataframe["to_bank"] + "_" + filtered_dataframe["account.1"]

In [None]:
filtered_dataframe = filtered_dataframe[[
 "timestamp",
 "sender",
 "receiver",
 "amount_received",
 "receiving_currency",
 "amount_paid",
 "payment_currency",
 "payment_format",
 "is_laundering",
]]

In [None]:
filtered_dataframe = filtered_dataframe.query("sender != receiver")

In [None]:
filtered_dataframe.to_csv("src/data/full_transactions_data.csv", index=False)

In [None]:
filtered_dataframe = filtered_dataframe.groupby(["sender", "receiver"], as_index=False).agg({"amount_paid":  "sum"})
filtered_dataframe.columns = ['sender','receiver','sum']
filtered_dataframe = filtered_dataframe.rename(columns={"sum": "value"})
filtered_dataframe["title"] = filtered_dataframe.apply(lambda df: f"from: {df.sender}\nto: {df.receiver}\namount (BRL): {df.value}", axis=1)

In [None]:
filtered_dataframe.to_csv("src/data/network_analysis_data.csv", index=False)

### Generate fake customer data
---

In [7]:
all_clients = pd.read_csv("src/data/full_transactions_data.csv")

In [8]:
all_clients = all_clients.groupby(by="sender", as_index=False).timestamp.count().sort_values("timestamp", ascending=False)

In [9]:
all_clients_id = set(all_clients["sender"])

In [3]:
fake = Faker()
Faker.seed(42)

In [6]:
fake.phone_number()

'001-740-326-5423'

In [10]:
data = []
bar = tqdm(total=len(all_clients_id))
for client_id in all_clients_id:
    location = fake.location_on_land()
    data.append({
        "account_id": client_id,
        "name": fake.name(),
        "email": fake.email(),
        "mobile": fake.phone_number(),
        "location": f"{location[2]} - {location[3]} - {location[4]}",
        "registration_date": fake.date_this_year(),
    })
    bar.update(1)
df = pd.DataFrame(data)

100%|█████████▉| 305614/305756 [01:18<00:00, 3908.23it/s]

In [11]:
df.head()

Unnamed: 0,account_id,name,email,mobile,location,registration_date
0,343224_810086CB0,Ian Cooper,lindsay78@example.org,(518)349-5931x0341,Markham - CA - America/Toronto,2025-01-19
1,313465_80B5360C0,Roy Martin,jason41@example.net,+1-228-732-7648x3503,Saint-Omer - FR - Europe/Paris,2025-01-20
2,117_80E296B70,Thomas Bradley,jason76@example.net,724.523.8849x696,el Camp de l'Arpa del Clot - ES - Europe/Madrid,2025-01-07
3,2439_810E3CA30,Peter Callahan Jr.,laurahenderson@example.org,(669)878-4801x8451,Eisen - KR - Asia/Seoul,2025-01-08
4,1024_800ECB1A0,Nathan Cortez,williamrodriguez@example.net,289-332-5288x0957,Agrigento - IT - Europe/Rome,2025-02-15


In [12]:
df.to_csv('src/data/accounts_details.csv', index=False)