### Imports
---

In [1]:
import pandas as pd
from faker import Faker
import random
import pandas as pd
from tqdm import tqdm

### Load data
---

In [2]:
# Read the raw transactions CSV into memory; every later cell derives from this frame.
dataframe = pd.read_csv(filepath_or_buffer="src/data/HI-Small_Trans.csv")

In [3]:
# Display (row count, column count) of the loaded frame as a quick sanity check.
dataframe.shape

(5078345, 11)

### Transformations
---

In [4]:
# Normalise the column labels: lower-case them first, then turn spaces into underscores,
# so later cells can refer to e.g. "from_bank" and "payment_format".
dataframe.columns = dataframe.columns.str.lower().str.replace(" ", "_")

In [5]:
# Store both bank identifier columns as strings so they can be concatenated
# with the account columns when the sender/receiver keys are built.
for bank_column in ("from_bank", "to_bank"):
    dataframe[bank_column] = dataframe[bank_column].astype(str)

In [6]:
# Preview the first two rows to check the renamed columns and the string bank ids.
dataframe.head(2)

Unnamed: 0,timestamp,from_bank,account,to_bank,account.1,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0


In [7]:
# Compose "<bank>_<account>" keys for both sides of each transaction.
# "account" is the originating account column and "account.1" the destination one.
dataframe["sender"] = dataframe["from_bank"].str.cat(dataframe["account"], sep="_")
dataframe["receiver"] = dataframe["to_bank"].str.cat(dataframe["account.1"], sep="_")

In [8]:
# Keep every transaction whose payment format is anything other than "Reinvestment".
filtered_dataframe = dataframe.loc[dataframe["payment_format"].ne("Reinvestment")]

In [9]:
# Project onto the columns that are exported: the composite sender/receiver keys
# replace the separate bank and account columns, which are dropped here.
filtered_dataframe = filtered_dataframe.loc[
    :,
    [
        "timestamp",
        "sender",
        "receiver",
        "amount_received",
        "receiving_currency",
        "amount_paid",
        "payment_currency",
        "payment_format",
        "is_laundering",
    ],
]

In [10]:
# Persist the cleaned transaction-level data (without the index column);
# this file is read back later when the customer list is built.
filtered_dataframe.to_csv(
    path_or_buf="src/data/full_transactions_data.csv",
    index=False,
)

In [11]:
# Collapse the transaction log into one row per (sender, receiver) pair, with the
# total amount paid stored directly under the name "value" via named aggregation
# (no intermediate "sum" column / rename needed).
# NOTE(review): amounts are summed without regard to "payment_currency", and the
# title below labels the total as BRL — confirm that mixing currencies is intended.
filtered_dataframe = (
    filtered_dataframe
    .groupby(["sender", "receiver"], as_index=False)
    .agg(value=("amount_paid", "sum"))
)

# Build the multi-line label with vectorised string concatenation instead of a
# row-wise apply: same text per row, but far cheaper on a large frame and it
# yields a proper (empty) column when there are no rows.
# `.map(str)` renders each total exactly as an f-string placeholder would.
filtered_dataframe["title"] = (
    "from: " + filtered_dataframe["sender"]
    + "\nto: " + filtered_dataframe["receiver"]
    + "\namount (BRL): " + filtered_dataframe["value"].map(str)
)

In [12]:
# Write the aggregated sender -> receiver data (one row per pair) without the index column.
filtered_dataframe.to_csv(
    path_or_buf="src/data/network_analysis_data.csv",
    index=False,
)

### Generate fake customer data
---

In [None]:
# Reload the transaction-level export written earlier in this notebook.
all_clients = pd.read_csv(filepath_or_buffer="src/data/full_transactions_data.csv")

In [None]:
# Count transactions per sender (the count lands in the "timestamp" column)
# and order the senders from most to least active.
all_clients = (
    all_clients
    .groupby(by="sender", as_index=False)["timestamp"]
    .count()
    .sort_values("timestamp", ascending=False)
)

In [None]:
# Unique account ids that appear as a sender.
# NOTE(review): ids that only ever appear in the "receiver" column are not
# included here — confirm that is intended.
all_clients_id = set(all_clients["sender"].tolist())

In [None]:
# Create the Faker instance used to generate the synthetic customer records below.
fake = Faker()
# Fix the seed so the generated values are repeatable from run to run.
Faker.seed(42)

In [None]:
# Display one sample phone number to eyeball the output format.
# NOTE(review): this call presumably draws from the generator seeded above, so
# skipping or re-running this cell would shift the records generated below — confirm.
fake.phone_number()

In [None]:
# Build one synthetic profile per account id and collect them into `df`.
#
# Iterate in sorted order: `all_clients_id` is a set of strings, and the
# iteration order of such a set can differ between interpreter sessions (string
# hash randomisation). Without sorting, the seeded Faker stream would be paired
# with different accounts on every fresh kernel, defeating the fixed seed.
data = []
# Wrapping the iterable lets tqdm infer the total and close the bar on completion.
for client_id in tqdm(sorted(all_clients_id)):
    # Positions 2-4 of the returned tuple are joined into the location label
    # (presumably place name, country code and timezone — confirm against Faker docs).
    location = fake.location_on_land()
    data.append({
        "account_id": client_id,
        "name": fake.name(),
        "email": fake.email(),
        "mobile": fake.phone_number(),
        "location": f"{location[2]} - {location[3]} - {location[4]}",
        # NOTE(review): presumably relative to the current date, so this value can
        # differ between calendar days even with a fixed seed — confirm.
        "registration_date": fake.date_this_year(),
    })
df = pd.DataFrame(data)

In [None]:
# Preview the first generated account records.
df.head()

In [None]:
# Persist the synthetic account details without the index column.
df.to_csv(
    path_or_buf='src/data/accounts_details.csv',
    index=False,
)