In [1]:
import pandas as pd
import numpy as np
import random

In [50]:
def gerar_dataframe_amostra(
    num_rows,
    sender_pool=None,
    receiver_pool=None,
    amount_range=(1000, 50000),
    date_range=('2023-01-01', '2023-12-31'),
    fraud_fraction=0.3,
    fraud_amount_range=(100, 350),
):
    """Generate a DataFrame of simulated transactions with a fraud flag.

    Each row gets a random sender, a random receiver that differs from the
    sender, a random integer amount and a random day inside ``date_range``.
    A fraction of the rows is then marked as fraud and given an amount drawn
    from ``fraud_amount_range`` instead.

    Args:
        num_rows: Number of rows (transactions) to generate.
        sender_pool: Possible senders. Defaults to the letters 'A'..'Z'.
        receiver_pool: Possible receivers. Defaults to a copy of
            ``sender_pool``.
        amount_range: ``(min, max)`` inclusive bounds for the amount of a
            non-fraud transaction.
        date_range: ``(start, end)`` bounds of the timestamp range; one day
            of that range is picked per row.
        fraud_fraction: Share of rows (between 0 and 1) flagged as fraud;
            exactly ``int(num_rows * fraud_fraction)`` distinct rows are
            flagged.
        fraud_amount_range: ``(min, max)`` inclusive bounds for the amount of
            a fraud transaction.

    Returns:
        A pandas DataFrame with the columns 'sender', 'receiver', 'amount',
        'timestamp' and 'fraud_label' (0.0 for regular rows, 1.0 for fraud).

    Raises:
        ValueError: If ``sender_pool`` is empty, if some sender has no
            receiver different from itself, or if ``fraud_fraction`` is
            outside ``[0, 1]``.
    """
    if sender_pool is None:
        sender_pool = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
    if receiver_pool is None:
        receiver_pool = list(sender_pool)
    if not sender_pool:
        raise ValueError("sender_pool must not be empty")
    if not 0 <= fraud_fraction <= 1:
        raise ValueError("fraud_fraction must be between 0 and 1")

    # For every sender, precompute the receivers it may transact with.
    # Drawing from this list guarantees sender != receiver without a retry
    # loop (a retry loop can spin forever when no alternative exists).
    alternatives = {}
    for sender in sender_pool:
        if sender in alternatives:
            continue
        options = [receiver for receiver in receiver_pool if receiver != sender]
        if not options:
            raise ValueError(
                f"receiver_pool has no receiver different from sender {sender!r}"
            )
        alternatives[sender] = options

    # Generate the random columns.
    senders = [random.choice(sender_pool) for _ in range(num_rows)]
    receivers = [random.choice(alternatives[sender]) for sender in senders]
    amounts = [random.randint(*amount_range) for _ in range(num_rows)]
    start_date, end_date = date_range
    timestamps = pd.to_datetime(
        np.random.choice(pd.date_range(start=start_date, end=end_date), size=num_rows)
    )
    # np.zeros yields floats, so the label column holds 0.0 / 1.0.
    fraud_label = np.zeros(num_rows)

    # Sample the fraud rows WITHOUT replacement so the requested number of
    # distinct rows is flagged (sampling with replacement can hit the same
    # row twice and under-deliver).
    num_fraud = int(num_rows * fraud_fraction)
    if num_fraud > 0:
        for i in np.random.choice(num_rows, size=num_fraud, replace=False):
            amounts[i] = random.randint(*fraud_amount_range)
            fraud_label[i] = 1

    return pd.DataFrame({
        'sender': senders,
        'receiver': receivers,
        'amount': amounts,
        'timestamp': timestamps,
        'fraud_label': fraud_label
    })


In [49]:
# Scratch check: elements of np.zeros(...) are float64, so a label column
# initialised with np.zeros holds floats (0.0) rather than integers.
np.zeros(10)[1]

np.float64(0.0)

In [26]:
def generate_similar_dataset(size):
    """
    Generate a random transaction dataset with sender, receiver, amount and timestamp.

    Senders and receivers are drawn from the letters 'A'..'J', and the receiver
    of a row is always different from its sender. Amounts are integers in
    [300, 5000] and timestamps are days of the year 2023.
    NOTE(review): the value ranges presumably mirror an earlier sample dataset
    that is not visible here - confirm against that data.

    Parameters:
        size (int): The number of rows in the generated dataset.

    Returns:
        pd.DataFrame: A DataFrame with the columns 'sender', 'receiver',
        'amount' and 'timestamp'.
    """
    # Base characteristics of the generated data.
    senders = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
    receivers = ['B', 'C', 'A', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
    amount_min, amount_max = 300, 5000  # inclusive bounds
    start_date, end_date = '2023-01-01', '2023-12-31'

    sender_column = [random.choice(senders) for _ in range(size)]
    # Ensure sender != receiver up front by drawing only from the receivers
    # that differ from the row's sender. (The data is still a plain dict at
    # this point, so DataFrame-style `.loc` access is not available.)
    receiver_column = [
        random.choice([candidate for candidate in receivers if candidate != sender])
        for sender in sender_column
    ]

    return pd.DataFrame({
        'sender': sender_column,
        'receiver': receiver_column,
        # randint's upper bound is exclusive, hence the + 1.
        'amount': np.random.randint(amount_min, amount_max + 1, size=size),
        'timestamp': pd.to_datetime(np.random.choice(
            pd.date_range(start=start_date, end=end_date), size=size)),
    })

In [7]:
# Build a 15-row sample and show it as the cell output.
new_dataset = generate_similar_dataset(15)
new_dataset

Unnamed: 0,sender,receiver,amount,timestamp
0,C,F,3495,2023-07-06
1,D,F,1938,2023-03-03
2,F,C,3086,2023-12-09
3,J,D,4113,2023-09-23
4,C,E,2021,2023-08-06
5,A,E,3751,2023-12-05
6,D,I,463,2023-04-08
7,J,H,4861,2023-01-24
8,J,A,357,2023-08-21
9,C,B,3752,2023-07-27


In [32]:

# Usage example: inputs for the sample-transaction generator.
# NOTE(review): the gerar_dataframe_amostra call further down passes only
# `num_rows`; the pools and ranges defined here are not passed to it -
# confirm whether they are still needed.
num_rows = 15
sender_pool = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
receiver_pool = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
amount_range = (100, 5000)
date_range = ('2023-01-01', '2023-12-31')



In [51]:
# Generate the simulated transactions (including the fraud_label column).
df = gerar_dataframe_amostra(num_rows)


In [52]:
# Show the generated transactions as the cell output.
df

Unnamed: 0,sender,receiver,amount,timestamp,fraud_label
0,C,N,208,2023-05-26,1.0
1,E,N,47741,2023-09-23,0.0
2,U,B,202,2023-03-26,1.0
3,L,E,8107,2023-02-13,0.0
4,R,Q,18107,2023-10-26,0.0
5,M,R,5832,2023-12-14,0.0
6,W,M,35841,2023-05-10,0.0
7,O,X,38881,2023-02-27,0.0
8,J,V,6550,2023-06-03,0.0
9,V,B,38013,2023-09-14,0.0
