# Inicialización

In [1]:
! pip install -U transformers datasets

Collecting transformers
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2

Importación de las librerías principales

In [2]:
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from datasets import Dataset
import random
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import accelerate

In [3]:
print("CUDA disponible:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("Nombre de la GPU:", torch.cuda.get_device_name(0))
    print("Versión de CUDA:", torch.version.cuda)
else:
    print("No se detectó GPU o CUDA disponible.")

CUDA disponible: True
Nombre de la GPU: Tesla T4
Versión de CUDA: 12.1


In [None]:
if torch.cuda.is_available():
    x = torch.tensor([1.0, 2.0, 3.0], device='cuda')
    print("Tensor en GPU:", x)
else:
    print("No se detectó GPU.")

Tensor en GPU: tensor([1., 2., 3.], device='cuda:0')


In [4]:
df = pd.read_csv('data_t.csv', sep=',', decimal='.')

In [5]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,transaction_xml,transaction_html,transaction_json,transaction_natural
0,330,CASH_OUT,332720.28,C1391473624,57315.0,0.0,C1382509198,2232.13,334952.41,0,<transaction>\n <step>330</step>\n <type>CAS...,<table>\n <tr><th>Step</th><td>330</td></tr>\...,"{\n ""step"": 330,\n ""type"": ""CASH_OUT"",\n ""a...",Transaction type CASH_OUT for 332720.28 units ...
1,302,CASH_IN,108062.81,C247388028,19941838.81,20049901.62,C1736451288,4001545.8,7745868.38,0,<transaction>\n <step>302</step>\n <type>CAS...,<table>\n <tr><th>Step</th><td>302</td></tr>\...,"{\n ""step"": 302,\n ""type"": ""CASH_IN"",\n ""am...",Transaction type CASH_IN for 108062.81 units f...
2,304,TRANSFER,19730215.68,C319491154,120855.99,0.0,C778559237,21036566.97,40766782.65,0,<transaction>\n <step>304</step>\n <type>TRA...,<table>\n <tr><th>Step</th><td>304</td></tr>\...,"{\n ""step"": 304,\n ""type"": ""TRANSFER"",\n ""a...",Transaction type TRANSFER for 19730215.68 unit...
3,328,CASH_OUT,395114.5,C845451933,0.0,0.0,C929507855,1291600.37,1686714.87,0,<transaction>\n <step>328</step>\n <type>CAS...,<table>\n <tr><th>Step</th><td>328</td></tr>\...,"{\n ""step"": 328,\n ""type"": ""CASH_OUT"",\n ""a...",Transaction type CASH_OUT for 395114.5 units f...
4,259,PAYMENT,10639.48,C1715932058,25629.0,14989.52,M747000377,0.0,0.0,0,<transaction>\n <step>259</step>\n <type>PAY...,<table>\n <tr><th>Step</th><td>259</td></tr>\...,"{\n ""step"": 259,\n ""type"": ""PAYMENT"",\n ""am...",Transaction type PAYMENT for 10639.48 units fr...


In [6]:
seed=42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Entrenamiento LLM basado en distilBERT

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Usando el dispositivo: {device}')

Usando el dispositivo: cuda


In [8]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

In [9]:
def train_model_for_column(column_name):
    print(f'\nEntrenando modelo para la columna: {column_name}')

    # Usar la columna correspondiente como texto de entrada
    df['transaction_text'] = df[column_name]
    df['label'] = df['isFraud'].astype(int)

    # Dividir el dataset en entrenamiento y prueba con estratificación
    train_df, test_df = train_test_split(
        df[['transaction_text', 'label']],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Crear datasets de Hugging Face
    dataset_train = Dataset.from_pandas(train_df.reset_index(drop=True))
    dataset_test = Dataset.from_pandas(test_df.reset_index(drop=True))

    # Cargar el tokenizer y el modelo de DistilBERT
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
    model.to(device)

    # Tokenizar los datos
    def tokenize_function(examples):
        return tokenizer(examples['transaction_text'], padding='max_length', truncation=True, max_length=256)

    tokenized_train = dataset_train.map(tokenize_function, batched=True)
    tokenized_test = dataset_test.map(tokenize_function, batched=True)

    # Eliminar columnas innecesarias
    tokenized_train = tokenized_train.remove_columns(['transaction_text'])
    tokenized_test = tokenized_test.remove_columns(['transaction_text'])

    # Configurar el entrenamiento
    training_args = TrainingArguments(
        output_dir=f'./results_{column_name}',
        evaluation_strategy='epoch',
        save_strategy='epoch',
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=4,
        weight_decay=0.01,
        logging_dir=f'./logs_{column_name}',
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        greater_is_better=True,
        save_total_limit=2,
        fp16=torch.cuda.is_available(),
        report_to='tensorboard',
    )

    # Configurar early stopping
    early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

    # Crear el Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )

    # Entrenar el modelo
    trainer.train()

    # Evaluar el modelo
    eval_results = trainer.evaluate()

    # Imprimir los resultados inmediatamente después de la evaluación
    print(f"\nResultados para el formato {column_name}:")
    print(f"Exactitud: {eval_results['eval_accuracy']:.4f}")
    print(f"Precisión: {eval_results['eval_precision']:.4f}")
    print(f"Recall: {eval_results['eval_recall']:.4f}")
    print(f"F1 Score: {eval_results['eval_f1']:.4f}")
    print(f"Pérdida: {eval_results['eval_loss']:.4f}")

    # Guardar el modelo entrenado y el tokenizer
    model.save_pretrained(f'./trained_model_{column_name}')
    tokenizer.save_pretrained(f'./trained_model_{column_name}')

    # Liberar memoria
    del model
    del tokenizer
    torch.cuda.empty_cache()

In [10]:
train_model_for_column('transaction_natural')


Entrenando modelo para la columna: transaction_natural


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13480 [00:00<?, ? examples/s]

Map:   0%|          | 0/3371 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0171,0.006623,0.99911,1.0,0.998171,0.999085
2,0.0174,0.013918,0.997923,0.998778,0.996951,0.997864
3,0.0163,0.007112,0.998813,1.0,0.997561,0.998779



Resultados para el formato transaction_natural:
Exactitud: 0.9991
Precisión: 1.0000
Recall: 0.9982
F1 Score: 0.9991
Pérdida: 0.0066


In [11]:
train_model_for_column('transaction_html')


Entrenando modelo para la columna: transaction_html


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13480 [00:00<?, ? examples/s]

Map:   0%|          | 0/3371 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.043,0.021882,0.99644,0.995134,0.997561,0.996346
2,0.0224,0.006091,0.99911,1.0,0.998171,0.999085
3,0.0222,0.006035,0.99911,1.0,0.998171,0.999085
4,0.0144,0.006224,0.99911,1.0,0.998171,0.999085



Resultados para el formato transaction_html:
Exactitud: 0.9991
Precisión: 1.0000
Recall: 0.9982
F1 Score: 0.9991
Pérdida: 0.0061


In [12]:
train_model_for_column('transaction_json')


Entrenando modelo para la columna: transaction_json


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13480 [00:00<?, ? examples/s]

Map:   0%|          | 0/3371 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0207,0.008151,0.998813,1.0,0.997561,0.998779
2,0.019,0.008102,0.998813,0.999389,0.998171,0.99878
3,0.0157,0.007779,0.998517,0.999389,0.997561,0.998474
4,0.0077,0.007147,0.998813,0.999389,0.998171,0.99878



Resultados para el formato transaction_json:
Exactitud: 0.9988
Precisión: 0.9994
Recall: 0.9982
F1 Score: 0.9988
Pérdida: 0.0081


In [13]:
train_model_for_column('transaction_xml')


Entrenando modelo para la columna: transaction_xml


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13480 [00:00<?, ? examples/s]

Map:   0%|          | 0/3371 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0305,0.00596,0.99911,1.0,0.998171,0.999085
2,0.0241,0.005945,0.99911,1.0,0.998171,0.999085
3,0.0175,0.005204,0.99911,1.0,0.998171,0.999085



Resultados para el formato transaction_xml:
Exactitud: 0.9991
Precisión: 1.0000
Recall: 0.9982
F1 Score: 0.9991
Pérdida: 0.0060


# Entrenamiento regresión logística

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.linear_model import LogisticRegression
import tensorflow as tf

In [None]:
df = pd.read_csv('data_t.csv', sep=',', decimal='.')

In [None]:
# Crear instancias de LabelEncoder para cada columna categórica
label_encoder_type = LabelEncoder()
label_encoder_orig = LabelEncoder()
label_encoder_dest = LabelEncoder()

# Codificar las columnas categóricas
df['type_encoded'] = label_encoder_type.fit_transform(df['type'])
df['nameOrig_encoded'] = label_encoder_orig.fit_transform(df['nameOrig'])
df['nameDest_encoded'] = label_encoder_dest.fit_transform(df['nameDest'])

In [None]:
features = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'type_encoded', 'nameOrig_encoded', 'nameDest_encoded']

X = df[features]
y = df['isFraud']

seed = 42

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y)

In [None]:
numeric_features = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
scaler = StandardScaler()

# Aplicar el escalado al conjunto de entrenamiento y luego al de prueba
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

In [None]:
# Entrenar modelo
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Realizar predicciones
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

In [None]:
print(f"Exactitud: {accuracy:.4f}")
print(f"Precisión: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Exactitud: 0.8980
Precisión: 0.9355
Recall: 0.8488
F1 Score: 0.8900


# Entrenamiento red neuronal

In [None]:
# Crear instancias de LabelEncoder para cada columna categórica
label_encoder_orig = LabelEncoder()
label_encoder_dest = LabelEncoder()
df['nameOrig_encoded'] = label_encoder_orig.fit_transform(df['nameOrig'])
df['nameDest_encoded'] = label_encoder_dest.fit_transform(df['nameDest'])

# Aplicar OneHotEncoding a la columna 'type'
df = pd.get_dummies(df, columns=['type'])

In [None]:
# Seleccionar las características relevantes
features = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest',
            'nameOrig_encoded', 'nameDest_encoded'] + [col for col in df.columns if col.startswith('type_')]
target = 'isFraud'

X = df[features]
y = df[target]

In [None]:
# Escalar las características numéricas
scaler = StandardScaler()
X = scaler.fit_transform(X)

seed=42

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y)

In [None]:
# Crear el modelo
model = tf.keras.models.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compilar el modelo
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Entrenar el modelo
history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2, verbose=1)



Epoch 1/20
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.7656 - loss: 0.4867 - val_accuracy: 0.8654 - val_loss: 0.2895
Epoch 2/20
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8740 - loss: 0.2727 - val_accuracy: 0.8909 - val_loss: 0.2307
Epoch 3/20
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8965 - loss: 0.2185 - val_accuracy: 0.8991 - val_loss: 0.2119
Epoch 4/20
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9041 - loss: 0.2093 - val_accuracy: 0.9073 - val_loss: 0.1987
Epoch 5/20
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9101 - loss: 0.1990 - val_accuracy: 0.9069 - val_loss: 0.1946
Epoch 6/20
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9082 - loss: 0.1996 - val_accuracy: 0.9117 - val_loss: 0.1901
Epoch 7/20
[1m169/169[0m 

In [None]:
# Evaluar el modelo
y_pred_train = (model.predict(X_train) > 0.5).astype("int32")
y_pred_test = (model.predict(X_test) > 0.5).astype("int32")

# Calcular métricas
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_test, average='binary')

[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [None]:
print(f"Exactitud: {test_accuracy:.4f}")
print(f"Precisión: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Exactitud: 0.9362
Precisión: 0.9564
Recall: 0.9104
F1 Score: 0.9328
