In [1]:
# Cell 1: Imports plus Bedrock client and S3 path configuration
import boto3
import pandas as pd
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Bedrock runtime client (us-east-1) and the model ID sent with every classification request.
BEDROCK = boto3.client('bedrock-runtime', region_name='us-east-1')
MODEL_ID = 'amazon.nova-micro-v1:0'

# S3 locations: the CSV that is read as input and the CSV that receives the classified rows.
# NOTE(review): "credir" in both object keys looks like a typo for "credit"; it is left
# untouched because the keys must match the objects in the bucket — confirm before renaming.
S3_BUCKET = 'preauth-challenge-ai-20252'
S3_INPUT = f's3://{S3_BUCKET}/generated_data/credir_risk_reto_generated.csv'
S3_OUTPUT = f's3://{S3_BUCKET}/final_data/credir_risk_reto_classified.csv'


In [2]:
# Cell 2: Load the dataset that already contains the text descriptions
# The s3:// URL is handed straight to pandas.
# NOTE(review): this presumably relies on s3fs being installed in the kernel — confirm.
df = pd.read_csv(S3_INPUT)
# Quick sanity check: row count and the first three rows.
print(f"Registros cargados: {len(df)}")
print(df.head(3))

Registros cargados: 1000
   Age     Sex  Job Housing Saving accounts Checking account  Credit amount  \
0   67    male    2     own             NaN           little           1169   
1   22  female    2     own          little         moderate           5951   
2   49    male    1     own          little              NaN           2096   

   Duration    Purpose                                        Description  
0         6   radio/TV  The 67-year-old male customer, employed and re...  
1        48   radio/TV  The credit risk profile for this 22-year-old f...  
2        12  education  The credit risk profile for this 49-year-old m...  


In [3]:
# Cell 3: Classification and parallelization function
import time
import random

def _is_throttling_error(exc):
    """Return True when ``exc`` represents a Bedrock throttling error.

    Prefers the structured error code that botocore-style exceptions expose
    via ``exc.response['Error']['Code']`` and falls back to the original
    substring check on the exception text.
    """
    response = getattr(exc, 'response', None)
    if isinstance(response, dict):
        error = response.get('Error')
        if isinstance(error, dict) and error.get('Code') == 'ThrottlingException':
            return True
    return 'ThrottlingException' in str(exc)


def classify_credit_risk(description, client, model_id, max_retries=5, temperature=0.0):
    """Classify a credit description as 'good risk' or 'bad risk' via Bedrock Converse.

    Args:
        description: Free-text description of the customer's credit profile.
        client: Object exposing a ``converse(modelId=..., messages=...,
            inferenceConfig=...)`` method (a ``bedrock-runtime`` client).
        model_id: Model identifier forwarded as ``modelId``.
        max_retries: Maximum number of API attempts when the call is throttled.
        temperature: Sampling temperature. Defaults to 0.0 so that the same
            description yields the same label across runs.

    Returns:
        One of:
        - 'good risk' / 'bad risk': the parsed label.
        - 'error_no_description': input is not a string or has fewer than
          10 non-blank characters.
        - 'error_parsing': the response had an unexpected shape or the text
          was not one of the two labels.
        - 'error_api_call': the API raised a non-throttling error.
        - 'error_max_retries': every attempt was throttled.
    """
    # Measure the stripped text so whitespace-only input is rejected as well.
    if not isinstance(description, str) or len(description.strip()) < 10:
        return 'error_no_description'

    prompt = (
        "Your only task is to classify the provided text. "
        "You must only respond strictly with 'good risk' or 'bad risk'.\n\n"
        f"Description:\n\"{description}\""
    )
    msg = {'role': 'user', 'content': [{'text': prompt}]}

    for attempt in range(max_retries):
        try:
            resp = client.converse(
                modelId=model_id,
                messages=[msg],
                inferenceConfig={'maxTokens': 5, 'temperature': temperature, 'topP': 0.9}
            )
        except Exception as e:
            if not _is_throttling_error(e):
                print(f"Error inesperado en la API: {e}")
                return 'error_api_call'
            # No retry follows the last attempt, so do not waste time sleeping.
            if attempt == max_retries - 1:
                break
            # Exponential backoff with jitter to spread out concurrent workers.
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            print(f"ThrottlingException detectada. Reintentando en {wait_time:.2f} segundos...")
            time.sleep(wait_time)
            continue

        # A malformed payload is a parsing problem, not an API failure.
        try:
            raw = resp['output']['message']['content'][0]['text']
        except (KeyError, IndexError, TypeError):
            return 'error_parsing'
        if not isinstance(raw, str):
            return 'error_parsing'

        # Tolerate surrounding quotes / trailing punctuation ("Good risk.", "'bad risk'").
        lbl = raw.strip().strip('\'"`.,!').strip().lower()
        return lbl if lbl in ('good risk', 'bad risk') else 'error_parsing'

    return 'error_max_retries'

def process_item(pair):
    """Classify one ``(index, row)`` pair and return ``(index, label)``.

    The row's ``Description`` attribute is sent to ``classify_credit_risk``
    using the module-level ``BEDROCK`` client and ``MODEL_ID``.
    """
    row_index, record = pair
    return row_index, classify_credit_risk(record.Description, BEDROCK, MODEL_ID)

def classify_parallel(df, workers=10):
    """Classify every row of ``df`` concurrently and return labels in row order.

    Each ``(index, row)`` pair from ``df.iterrows()`` is submitted to
    ``process_item`` on a thread pool. Results are stored by the row's
    *position* in ``df`` rather than by its index label, so the returned list
    lines up with ``df`` even when the index is not a default 0..n-1 range
    (for example after filtering, or with string labels).

    Args:
        df: DataFrame whose rows are passed to ``process_item``.
        workers: Maximum number of worker threads.

    Returns:
        A list with ``len(df)`` labels, one per row, in row order.

    Raises:
        Any exception raised by ``process_item`` is re-raised here through
        ``Future.result()``.
    """
    total = len(df)
    print(f"Iniciando clasificación de {total} descripciones en paralelo...")
    labels = [None] * total
    with ThreadPoolExecutor(max_workers=workers) as exe:
        # Remember each future's row position; completion order is arbitrary.
        futures = {
            exe.submit(process_item, item): position
            for position, item in enumerate(df.iterrows())
        }
        count = 0
        for f in as_completed(futures):
            _, lbl = f.result()
            labels[futures[f]] = lbl
            count += 1
            if count % 100 == 0 or count == total:
                print(f"  Procesadas {count}/{total}")
    return labels


In [4]:
# Classify every description and store the labels in a new 'Target' column.
# This mutates df in place, so re-running the cell re-issues every API call.
df['Target'] = classify_parallel(df, workers=10)
print("Distribución de etiquetas:")
# The counts also reveal any 'error_*' sentinel labels returned by classify_credit_risk.
print(df.Target.value_counts())

# Write the classified frame to S3 without the index column.
# NOTE(review): rows labelled with 'error_*' sentinels are persisted as-is; consider
# retrying or filtering them before saving.
df.to_csv(S3_OUTPUT, index=False) 

print(f"Archivo final guardado en: {S3_OUTPUT}")
print(f"Filas finales: {len(df)}")


Iniciando clasificación de 1000 descripciones en paralelo...
ThrottlingException detectada. Reintentando en 1.22 segundos...
ThrottlingException detectada. Reintentando en 1.36 segundos...
ThrottlingException detectada. Reintentando en 1.46 segundos...
ThrottlingException detectada. Reintentando en 1.95 segundos...
ThrottlingException detectada. Reintentando en 1.07 segundos...
ThrottlingException detectada. Reintentando en 1.99 segundos...
ThrottlingException detectada. Reintentando en 1.18 segundos...
ThrottlingException detectada. Reintentando en 2.61 segundos...
ThrottlingException detectada. Reintentando en 1.75 segundos...
ThrottlingException detectada. Reintentando en 1.85 segundos...
ThrottlingException detectada. Reintentando en 2.19 segundos...
ThrottlingException detectada. Reintentando en 2.65 segundos...
ThrottlingException detectada. Reintentando en 2.58 segundos...
ThrottlingException detectada. Reintentando en 1.91 segundos...
ThrottlingException detectada. Reintentando

In [5]:
# Display the classified DataFrame to inspect the new 'Target' column.
df

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Description,Target
0,67,male,2,own,,little,1169,6,radio/TV,"The 67-year-old male customer, employed and re...",good risk
1,22,female,2,own,little,moderate,5951,48,radio/TV,The credit risk profile for this 22-year-old f...,bad risk
2,49,male,1,own,little,,2096,12,education,The credit risk profile for this 49-year-old m...,bad risk
3,45,male,2,free,little,little,7882,42,furniture/equipment,"The 45-year-old male customer, employed in a s...",bad risk
4,53,male,2,free,little,little,4870,24,car,"The 53-year-old skilled male, with limited sav...",bad risk
...,...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,,1736,12,furniture/equipment,The credit risk profile for this 31-year-old f...,bad risk
996,40,male,3,own,little,little,3857,30,car,The credit risk profile for this 40-year-old m...,bad risk
997,38,male,2,own,little,,804,12,radio/TV,The credit risk profile for this 38-year-old s...,good risk
998,23,male,2,free,little,little,1845,45,radio/TV,The credit risk profile for this 23-year-old m...,good risk
