In [1]:
import numpy as np
import pandas as pd

In [2]:
political_bias_data = pd.read_csv("../../raw/political_bias.csv")
political_bias_data.head()

Unnamed: 0,text,label
0,The government should reduce taxes significant...,0
1,Cutting welfare programs is necessary to encou...,0
2,A strong military is essential for national se...,0
3,Deregulation is crucial to promote free market...,0
4,Traditional family values should be upheld to ...,0


In [3]:
political_bias_data = political_bias_data.sample(frac=1, random_state=42)

In [4]:
political_bias_data.head()

Unnamed: 0,text,label
627,Moderate wage policies can raise incomes witho...,3
271,Balanced education funding should support both...,1
290,Economic development should be pursued while e...,2
63,Tax credits for research and development can d...,1
302,Public investment should balance economic grow...,2


In [5]:
from transformers import MarianMTModel, MarianTokenizer


In [6]:
model_name = 'Helsinki-NLP/opus-mt-en-es'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)#.to('cuda')



In [7]:
# Oración de prueba
sentence = "This is a rusty spotted cat."

# Traducir la oración
inputs = tokenizer([sentence], return_tensors="pt", padding=True)
translated = model.generate(**inputs)
translated_sentence = tokenizer.decode(translated[0], skip_special_tokens=True)

In [8]:
print(translated_sentence)

Este es un gato manchado oxidado.


In [9]:
def translate_batch(batch):
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

In [10]:
def translate_batch(batch):
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

In [11]:
def save_translation(batch_number, translated_batch):
    output_file = "traduccion_bias/translated_batch_"+ str(batch_number)+".csv"
    df_translated = pd.DataFrame(translated_batch, columns=['translated_statement'])
    df_translated.to_csv(output_file, index=False)
    print(f"Batch {batch_number} guardado en {output_file}")

In [12]:
# Traducir la columna 'statement' por lotes para no sobrecargar la memoria
# Preparar batches
batch_size = 128
batches = [(i//batch_size, political_bias_data['text'][i:i+batch_size].tolist()) for i in range(0, len(political_bias_data), batch_size)]

In [14]:
def process_batch(batch_data):
    batch_number, batch = batch_data
    translated_batch = translate_batch(batch)
    save_translation(batch_number, translated_batch)


for batch_data in batches:
    batch_number, batch = batch_data
    if batch_number <= 0:
        continue
    print("Procesando el batch: ", batch_number)
    translated_batch = translate_batch(batch)
    save_translation(batch_number, translated_batch)
    

Procesando el batch:  1
Batch 1 guardado en traduccion_bias/translated_batch_1.csv
Procesando el batch:  2
Batch 2 guardado en traduccion_bias/translated_batch_2.csv
Procesando el batch:  3
Batch 3 guardado en traduccion_bias/translated_batch_3.csv
Procesando el batch:  4
Batch 4 guardado en traduccion_bias/translated_batch_4.csv
Procesando el batch:  5
Batch 5 guardado en traduccion_bias/translated_batch_5.csv
