In [1]:
import requests
import pandas as pd

In [2]:
df_train = pd.read_csv("../raw/political_ideologies/train.csv")
df_val = pd.read_csv("../raw/political_ideologies/validation.csv")
df_test = pd.read_csv("../raw/political_ideologies/test.csv")

# Junto todos los dataframes

df_political_ideology = pd.concat([df_train, df_val, df_test], axis=0)  



In [3]:
df_political_ideology

Unnamed: 0,statement,label,issue_type,__index_level_0__
0,"Climate change, and the escalating environment...",1,1,465
1,I believe in the foundational importance of th...,0,2,1191
2,I firmly believe that the principle of separat...,1,6,2440
3,I firmly believe in the separation of church a...,1,6,2406
4,I firmly believe in the power of free markets ...,0,0,1903
...,...,...,...,...
315,I firmly believe in the importance of creating...,1,2,2689
316,"In regards to immigration, I believe it's cruc...",0,5,3101
317,The issue of energy independence remains cruci...,0,3,1341
318,I firmly believe that our nation should uphold...,0,5,3110


In [7]:
len(df_political_ideology)

3200

In [4]:
# Indexes for 
# Economic Freedom: 0,1,3,4
# Social Freedom: 2,5,6,7
indexes_economic_freedom = [0, 3, 4]
indexes_social_freedom = [2, 5, 6, 7]

In [5]:
df_political_ideology[0:1]

Unnamed: 0,statement,label,issue_type,__index_level_0__
0,"Climate change, and the escalating environment...",1,1,465


In [6]:
# Función para asignar puntuación basada en el índice y el label
def asignar_puntuacion(row, indices):
    if row["issue_type"] in indices:
        if row["label"] == 1:
            return 1
        else:
            return -1
    else:
        return 0

In [7]:
# Aplicar la función a cada fila y crear nuevas columnas
df_political_ideology['libertad_economica_score'] = df_political_ideology.apply(lambda row: asignar_puntuacion(row, indexes_economic_freedom), axis=1)
df_political_ideology['libertad_personal_score'] = df_political_ideology.apply(lambda row: asignar_puntuacion(row, indexes_social_freedom), axis=1)


In [8]:
df_political_ideology

Unnamed: 0,statement,label,issue_type,__index_level_0__,libertad_economica_score,libertad_personal_score
0,"Climate change, and the escalating environment...",1,1,465,0,0
1,I believe in the foundational importance of th...,0,2,1191,0,-1
2,I firmly believe that the principle of separat...,1,6,2440,0,1
3,I firmly believe in the separation of church a...,1,6,2406,0,1
4,I firmly believe in the power of free markets ...,0,0,1903,-1,0
...,...,...,...,...,...,...
315,I firmly believe in the importance of creating...,1,2,2689,0,1
316,"In regards to immigration, I believe it's cruc...",0,5,3101,0,-1
317,The issue of energy independence remains cruci...,0,3,1341,-1,0
318,I firmly believe that our nation should uphold...,0,5,3110,0,-1


In [9]:
df_political_ideology.iloc[1, 0]

'I believe in the foundational importance of the nuclear family structure in society; it has historically been the bedrock upon which stable and prosperous communities are built. On the issue of gender, I value the inherent differences between men and women, with each sex bringing unique strengths to the table, which should be recognized and respected, rather than erased or homogenized.'

In [10]:
df_political_ideology_proc = df_political_ideology[['statement', 'libertad_economica_score', 'libertad_personal_score']]


In [14]:
df_political_ideology_proc.to_csv("pre/poli_idio.csv")

In [15]:
from transformers import MarianMTModel, MarianTokenizer


In [16]:
# Cargar el modelo y el tokenizador para inglés a español
model_name = 'Helsinki-NLP/opus-mt-en-es'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)



In [19]:
# Oración de prueba
sentence = "This is a cat."

# Traducir la oración
inputs = tokenizer([sentence], return_tensors="pt", padding=True)
translated = model.generate(**inputs)
translated_sentence = tokenizer.decode(translated[0], skip_special_tokens=True)

In [20]:
translated_sentence

'Esto es un gato.'

In [21]:
def translate_batch(batch):
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

In [25]:
import os
output_dir = 'traduccion_text_1'
os.makedirs(output_dir, exist_ok=True)
# Función para guardar cada batch traducido
def save_translation(batch_number, translated_batch):
    output_file = os.path.join(output_dir, f"translated_batch_{batch_number}.csv")
    df_translated = pd.DataFrame(translated_batch, columns=['translated_statement'])
    df_translated.to_csv(output_file, index=False)
    print(f"Batch {batch_number} guardado en {output_file}")

In [33]:
# Traducir la columna 'statement' por lotes para no sobrecargar la memoria
# Preparar batches
batch_size = 128
batches = [(i//batch_size, df_political_ideology_proc['statement'][i:i+batch_size].tolist()) for i in range(0, len(df_political_ideology_proc), batch_size)]



In [34]:
# Usar concurrent.futures para paralelizar el procesamiento
def process_batch(batch_data):
    batch_number, batch = batch_data
    translated_batch = translate_batch(batch)
    save_translation(batch_number, translated_batch)


In [1]:

for batch_data in batches:
    batch_number, batch = batch_data
    translated_batch = translate_batch(batch)
    save_translation(batch_number, translated_batch)

# Agregar la columna traducida al DataFrame


NameError: name 'batches' is not defined

In [43]:
translated_statements

['El cambio climático, y la creciente degradación ambiental que presenciamos a diario, es un tema urgente que requiere atención inmediata y esfuerzo colectivo.Las fuentes de energía renovables ofrecen una alternativa sostenible y respetuosa con el medio ambiente que puede reducir significativamente nuestra huella de carbono.También es crucial invertir y aplicar políticas que alienten el reciclaje, la conservación y las prácticas sostenibles.',
 'Creo en la importancia fundamental de la estructura familiar nuclear en la sociedad; históricamente ha sido la base sobre la que se han construido comunidades estables y prósperas. En cuanto al género, valoro las diferencias inherentes entre hombres y mujeres, con cada sexo aportando fortalezas únicas a la mesa, que deben ser reconocidas y respetadas, en lugar de ser borradas o homogeneizadas.',
 'Creo firmemente que el principio de separación de la Iglesia y el Estado es una piedra angular de nuestra democracia, garantizando la libertad de los

In [None]:
df['statement_es'] = translated_statements