### Eliminacion Registros Duplicados

In [1]:
import pandas as pd

def process_tsv(input_file, output_file):
    df = pd.read_csv(input_file, sep='\t', header=None, names=['textA', 'textB', 'type', 'score'])
    
    duplicates = df.duplicated()
    num_duplicates = duplicates.sum()
    print(f"Número de registros duplicados: {num_duplicates}")
    
    df_unique = df.drop_duplicates()
    df_unique.to_csv(output_file, sep='\t', index=False, header=False)
    print(f"Registros únicos guardados en '{output_file}'")

In [2]:
input_tsv = 'paraphrases.tsv'
output_tsv = 'paraphrases_no_dup.tsv'
process_tsv(input_tsv, output_tsv)

  df = pd.read_csv(input_file, sep='\t', header=None, names=['textA', 'textB', 'type', 'score'])


Número de registros duplicados: 41532
Registros únicos guardados en 'paraphrases_no_dup.tsv'


In [3]:
input_tsv = 'contradictions.tsv'
output_tsv = 'contradictions_no_dup.tsv'
process_tsv(input_tsv, output_tsv)

  df = pd.read_csv(input_file, sep='\t', header=None, names=['textA', 'textB', 'type', 'score'])


Número de registros duplicados: 84408
Registros únicos guardados en 'contradictions_no_dup.tsv'


### Revision valores nan

In [None]:
import pandas as pd

tsv_iterator = pd.read_csv("paraphrases_no_dup.tsv", sep="\t")
rows_with_nan = tsv_iterator[tsv_iterator.isnull().any(axis=1)]
print("Registros con al menos un valor NaN:")
print(rows_with_nan)


Registros con al menos un valor NaN:
                   Phrase Paraphrase/Contradiction               Type  \
862515      null and void                      NaN  ForwardEntailment   
1679279     null and void                      NaN  ForwardEntailment   
1712410     null and void                      NaN  ForwardEntailment   
2033918  be null and void                      NaN        Equivalence   

            Score  
862515   0.518725  
1679279  0.506134  
1712410  0.505806  
2033918  0.731919  


In [2]:
import pandas as pd

tsv_iterator = pd.read_csv("paraphrases_no_dup.tsv", sep="\t")
tsv_iterator_filled = tsv_iterator.fillna("Nan")
tsv_iterator_filled.to_csv("paraphrases_no_dup.tsv", sep="\t", index=False)
print("DataFrame con valores NaN reemplazados por 'Nan' y archivo actualizado.")


DataFrame con valores NaN reemplazados por 'Nan' y archivo actualizado.


In [7]:
import pandas as pd

tsv_iterator = pd.read_csv("contradictions_no_dup.tsv", sep="\t")
rows_with_nan = tsv_iterator[tsv_iterator.isnull().any(axis=1)]
print("Registros con al menos un valor NaN:")
print(rows_with_nan)


Registros con al menos un valor NaN:
Empty DataFrame
Columns: [Phrase, Paraphrase/Contradiction, Type, Score]
Index: []


In [None]:
import pandas as pd

tsv_iterator = pd.read_csv("contradictions_no_dup.tsv", sep="\t")
tsv_iterator_filled = tsv_iterator.fillna("Nan")
tsv_iterator_filled.to_csv("contradictions_no_dup.tsv", sep="\t", index=False)
print("DataFrame con valores NaN reemplazados por 'Nan' y archivo actualizado.")


### Traduccion del corpus sin repetidos exactos


In [4]:
!pip install google-cloud-translate




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
from google.cloud import translate

def translate_text(text):
    try:
        project_id="tesis-448119"
        client = translate.TranslationServiceClient()
        location = "global"
        parent = f"projects/{project_id}/locations/{location}"

        response = client.translate_text(
            request={
                "parent": parent,
                "contents": text,
                "mime_type": "text/plain",
                "source_language_code": "en-US",
                "target_language_code": "es",
            }
        )
        translated_texts = [translation.translated_text for translation in response.translations]
        return translated_texts
    except  Exception as f:
        print("error: ", {f})


#print(translate_text(['Hello', 'dog', 'bread', 'your', 'bee', 'epson', 'co-chairman', 'google']))
#print(translate_text(['is considered to be', 'be considered to be', ', and therefore we', ', it is necessary to', 'and i am sure that', ', the committee recommended that', 'to develop a', 'telecommunications sector', 'decided to set', ', are responsible', 'legal proceedings .', 'aims and objectives of the', 'aims and objectives of the', 'sufficiently precise', ''nan'', 'country to country', 'therefore concludes that', 'of the commission is', 'and boost the', 'in addition , there was', 'in addition , there is']))

In [7]:
import pandas as pd
from tqdm import tqdm
from typing import List

# Función para traducir en bloques de 50 filas
def translate_tsv_in_batches(tsv_file: str, output_file: str, batch_size: int = 100):
    try:
        # Leer el archivo TSV en modo iterativo
        tsv_iterator = pd.read_csv(tsv_file, sep="\t", chunksize=batch_size)

        # Crear un archivo de salida vacío
        with open(output_file, 'w', encoding='utf-8') as out_file:
            header_written = False

            # Obtener el número total de filas para la barra de progreso
            total_rows = sum(1 for _ in open(tsv_file, 'r')) - 1  # Restar 1 por el encabezado
            total_chunks = total_rows // batch_size + (1 if total_rows % batch_size > 0 else 0)

            # Crear la barra de progreso
            with tqdm(total=total_chunks, desc="Procesando bloques") as pbar:
                # Procesar cada bloque
                for chunk in tsv_iterator:
                    # Verificar si las columnas necesarias existen
                    if not {'Phrase', 'Paraphrase/Contradiction'}.issubset(chunk.columns):
                        raise ValueError("El archivo TSV debe contener las columnas 'Phrase' y 'Paraphrase/Contradiction'")

                    # Obtener textos de 'Phrase' y 'Paraphrase/Contradiction'
                    texts_to_translate = chunk['Phrase'].tolist() + chunk['Paraphrase/Contradiction'].tolist()
                    translated_texts = translate_text(texts_to_translate)

                    if(len(texts_to_translate) != len(translated_texts)):
                        print("¡El tamaño de los arrays no coincide!")
                        break
                    translated_textA = translated_texts[:len(chunk)]
                    translated_textB = translated_texts[len(chunk):]

                    chunk['Phrase'] = translated_textA
                    chunk['Paraphrase/Contradiction'] = translated_textB

                    # Escribir el chunk traducido en el archivo de salida
                    if not header_written:
                        chunk.to_csv(out_file, sep="\t", index=False, mode='w', header=True, lineterminator='\n')
                        header_written = True
                    else:
                        chunk.to_csv(out_file, sep="\t", index=False, mode='a', header=False, lineterminator='\n')

                    # Actualizar la barra de progreso
                    pbar.update(1)
                    
    except Exception as f:
        print("Error:", f)


In [6]:
translate_tsv_in_batches("paraphrases_no_dup.tsv", "paraphrases_translated.tsv")

Procesando bloques: 100%|██████████| 249/249 [02:00<00:00,  2.06it/s]


In [8]:
translate_tsv_in_batches("contradictions_no_dup.tsv", "contradictions_translated.tsv")

Procesando bloques: 100%|██████████| 463/463 [03:52<00:00,  1.99it/s]


### Eliminación de registros duplicados con distintos scores

In [22]:
import pandas as pd

def find_similar_records(input_file):
    df = pd.read_csv(input_file, sep='\t', header=None, names=['textA', 'textB', 'type', 'score'])
    
    grouped = df.groupby(['textA', 'textB', 'type'])['score'].nunique()
    conflicting_groups = grouped[grouped > 1]
    
    similar_records = df[df.set_index(['textA', 'textB', 'type']).index.isin(conflicting_groups.index)]
    print(len(similar_records))
    print(similar_records.head(5))

In [23]:
input_tsv = 'paraphrases_no_dup.tsv'
find_similar_records(input_tsv)

  df = pd.read_csv(input_file, sep='\t', header=None, names=['textA', 'textB', 'type', 'score'])


326748
            textA       textB         type               score
2  not-for-profit  non-profit  Equivalence  0.7606575999999999
4         muslims      muslim  Equivalence           0.6259608
6       modelling    modeling  Equivalence            0.683473
7        modeling   modelling  Equivalence  0.6833613999999999
8          uganda     ugandan  Equivalence            0.600723


In [24]:
input_tsv = 'contradictions_no_dup.tsv'
find_similar_records(input_tsv)

  df = pd.read_csv(input_file, sep='\t', header=None, names=['textA', 'textB', 'type', 'score'])


4712125
           textA        textB           type               score
1        hi-tech     low-tech  contradiction           0.3013428
3  decentralised  centralized  contradiction  0.3109483999999999
4  decentralised   centralize  contradiction  0.3109483999999999
5  decentralised  concentrate  contradiction  0.3109483999999999
6        handled   handleless  contradiction  0.3120763999999999


In [26]:
import pandas as pd

def merge_similar_records(input_file, output_file):
    df = pd.read_csv(input_file, sep='\t', header=None, names=['textA', 'textB', 'type', 'score'])
    
    df['score'] = pd.to_numeric(df['score'], errors='coerce')
    merged = df.groupby(['textA', 'textB', 'type'], as_index=False).agg({'score': 'mean'})
    
    merged.to_csv(output_file, sep='\t', index=False, header=False)
    
    print(f"Archivo actualizado guardado en: {output_file}")
    print(f"Total de registros en el archivo actualizado: {len(merged)}")

In [14]:
input_tsv = 'paraphrases.tsv'
output_tsv = 'paraphrases_clean.tsv'
merge_similar_records(input_tsv, output_tsv)

  df = pd.read_csv(input_file, sep='\t', header=None, names=['textA', 'textB', 'type', 'score'])


Archivo actualizado guardado en: paraphrases_no_dup.tsv
Total de registros en el archivo actualizado: 3310055


In [15]:
input_tsv = 'contradictions.tsv'
output_tsv = 'contradictions_clean.tsv'
merge_similar_records(input_tsv, output_tsv)

  df = pd.read_csv(input_file, sep='\t', header=None, names=['textA', 'textB', 'type', 'score'])


Archivo actualizado guardado en: contradictions_clean.tsv
Total de registros en el archivo actualizado: 837520
