In [84]:
import pandas as pd
import csv

def detect_delimiter(file_path):
    delimiters = [',', ';', '\t', '|']
    
    with open(file_path, 'r') as file:
        header = next(file)
        
    for delimiter in delimiters:
        if header.count(delimiter) > 0:
            return delimiter
    
    raise ValueError("No se pudo determinar el delimitador o el archivo tiene un formato no soportado.")

def separate_valid_invalid_rows(file_path):
    delimiter = detect_delimiter(file_path)
    valid_rows = []
    invalid_rows = []
    
    with open(file_path, 'r') as file:
        header = next(file)  # Leer la cabecera
        valid_rows.append(header)  # La cabecera siempre es válida
        expected_columns = len(header.split(delimiter))  # Determinar el número de columnas esperadas
        
        for line in file:
            if len(line.split(delimiter)) == expected_columns:
                valid_rows.append(line)
            else:
                invalid_rows.append(line)
    
    valid_file_path = file_path.replace('.txt', '_valid.txt')
    invalid_file_path = file_path.replace('.txt', '_invalid.txt')
    
    with open(valid_file_path, 'w') as valid_file:
        valid_file.writelines(valid_rows)
    
    with open(invalid_file_path, 'w') as invalid_file:
        invalid_file.write(header)  # Escribir la cabecera en el archivo inválido
        invalid_file.writelines(invalid_rows)
    
    return valid_file_path, invalid_file_path

def combine_columns(line, expected_columns, delimiter=','):
    new_line = []
    temp = ''
    in_quotes = False
    quote_char = ''
    
    for element in line:
        if not in_quotes:
            if element.startswith('"') and element.endswith('"'):
                new_line.append(element)
            elif element.startswith('"') or element.startswith("'"):
                temp += element
                in_quotes = True
                quote_char = element[0]
            else:
                new_line.append(element)
        else:
            temp += delimiter + element
            if element.endswith(quote_char):
                new_line.append(temp)
                temp = ''
                in_quotes = False
    
    if temp:  # Si aún queda algo en temp, agregarlo como una columna
        new_line.append(temp)
    
    # Asegurarse de que la cantidad de columnas sea la esperada
    if len(new_line) < expected_columns:
        new_line += [''] * (expected_columns - len(new_line))
    elif len(new_line) > expected_columns:
        new_line = new_line[:expected_columns-1] + [delimiter.join(new_line[expected_columns-1:])]
    
    return new_line

def correct_invalid_rows(file_path):
    invalid_file_path = file_path.replace('.txt', '_invalid.txt')
    valid_file_path = file_path.replace('.txt', '_valid.txt')
    corrected_rows = []

    # Leer las filas inválidas
    with open(invalid_file_path, 'r') as invalid_file:
        reader = csv.reader(invalid_file)
        header = next(reader)  # Leer la cabecera
        corrected_rows.append(header)
        expected_columns = len(header)  # Número de columnas esperadas
        
        for line in reader:
            if len(line) < expected_columns:
                # Si faltan columnas, intentar combinar elementos usando comillas dobles o simples
                line = combine_columns(line, expected_columns, delimiter=',')
            corrected_rows.append(line)

    # Leer las filas válidas
    with open(valid_file_path, 'r') as valid_file:
        valid_rows = valid_file.readlines()

    # Asegurarse de que todas las filas tengan el número correcto de columnas
    combined_rows = valid_rows + [','.join(row) + '\n' for row in corrected_rows[1:]]
    combined_rows = [row.strip().split(',') for row in combined_rows]

    # Asegurarse de que todas las filas tengan el número correcto de columnas
    for i, row in enumerate(combined_rows):
        if len(row) < expected_columns:
            combined_rows[i] += [''] * (expected_columns - len(row))
        elif len(row) > expected_columns:
            combined_rows[i] = row[:expected_columns-1] + [','.join(row[expected_columns-1:])]

    # Crear un DataFrame a partir de las filas combinadas
    combined_data = pd.DataFrame(combined_rows[1:], columns=combined_rows[0])

    return combined_data

In [85]:
file_path = 'input/data.txt'
valid_file_path, invalid_file_path = separate_valid_invalid_rows(file_path)
df_data = correct_invalid_rows(file_path)

file_path = 'input/data1.txt'
valid_file_path, invalid_file_path = separate_valid_invalid_rows(file_path)
df_data1 = correct_invalid_rows(file_path)

file_path = 'input/data3.txt'
valid_file_path, invalid_file_path = separate_valid_invalid_rows(file_path)
df_data3 = correct_invalid_rows(file_path)

In [86]:
import re
# Función para estandarizar nombres de columnas y quitar caracteres especiales
def estandarizar_nombres_columnas(columnas):
    columnas = columnas.str.strip()                     # Quitar espacios en blanco al inicio y al final
    columnas = columnas.str.lower()                     # Convertir a minúsculas
    columnas = columnas.str.replace(' ', '_')           # Reemplazar espacios por guiones bajos
    columnas = columnas.to_series().apply(lambda x: re.sub(r'\W+', '', x))  # Quitar caracteres especiales
    return columnas

In [87]:
df_data.columns = estandarizar_nombres_columnas(df_data.columns)
df_data1.columns = estandarizar_nombres_columnas(df_data1.columns)
df_data3.columns = estandarizar_nombres_columnas(df_data3.columns)

In [88]:
df_data

Unnamed: 0,name,age,height_cm,weight_kg,salary_,city
0,"""Alice""",30,165.5,60.2,50000.0,"""New York"""
1,"""Bob""",25,175.0,75.0,60000.0,"""Los Angeles"""
2,"""Charlie""",32,170.5,,55000.0,"""Chicago"""
3,"""David""",26,180.0,85.5,59000.0,"""San Francisco"""
4,"""Eve""",Twenty-Five,160,58.0,52000.0,""" Bristol"""
5,"""Grace""",29,,65.0,51000.0,"""Huston"""
6,"""Heidi""",28,168.0,0.0,55000.0,"""Houston"""
7,"""Ivan""",34,185.0,95.0,68000.0,"""Miani"""
8,"""Jack""",27,172.5,70.5,54000.0,"""Boston"""
9,"""Katie""",31,160.0,55.5,52000.0,"""Seattle"""


In [89]:
df_data3

Unnamed: 0,name,sex,marital_status,career
0,"""Alice""",Female,Single,Data Scientist
1,"""Bob""",Male,Single,Software Engineer
2,"""Charlie""",Male,Married,Doctor
3,"""David""",Male,Single,Lawyer
4,"""Eve""",Female,Divorced,Artist
5,"""Frank""",Male,Married,Engineer
6,"""Grace""",Female,Single,Nurse
7,"""Heidi""",Female,Married,Teacher
8,"""Ivan""",Male,Married,Accountant
9,"""Jack""",Male,Single,IT Specialist


In [90]:
df_data1

Unnamed: 0,name,age,height_inches,weight_pounds,salary_,city
0,"""Amy""",30,65.5,132.3,50000,"""New York"""
1,"""Ben""",25,68.0,165.3,60000,"""Los Angeles"""
2,"""Charlie""",32,67.0,180.5,55000,"""Chicago"""
3,"""David""",26,70.0,188.7,59000,"""San Francisco"""
4,"""Ella""",28,61.0,126.0,52000,"""Bristol"""
5,"""Grace""",29,64.0,143.5,51000,"""Houston"""
6,"""Henry""",28,66.1,110.2,55000,"""Houston"""
7,"""Ivy""",34,72.0,209.0,68000,"""Miami"""
8,"""Jack""",27,68.0,155.5,54000,"""Boston"""
9,Frank,35,69.0,154.3,58000,"""Washington, D.C."""


In [91]:
df_data['name'] = df_data['name'].str.replace('"','')
df_data1['name'] = df_data1['name'].str.replace('"','')
df_data3['name'] = df_data3['name'].str.replace('"','')

In [92]:
df_data3

Unnamed: 0,name,sex,marital_status,career
0,Alice,Female,Single,Data Scientist
1,Bob,Male,Single,Software Engineer
2,Charlie,Male,Married,Doctor
3,David,Male,Single,Lawyer
4,Eve,Female,Divorced,Artist
5,Frank,Male,Married,Engineer
6,Grace,Female,Single,Nurse
7,Heidi,Female,Married,Teacher
8,Ivan,Male,Married,Accountant
9,Jack,Male,Single,IT Specialist


In [93]:
def fusionar_dataframe (df1, df2, columna):
    # Fusionar los DataFrames utilizando la columna 'name' en común
    df_merged = pd.merge(df1, df2, on=columna, how='outer', suffixes=('_df1', '_df2'))

    # Eliminar las columnas duplicadas, manteniendo la primera aparición
    # Mantener solo la primera aparición de las columnas con sufijo '_df1'
    df_merged = df_merged.loc[:, ~df_merged.columns.str.endswith('_df2')]

    # Renombrar las columnas para quitar los sufijos
    df_merged.columns = df_merged.columns.str.replace('_df1', '')

    return df_merged    

In [94]:
import numpy as np

df_merged = fusionar_dataframe(df_data,df_data1,'name')

df_merged = fusionar_dataframe(df_merged,df_data3,'name')

df_merged = df_merged.drop_duplicates()

df_merged.replace('',np.nan, inplace=True)

df_merged = df_merged.dropna(how='all')

df_merged 

Unnamed: 0,name,age,height_cm,weight_kg,salary_,city,height_inches,weight_pounds,sex,marital_status,career
1,Alice,30,165.5,60.2,50000.0,"""New York""",,,Female,Single,Data Scientist
2,Amy,,,,,,65.5,132.3,Female,Married,Sales Manager
3,Ben,,,,,,68.0,165.3,Male,Single,Marketing Manager
4,Bob,25,175.0,75.0,60000.0,"""Los Angeles""",,,Male,Single,Software Engineer
5,Charlie,32,170.5,,55000.0,"""Chicago""",67.0,180.5,Male,Married,Doctor
7,David,26,180.0,85.5,59000.0,"""San Francisco""",70.0,188.7,Male,Single,Lawyer
9,Ella,,,,,,61.0,126.0,Female,Divorced,Artist
10,Eve,Twenty-Five,160,58.0,52000.0,""" Bristol""",,,Female,Divorced,Artist
11,Frank,35,175.5,70.0,,"""Washington, D.C.""",69.0,154.3,Male,Married,Engineer
13,Grace,29,,65.0,51000.0,"""Huston""",64.0,143.5,Female,Single,Nurse


In [97]:
# Reemplazar las comillas de cada valor en todas las columnas que son del tipo str
df_merged = df_merged.map(lambda x: x.replace('"','') if isinstance(x, str) else x)
# Reemplazar los espacios al inicio y al fin de cada valor en todas las columnas
df_merged = df_merged.map(lambda x: x.strip() if isinstance(x, str) else x)

df_merged

Unnamed: 0,name,age,height_cm,weight_kg,salary_,city,height_inches,weight_pounds,sex,marital_status,career
1,Alice,30,165.5,60.2,50000.0,New York,,,Female,Single,Data Scientist
2,Amy,,,,,,65.5,132.3,Female,Married,Sales Manager
3,Ben,,,,,,68.0,165.3,Male,Single,Marketing Manager
4,Bob,25,175.0,75.0,60000.0,Los Angeles,,,Male,Single,Software Engineer
5,Charlie,32,170.5,,55000.0,Chicago,67.0,180.5,Male,Married,Doctor
7,David,26,180.0,85.5,59000.0,San Francisco,70.0,188.7,Male,Single,Lawyer
9,Ella,,,,,,61.0,126.0,Female,Divorced,Artist
10,Eve,Twenty-Five,160.0,58.0,52000.0,Bristol,,,Female,Divorced,Artist
11,Frank,35,175.5,70.0,,"Washington, D.C.",69.0,154.3,Male,Married,Engineer
13,Grace,29,,65.0,51000.0,Huston,64.0,143.5,Female,Single,Nurse


In [83]:
# Guardar el dataframe fusionado en un archivo Excel
df_merged.to_excel('output/data_merge.xlsx', index=False)