In [81]:
import os
import csv
import re

def replace_sep(match, separator):
    """Flatten one quoted span into a separator-free token.

    Args:
        match: regex match object; its full matched text (group 0) is used.
        separator: substring that is replaced by ``'_'`` inside the match.

    Returns:
        The matched text with ``separator`` turned into ``'_'`` and then every
        double quote, single quote and space character removed.
    """
    cleaned = match.group(0).replace(separator, '_')
    # Removal happens after the separator substitution, in this fixed order.
    for unwanted in ('"', "'", ' '):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

def modify_line(line, separator):
    """Rewrite every quoted span of ``line`` through ``replace_sep``.

    A quoted span is one or more characters enclosed in a pair of double
    quotes or a pair of single quotes. Text outside such spans is returned
    unchanged.

    Args:
        line: the text to process.
        separator: passed through to ``replace_sep`` for each span.

    Returns:
        ``line`` with each quoted span replaced by its cleaned form.
    """
    quoted_span = re.compile(r'"[^"]+"|\'[^\']+\'')

    def _clean(found):
        return replace_sep(found, separator)

    return quoted_span.sub(_clean, line)

def clean_it(input_directory, output_directory, separator):
    """Write a cleaned copy of every ``.csv`` file found in ``input_directory``.

    Each non-empty line is passed through ``modify_line`` so that occurrences
    of ``separator`` inside quoted spans no longer split a field. Files are
    written under the same name into ``output_directory``, which is created
    when missing. Nothing happens when ``input_directory`` is not a directory.

    Lines are handled as raw text instead of being parsed by ``csv.reader``:
    the reader's default ``,`` delimiter split each line, and keeping only the
    first parsed field dropped everything after the first comma (and all but
    the first column when ``separator`` is ``','``).

    Args:
        input_directory: directory scanned (non-recursively) for ``.csv`` files.
        output_directory: directory receiving the cleaned files.
        separator: field separator used by the files being cleaned.
    """
    if not os.path.isdir(input_directory):
        return

    print(f"Cleaning INPUT: {input_directory}")
    print(f"Cleaning OUTPUT: {output_directory}")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(input_directory):
        if not filename.endswith(".csv"):
            continue

        input_filename = os.path.join(input_directory, filename)
        output_filename = os.path.join(output_directory, filename)

        # Input is read with universal newlines so every line ends in '\n';
        # output uses newline='' so the terminator written below is kept as is.
        with open(input_filename, 'r', encoding='utf-8') as infile, \
             open(output_filename, 'w', newline='', encoding='utf-8') as outfile:
            for raw_line in infile:
                line = raw_line.rstrip('\n')
                if not line:
                    # Blank lines are skipped; an empty file yields an empty output.
                    continue
                # '\r\n' matches the terminator csv.writer emitted previously.
                outfile.write(modify_line(line, separator) + '\r\n')

def change_sep(input_directory, old_sep, new_sep, encoding='utf-8'):
    """Replace ``old_sep`` with ``new_sep`` in every ``.csv`` file, in place.

    Every occurrence in the file text is replaced, including occurrences
    inside quoted values. Nothing happens when ``input_directory`` is not a
    directory. Files that do not contain ``old_sep`` are not rewritten.

    Args:
        input_directory: directory scanned (non-recursively) for ``.csv`` files.
        old_sep: substring to replace; must not be empty.
        new_sep: replacement substring.
        encoding: text encoding used for both reading and writing. Defaults
            to UTF-8, the encoding ``clean_it`` writes.

    Raises:
        ValueError: if ``old_sep`` is an empty string, because
            ``str.replace('', x)`` would insert ``x`` between all characters.
    """
    if old_sep == '':
        raise ValueError("old_sep must not be empty")
    if not os.path.isdir(input_directory):
        return

    for filename in os.listdir(input_directory):
        if not filename.endswith(".csv"):
            continue

        path = os.path.join(input_directory, filename)

        # newline='' on both sides keeps the file's line terminators untouched.
        with open(path, 'r', newline='', encoding=encoding) as read_file:
            data = read_file.read()

        updated = data.replace(old_sep, new_sep)
        if updated == data:
            continue

        # The read handle is closed before the same path is reopened for writing.
        with open(path, 'w', newline='', encoding=encoding) as write_file:
            write_file.write(updated)
                        
def rename_it(input_directory):
    """Strip the trailing ``.problematic`` extension from files in a directory.

    Every entry of ``input_directory`` whose name ends with ``.problematic``
    is renamed to the same name without that final extension (for example
    ``table.csv.problematic`` becomes ``table.csv``). Nothing happens when
    ``input_directory`` is not a directory.

    Args:
        input_directory: directory whose entries are inspected (non-recursively).
    """
    if not os.path.isdir(input_directory):
        return

    print(f"Renaming INPUT: {input_directory}")

    for filename in os.listdir(input_directory):
        if filename.endswith(".problematic"):
            # os.path.join keeps the paths valid on every platform; a literal
            # backslash is only a path separator on Windows.
            old = os.path.join(input_directory, filename)
            new = os.path.join(input_directory, os.path.splitext(filename)[0])
            os.rename(old, new)

In [45]:
# Sanity check: a comma-separated row whose single-quoted value itself contains commas.
col_name = "LGG,TCGA-LGG,TCGA-DU-8164,Male,51 years 108 days,'Oligoden,droglioma, NOS',white,MUTATED"
# modify_line is the helper defined in this notebook; the previously used name
# modify_column_name is not defined anywhere here and fails on a fresh kernel.
print(modify_line(col_name, ','))

LGG,TCGA-LGG,TCGA-DU-8164,Male,51 years 108 days,Oligoden_droglioma_NOS,white,MUTATED


In [84]:
# Swap ',' and ';' in every cleaned CSV, using '|||' as a temporary placeholder
# so the two separators never collide mid-swap.
# NOTE: the files are modified in place, so running this cell a second time
# swaps the separators back. Any '|||' already present in the data would be
# turned into ';' by the last step.
input_dir = os.path.join('data', 'ics_uci', 'all', 'cleaned', 'dataset')
change_sep(input_dir, ',', '|||')
change_sep(input_dir, ';', ',')
change_sep(input_dir, '|||', ';')

In [4]:
# Strip the '.problematic' suffix from files in each sub-directory of the dataset root.
#input_dir = os.path.join('data', 'romulo', 'data-lakes', 'clustering')
input_dir = os.path.join('data', 'domain_net', 'table_union_search')
if os.path.isdir(input_dir):
    for _dir in os.listdir(input_dir):
        # os.path.join instead of a literal backslash keeps this runnable off Windows.
        in_dir = os.path.join(input_dir, _dir)
        rename_it(in_dir)

Renaming INPUT: data\domain_net\table_union_search\csvfiles
Renaming INPUT: data\domain_net\table_union_search\csvfiles_no_homographs


In [73]:
# Clean every sub-directory of the dataset root into a parallel tree under output_dir.
# Other dataset roots this cell has been pointed at:
#input_dir = os.path.join('data', 'romulo', 'data-lakes', 'clustering')
#output_dir = os.path.join('data', 'romulo', 'data-lakes', 'cleaned')

#input_dir = os.path.join('data', 'domain_net', 'table_union_search')
#output_dir = os.path.join('data', 'domain_net', 'table_union_search', 'cleaned')

input_dir = os.path.join('data', 'ics_uci', 'all')
output_dir = os.path.join('data', 'ics_uci', 'all', 'cleaned')
separator = ';'

if os.path.isdir(input_dir):
    for _dir in os.listdir(input_dir):
        in_dir = os.path.join(input_dir, _dir)
        # output_dir lives inside input_dir, so after the first run it appears
        # in this listing; skip it rather than build a nested cleaned/cleaned.
        if os.path.abspath(in_dir) == os.path.abspath(output_dir):
            continue
        out_dir = os.path.join(output_dir, _dir)
        clean_it(in_dir, out_dir, separator)

Cleaning INPUT: data\ics_uci\all\dataset
Cleaning OUTPUT: data\ics_uci\all\cleaned\dataset


In [49]:
import re

def replace_semicolons(match, separator):
    """Return the matched text with ``separator`` replaced and quoting removed.

    Args:
        match: regex match object; its full matched text (group 0) is used.
        separator: substring replaced by ``'_'`` (any string, not only ``';'``).

    Returns:
        The matched text after substituting ``separator`` with ``'_'`` and then
        deleting all double quotes, single quotes and spaces.
    """
    # Mapping a code point to None makes str.translate delete that character.
    strip_table = {ord('"'): None, ord("'"): None, ord(' '): None}
    with_underscores = match.group(0).replace(separator, '_')
    return with_underscores.translate(strip_table)

# Demo: a ';'-separated row in which one double-quoted value contains a ';'.
separator = ';'
input_str = 'LGG;TCGA-LGG;TCGA-DU-8164;Male;51 years 108 days;"Oligodendroglioma; NOS";white;MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;MUTATED;NOT_MUTATED;NOT_MUTATED;MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED'

# Each double- or single-quoted span is rewritten by replace_semicolons;
# everything outside the quotes is left as it is.
output_str = re.compile(r'"[^"]+"|\'[^\']+\'').sub(
    lambda quoted: replace_semicolons(quoted, separator),
    input_str,
)
print(output_str)

LGG;TCGA-LGG;TCGA-DU-8164;Male;51 years 108 days;Oligodendroglioma_NOS;white;MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;MUTATED;NOT_MUTATED;NOT_MUTATED;MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED;NOT_MUTATED
