In [6]:
import numpy as np
import pandas as pd
import os

In [7]:
# Folder the CSV files are read from.
input_directory = 'cure/'

# Folder the processed CSV files are written to.
output_directory = 'finalclean/'

# exist_ok=True makes this cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test. It still
# raises if the path exists but is not a directory.
os.makedirs(output_directory, exist_ok=True)

In [8]:
for filename in os.listdir(input_directory):
    # Only CSV files are processed; every other entry is skipped.
    if not filename.endswith('.csv'):
        continue

    # Load the current file.
    file_path = os.path.join(input_directory, filename)
    data = pd.read_csv(file_path)

    # Order rows by 'smiles' (ascending) and, within each 'smiles', by
    # 'weight' (descending), so the highest-weight row of a group comes first.
    sorted_data = data.sort_values(by=['smiles', 'weight'], ascending=[True, False])

    # Keep only the first row per 'smiles', i.e. the one with the highest weight.
    is_repeat = sorted_data.duplicated(subset='smiles', keep='first')
    best_values_data = sorted_data[~is_repeat]

    # Write the result to the output directory under the same filename.
    output_file_path = os.path.join(output_directory, filename)
    best_values_data.to_csv(output_file_path, index=False)

print('All done :)')

        


All done :)


In [25]:
def _print_csv_row_counts(directory):
    """Print the number of data rows (header excluded) of every CSV in `directory`.

    Rows are counted as physical text lines, so a quoted field that contains
    a line break is counted more than once.
    """
    # sorted() gives the same listing order on every platform.
    for filename in sorted(os.listdir(directory)):
        # Skip non-CSV entries; sub-folders or other files cannot be counted this way.
        if not filename.endswith('.csv'):
            continue
        full_path = os.path.join(directory, filename)  # Create the full path to the file
        with open(full_path, 'r', encoding="latin-1") as fileObj:
            # Stream the file instead of loading every line into memory.
            line_count = sum(1 for _ in fileObj)
        # -1 to exclude the header; clamp so an empty file reports 0 rather than -1.
        print("Rows Counted {} in the csv {}:".format(max(line_count - 1, 0), filename))


# Use the directories configured earlier in the notebook instead of
# machine-specific absolute paths.
# NOTE(review): assumes the notebook runs from the folder that contains
# input_directory and output_directory - confirm.
print('Cured data (before cleaning):')
_print_csv_row_counts(input_directory)

print('Cleaned data (after cleaning):')
_print_csv_row_counts(output_directory)


Cured data (before cleaning):
Rows Counted 9061 in the csv aqsol_cure.csv:
Rows Counted 1354 in the csv aqua_cure.csv:
Rows Counted 28675 in the csv chembl_cure.csv:
Rows Counted 1157 in the csv esol_cure.csv:
Rows Counted 81935 in the csv kinect_cure.csv:
Rows Counted 3766 in the csv ochem_cure.csv:
Rows Counted 2001 in the csv phys_cure.csv:
Cleaned data (after cleaning):
Rows Counted 8701 in the csv aqsol_cure.csv:
Rows Counted 1301 in the csv aqua_cure.csv:
Rows Counted 26377 in the csv chembl_cure.csv:
Rows Counted 1110 in the csv esol_cure.csv:
Rows Counted 81891 in the csv kinect_cure.csv:
Rows Counted 3665 in the csv ochem_cure.csv:
Rows Counted 2001 in the csv phys_cure.csv:


In [30]:
# Join all CSV files found in input_directory into a single CSV file.

# List to hold one DataFrame per input file
df_list = []

# sorted() fixes the concatenation order, so the row order of the combined
# file does not depend on the order in which the OS lists the folder.
for filename in sorted(os.listdir(input_directory)):
    if filename.endswith('.csv'):
        # Read each CSV (first line is the header) and collect it
        full_path = os.path.join(input_directory, filename)
        df = pd.read_csv(full_path, index_col=None, header=0)
        df_list.append(df)

# Fail with a clear message instead of a NameError on combined_csv below.
if not df_list:
    raise FileNotFoundError("No .csv files found in {!r}".format(input_directory))

# Concatenate once, after the loop: concatenating inside the loop re-copies
# all previously loaded rows on every iteration.
combined_csv = pd.concat(df_list, axis=0, ignore_index=True)

# Save the concatenated DataFrame to a new CSV file
combined_csv.to_csv("combined_data.csv", index=False)
# The backslash is escaped because '\c' is an invalid escape sequence; the
# printed text is unchanged.
print('I combined all the \\cure data into one file :))')

I combined all the \cure data into one file :))


In [34]:
# Load the combined CSV file.
filename = 'combined_data.csv'
data = pd.read_csv(filename)

# Order rows by 'smiles' (ascending) and, within each 'smiles', by 'weight'
# (descending), so the highest-weight row of a group comes first.
sort_columns = ['smiles', 'weight']
sorted_data = data.sort_values(by=sort_columns, ascending=[True, False])

# Keep only the first row per 'smiles', i.e. the one with the highest weight.
is_repeat = sorted_data.duplicated(subset='smiles', keep='first')
best_values_data = sorted_data[~is_repeat]

# Write the de-duplicated rows to a new CSV file.
best_values_data.to_csv("combined_cleaned_data.csv", index=False)

print('Finiiiished :)')

Finiiiished :)


In [39]:
# (heading, file) pairs, reported in this order: before and after cleaning.
report_targets = (
    ('Cured data (before cleaning):', 'combined_data.csv'),
    ('Cleaned data (after cleaning):', 'combined_cleaned_data.csv'),
)

row_counts = []
for heading, filename in report_targets:
    print(heading)
    with open(filename, 'r', encoding="latin-1") as fileObj:
        lines = fileObj.readlines()
    # One line is the header, so it is not counted as a data row.
    row_counts.append(len(lines) - 1)
    print("Rows Counted {} in the csv {}:".format(row_counts[-1], filename))

# a = rows before cleaning, b = rows after cleaning.
a, b = row_counts

# Number of rows removed by the cleaning step.
print(a - b)

Cured data (before cleaning):
Rows Counted 127949 in the csv combined_data.csv:
Cleaned data (after cleaning):
Rows Counted 115039 in the csv combined_cleaned_data.csv:
12910
