In [None]:
import os

import numpy as np
import pandas as pd
from fuzzywuzzy import process

# Controlled vocabularies: the only values the cleaned columns are matched against.
resource_class_values = [
    'Collections',
    'Datasets',
    'Imagery',
    'Maps',
    'Web services',
    'Websites',
    'Other',
]
access_rights_values = [
    'Public',
    'Restricted',
]

# Load your CSV file into a pandas DataFrame
# Fill this in with the name of your CSV!!!!
csv_file_path = '20231017_scannedRecords.csv'
data = pd.read_csv(csv_file_path)

# Create a DataFrame to store the cleaning log.
# 'ID' is declared up front (last, which is where pd.concat would otherwise
# append it) so that cleaning_log['ID'] exists even when no cleaning step
# logs anything.
cleaning_log = pd.DataFrame(columns=['ColumnName', 'OriginalValue', 'CleanedValue', 'CleaningAction', 'ID'])

In [None]:
# Clean 'Resource Class'
def clean_resource_class(row):
    """Normalise the pipe-delimited 'Resource Class' value of one record.

    Each non-blank segment is replaced by its closest fuzzy match in
    ``resource_class_values``. A missing value, or a value that contains only
    blank segments, is filled with 'Datasets'. Every change is appended to the
    global ``cleaning_log``.

    Args:
        row: mapping-like record providing 'ID' and 'Resource Class'.

    Returns:
        The cleaned classes joined with '|'.
    """
    global cleaning_log

    raw_value = row['Resource Class']
    if pd.isnull(raw_value):
        all_segments = []
    else:
        # str() keeps a non-string cell from crashing on .split()
        all_segments = str(raw_value).split('|')

    # Blank segments (from 'Maps|', '||' or a whitespace-only cell) must not be
    # fuzzy-matched: an empty query scores 0 against every choice, so the match
    # would just be the first vocabulary entry.
    segments = [segment for segment in all_segments if segment.strip()]

    if not segments:
        new_value = 'Datasets'
        cleaning_log = pd.concat([cleaning_log, pd.DataFrame([{ 'ID': row['ID'], 'ColumnName': 'Resource Class', 'OriginalValue': raw_value, 'CleanedValue': new_value, 'CleaningAction': 'Filled empty value with "Datasets"'}])], ignore_index=True)
        return new_value

    new_resource_classes = []
    for class_value in segments:
        new_value = process.extractOne(class_value.strip(), resource_class_values)[0]
        # Compared against the unstripped segment, so whitespace clean-up is logged too.
        if new_value != class_value:
            cleaning_log = pd.concat([cleaning_log, pd.DataFrame([{ 'ID': row['ID'], 'ColumnName': 'Resource Class', 'OriginalValue': class_value, 'CleanedValue': new_value, 'CleaningAction': 'Fixed spelling'}])], ignore_index=True)
        new_resource_classes.append(new_value)

    cleaned = '|'.join(new_resource_classes)
    if len(segments) != len(all_segments):
        # Record that blank segments were dropped so the change is traceable.
        cleaning_log = pd.concat([cleaning_log, pd.DataFrame([{ 'ID': row['ID'], 'ColumnName': 'Resource Class', 'OriginalValue': raw_value, 'CleanedValue': cleaned, 'CleaningAction': 'Removed empty class value'}])], ignore_index=True)
    return cleaned
# Run the cleaner once per record (row-wise) and overwrite the column with the results.
data['Resource Class'] = data.apply(clean_resource_class, axis='columns')

In [None]:
# Clean 'Date Range'
def clean_date_range(row):
    """Put the years of each 'start-end' range in ascending order.

    The 'Date Range' value may hold several ranges separated by '|'. A segment
    is rewritten only when it is exactly two all-digit years (surrounding
    whitespace ignored) whose start is numerically greater than its end.
    Anything else (single years, free text, already ordered ranges) is left
    untouched, because reordering it would not be meaningful.

    Args:
        row: mapping-like record providing 'ID' and 'Date Range'.

    Returns:
        The original value when nothing changed (including null / ''),
        otherwise the corrected string. One entry per changed record is
        appended to the global ``cleaning_log``.
    """
    global cleaning_log

    original = row['Date Range']
    if pd.isnull(original) or original == '':
        return original  # nothing to clean

    date_ranges = str(original).split('|')
    changed = False
    for i, date_range in enumerate(date_ranges):
        years = [year.strip() for year in date_range.split('-')]
        is_year_pair = len(years) == 2 and years[0].isdecimal() and years[1].isdecimal()
        # Compare as integers: a string comparison would leave '1000-999' unswapped.
        if is_year_pair and int(years[0]) > int(years[1]):
            date_ranges[i] = f'{years[1]}-{years[0]}'
            changed = True

    if not changed:
        return original

    cleaned = '|'.join(date_ranges)
    cleaning_log = pd.concat([cleaning_log, pd.DataFrame([{ 'ID': row['ID'], 'ColumnName': 'Date Range', 'OriginalValue': original, 'CleanedValue': cleaned, 'CleaningAction': 'Corrected date order'}])], ignore_index=True)
    return cleaned
# Run the cleaner once per record (row-wise) and overwrite the column with the results.
data['Date Range'] = data.apply(clean_date_range, axis='columns')

In [None]:
# Clean 'Access Rights'
def clean_access_rights(row):
    """Return the 'Access Rights' value of one record, normalised.

    A null value becomes 'Public'; any other value is replaced by its closest
    fuzzy match in ``access_rights_values``. When the result differs from the
    input, an entry is appended to the global ``cleaning_log``.
    """
    global cleaning_log

    raw_value = row['Access Rights']

    if pd.isnull(raw_value):
        cleaned_value = 'Public'
        action = 'Filled empty value with "Public"'
    else:
        cleaned_value = process.extractOne(str(raw_value), access_rights_values)[0]
        if cleaned_value == raw_value:
            # Already one of the accepted spellings: nothing to log.
            return raw_value
        action = 'Fixed spelling'

    entry = pd.DataFrame([{
        'ID': row['ID'],
        'ColumnName': 'Access Rights',
        'OriginalValue': raw_value,
        'CleanedValue': cleaned_value,
        'CleaningAction': action,
    }])
    cleaning_log = pd.concat([cleaning_log, entry], ignore_index=True)
    return cleaned_value

# Run the cleaner once per record (row-wise) and overwrite the column with the results.
data['Access Rights'] = data.apply(clean_access_rights, axis='columns')

In [None]:
# Clean 'Format' based on 'Download' field
def clean_format(row):
    """Fill a missing 'Format' with 'File' when the record has a 'Download' value.

    Returns the existing 'Format' unchanged in every other case. A fill is
    recorded in the global ``cleaning_log``.
    """
    global cleaning_log

    current_format = row['Format']
    needs_fill = pd.isnull(current_format) and pd.notnull(row['Download'])
    if not needs_fill:
        return current_format

    filled_format = 'File'
    entry = pd.DataFrame([{
        'ID': row['ID'],
        'ColumnName': 'Format',
        'OriginalValue': current_format,
        'CleanedValue': filled_format,
        'CleaningAction': 'Filled missing value with "File"',
    }])
    cleaning_log = pd.concat([cleaning_log, entry], ignore_index=True)
    return filled_format

# Run the cleaner once per record (row-wise) and overwrite the column with the results.
data['Format'] = data.apply(clean_format, axis='columns')

In [None]:
# Keep the bounding box as it is before the steps below overwrite the 'Bounding Box' column.
data['Original Bounding Box'] = data['Bounding Box']
# Function to round decimal places
def round_coordinates(row):
    """Round every number in a record's 'Bounding Box' to two decimal places.

    The value is split on ','; each part is split on whitespace, every token is
    parsed as a float, rounded, and the tokens are re-joined with single
    spaces. A part whose text changes is recorded in the global
    ``cleaning_log``. Missing values are returned as they are.
    """
    global cleaning_log

    bounding_box = row['Bounding Box']
    if pd.isna(bounding_box):
        return bounding_box

    rounded_parts = []
    for part in bounding_box.split(','):
        rounded_part = ' '.join(str(round(float(token), 2)) for token in part.split())
        if rounded_part != part:
            entry = pd.DataFrame([{
                'ID': row['ID'],
                'ColumnName': 'Bounding Box',
                'OriginalValue': part,
                'CleanedValue': rounded_part,
                'CleaningAction': 'Rounded to 2 decimal places',
            }])
            cleaning_log = pd.concat([cleaning_log, entry], ignore_index=True)
        rounded_parts.append(rounded_part)

    return ','.join(rounded_parts)
    
# Run the rounding once per record (row-wise) and overwrite the column with the results.
data['Bounding Box'] = data.apply(round_coordinates, axis='columns')

In [None]:
def clean_bounding_box(row):
    """Validate one record's 'Bounding Box' and make sure it spans an area.

    The value is read as a comma-separated "west,south,east,north" string.

    Args:
        row: mapping-like record providing 'ID' and 'Bounding Box'.

    Returns:
        np.nan when the value is empty, 'nan', or does not have exactly four
        comma-separated parts (nothing is logged for this case).
        '' when a part is not a number, or a longitude lies outside
        [-180, 180] or a latitude outside [-90, 90]; the removal is logged.
        Otherwise the coordinates formatted with two decimals. When west == east
        or south == north (a point or a line), one edge is moved by 0.001 and
        written with three decimals, and the change is logged.
    """
    global cleaning_log

    original_coords = str(row['Bounding Box'])
    coords = original_coords.split(',')

    if original_coords == '' or original_coords == 'nan' or len(coords) != 4:
        return np.nan

    try:
        west, south, east, north = map(float, coords)
    except ValueError:
        # A non-numeric part is treated like any other invalid coordinate
        # instead of aborting the whole run.
        is_valid = False
    else:
        is_valid = all([-180 <= west <= 180, -180 <= east <= 180, -90 <= south <= 90, -90 <= north <= 90])

    if not is_valid:
        cleaning_log = pd.concat([cleaning_log, pd.DataFrame([{'ID':row['ID'], 'ColumnName': 'Bounding Box', 'OriginalValue': original_coords, 'CleanedValue': '', 'CleaningAction': 'Removed incorrect coordinates'}])], ignore_index=True)
        return ''

    nudge = 0.001  # smallest offset that survives the 3-decimal formatting below
    west_modified = south_modified = east_modified = north_modified = False

    if west == east:
        # Normally push the east edge outwards; at the +180 limit that would
        # leave the valid range, so pull the west edge inwards instead.
        if east + nudge <= 180:
            east += nudge
            east_modified = True
        else:
            west -= nudge
            west_modified = True

    if south == north:
        # Same idea for latitude, with +90 as the limit.
        if north + nudge <= 90:
            north += nudge
            north_modified = True
        else:
            south -= nudge
            south_modified = True

    def _format(value, was_modified):
        # Nudged edges need a third decimal, otherwise the offset would round away.
        return f'{value:.3f}' if was_modified else f'{value:.2f}'

    new_coords = ','.join([
        _format(west, west_modified),
        _format(south, south_modified),
        _format(east, east_modified),
        _format(north, north_modified),
    ])

    if west_modified or south_modified or east_modified or north_modified:
        cleaning_log = pd.concat([cleaning_log, pd.DataFrame([{'ID':row['ID'], 'ColumnName': 'Bounding Box', 'OriginalValue': row['Bounding Box'], 'CleanedValue': new_coords, 'CleaningAction': 'Corrected line/point to a box'}])], ignore_index=True)

    return new_coords

# Run the validation once per record (row-wise) and overwrite the column with the results.
data['Bounding Box'] = data.apply(clean_bounding_box, axis='columns')


## After cleaning

In [None]:
# Create a set with all cleaned IDs.
# Fall back to an empty set when the log has no 'ID' column (nothing was
# logged) instead of raising KeyError.
cleaned_ids = set(cleaning_log['ID']) if 'ID' in cleaning_log.columns else set()

# Prefix the file name, not the whole path, so that 'folder/records.csv'
# is written to 'folder/cleaned_records.csv'.
source_dir, source_name = os.path.split(csv_file_path)
cleaned_file_path = os.path.join(source_dir, "cleaned_" + source_name)

# Create a new column 'Cleaned': 'Yes' for records with at least one log entry.
data['Cleaned'] = data['ID'].apply(lambda x: 'Yes' if x in cleaned_ids else 'No')

# Write the cleaned data to a CSV file
data.to_csv(cleaned_file_path, index=False)

# Write the cleaning log next to the notebook (current working directory).
cleaning_log.to_csv("cleaning_log.csv", index=False)
