In [None]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process


# Load your CSV file into a pandas DataFrame
# The path is relative, so 'arc.csv' is looked up in the current working
# directory. Every check below reads columns from (and writes flag columns
# back to) this `data` frame.
csv_file_path = 'arc.csv'
data = pd.read_csv(csv_file_path)

# Controlled vocabularies used by the validation rules.
# Every '|'-separated entry of a 'Resource Class' cell must be one of these.
resource_class_values = [
    'Collections',
    'Datasets',
    'Imagery',
    'Maps',
    'Web services',
    'Websites',
    'Other',
]
# An 'Access Rights' cell must match one of these exactly.
access_rights_values = [
    'Public',
    'Restricted',
]
# Each check function below returns True when the value is INVALID (an error was found).
# Rule 1: 'Resource Class'
def check_resource_class_values(x):
    """Report whether a 'Resource Class' cell contains an unaccepted value.

    The cell is converted to a string and split on '|'; each resulting
    piece must appear in ``resource_class_values`` exactly as written.

    Returns:
        True (after printing an error message) as soon as one piece is not
        an accepted value, otherwise False.
    """
    entries = str(x).split('|')
    for entry in entries:
        if entry not in resource_class_values:
            print(f"'Resource Class' Error: {x} is invalid!")
            return True
    return False

# Flag every row whose 'Resource Class' cell fails Rule 1 (True = invalid).
data['resource_class_errors'] = data['Resource Class'].apply(check_resource_class_values)

# Rule 2: 'Date Range'
def check_date_range(dates):
    """Validate a 'Date Range' cell made of '|'-separated 'start-end' ranges.

    The cell is converted to a string and split on '|'. Each range must
    consist of exactly two parts separated by '-', both parts must be made
    of decimal digits only, and the start must not be greater than the end.

    Args:
        dates: The raw cell value; anything accepted by ``str()``.

    Returns:
        True (after printing an error message for the first offending
        range) when any range is malformed, otherwise False.
    """
    for date in str(dates).split('|'):
        years = date.split('-')
        if len(years) != 2:
            malformed = True
        else:
            start, end = years
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscript digits that int() cannot parse,
            # which would raise ValueError instead of reporting the error.
            if not (start.isdecimal() and end.isdecimal()):
                malformed = True
            else:
                malformed = int(start) > int(end)
        if malformed:
            print(f"'Date Range' Error: {date} is invalid!")
            return True
    return False

# Flag every row whose 'Date Range' cell fails Rule 2 (True = invalid).
data['date_range_errors'] = data['Date Range'].apply(check_date_range)

# Rule 3: 'Access Rights'
def check_access_rights(x):
    """Report whether an 'Access Rights' cell is outside the accepted values.

    The cell is converted to a string and must match an entry of
    ``access_rights_values`` exactly.

    Returns:
        True (after printing an error message) when the value is not
        accepted, otherwise False.
    """
    is_accepted = str(x) in access_rights_values
    if is_accepted:
        return False
    print(f"'Access Rights' Error: {x} is invalid!")
    return True

# Flag every row whose 'Access Rights' cell fails Rule 3 (True = invalid).
data['access_rights_errors'] = data['Access Rights'].apply(check_access_rights)

# Rule 4: 'Bounding Box'
def check_bounding_box(coords):
    """Validate a 'Bounding Box' cell of four comma-separated numbers.

    The cell is converted to a string, split on ',' and each part parsed
    with ``float()``. The box is rejected when:

    * any part cannot be parsed as a number, or parses to NaN;
    * there are not exactly four values;
    * values 0 and 2, or values 1 and 3, differ by less than 0.0001
      (the box collapses to a point or a line);
    * any value has an absolute value above 180.

    NOTE(review): the pairing of indices 0/2 and 1/3 suggests the order is
    x, y, x, y (e.g. west, south, east, north) -- confirm against the data.

    Args:
        coords: The raw cell value; anything accepted by ``str()``.

    Returns:
        True (after printing an error message) when the box is invalid,
        otherwise False.
    """
    import math  # local import keeps this function self-contained

    try:
        coords = [float(part) for part in str(coords).split(',')]
    except ValueError:
        # The comprehension failed before rebinding, so `coords` is still
        # the original cell value here.
        print(f"'Bounding Box' Error: {coords} contains non-numeric values!")
        return True

    if len(coords) != 4:
        print(f"'Bounding Box' Error: {coords} is not formed by exactly four coordinates!")
        return True
    # float() accepts the text 'nan', and every comparison against NaN is
    # False, so without this check a box of NaNs would pass all tests below.
    if any(math.isnan(coord) for coord in coords):
        print(f"'Bounding Box' Error: {coords} contains non-numeric values!")
        return True
    if abs(coords[0] - coords[2]) < 0.0001 or abs(coords[1] - coords[3]) < 0.0001:
        print(f"'Bounding Box' Error: {coords} forms a point or a line!")
        return True
    if any(abs(coord) > 180 for coord in coords):
        print(f"'Bounding Box' Error: {coords} --- one or more coordinate exceeds 180. It may be in UTM instead of degrees!")
        return True
    return False

# Flag every row whose 'Bounding Box' cell fails the bounding-box check (True = invalid).
data['bounding_box_errors'] = data['Bounding Box'].apply(check_bounding_box)

# Rule 5: 'Format' and 'Download' (a 'Download' value requires a 'Format' value)
def check_format_download(format_value, download_value):
    """Report a row that has a 'Download' value but no 'Format' value.

    Args:
        format_value: The row's 'Format' cell.
        download_value: The row's 'Download' cell.

    Returns:
        True (after printing an error message) when ``download_value`` is
        non-null while ``format_value`` is null, as judged by pandas;
        otherwise False.
    """
    # Nothing to check when there is no download at all.
    if not pd.notnull(download_value):
        return False
    # A download with a format present is fine.
    if not pd.isnull(format_value):
        return False
    print(f"'Format' Error: 'Download' value exists ({download_value}) but 'Format' value is missing.")
    return True

# Applied row-wise (axis=1) because this rule needs both the 'Format' and
# 'Download' cells of the same row (True = invalid).
data['format_download_errors'] = data.apply(lambda row: check_format_download(row['Format'], row['Download']), axis=1)

# Reporting
# Flag columns produced by the rule checks above; at this point each one
# holds booleans where True means the rule failed for that row.
error_columns = [
    'resource_class_errors',
    'date_range_errors',
    'access_rights_errors',
    'bounding_box_errors',
    'format_download_errors',
]
errors = data[error_columns]

# Work out which rows have at least one failure while the flags are still
# booleans. (Comparing against 'Failed' before the relabelling below would
# never match, leaving the errors file empty.)
rows_with_errors = errors.any(axis=1)

# Relabel the flags for the CSV output: 'Failed' where a rule failed, blank
# otherwise. The already-computed flags are reused instead of re-running the
# checks, which would print every error message a second time.
for column in error_columns:
    data[column] = errors[column].map({True: 'Failed', False: np.nan})

# DataFrame with rows containing errors
errors_data = data[rows_with_errors]

# Write the DataFrame with rows containing errors to new CSV
errors_file_path = "metadata_errors_2.csv"
errors_data.to_csv(errors_file_path, index=False)

# Write DataFrame to new CSV
output_file_path = "validated_metadata_2.csv"
data.to_csv(output_file_path, index=False)

print(f"Data containing errors written to {errors_file_path}")

print(f"Data written to {output_file_path}")
