# JRC inspection sample selection script

A solution that selects inspection samples for the quality assessments (QA) of the Area Monitoring System (AMS) and the GeoSpatial Application (GSA).

Author: Mateusz Dobrychlop
Created on: April 4th 2024

Proof-of-concept version. For now, it does not take the 3% constraint into account.

## Imports

In [22]:
import datetime
import os

import pandas as pd

import cosmetics_tools as cosmetics_tools

## Data extraction and output writing functions

This cell processes two key input files:
 - the ranked "interventions" CSV file, which contains one row per intervention type associated with a parcel; each parcel is in turn associated with a holding
 - the "targets" CSV file, which lists the target parcel count for each bucket, where each bucket corresponds to a single intervention type

 The intervention file is transformed into a DataFrame (filtering out some of its columns) and then used as the main data structure that is iterated over.
 The intervention DataFrame is sorted by the "ranking" column.

 The targets file is used to create a dictionary of buckets, which is gradually populated with information from the intervention DataFrame.

 This cell also defines a simple function that saves the output as an Excel file.

In [40]:
def extract_interventions(path):
    """
    Extracts the interventions from the csv file and returns a dataframe sorted by "ranking".

    Parameters:
    - path: location of the interventions csv file (anything accepted by pandas.read_csv).
      The file must contain the columns gsa_par_id, gsa_hol_id, ua_grp_id and ranking;
      all other columns are dropped.

    Returns a dataframe with the columns:
    - parcel_id (renamed from gsa_par_id)
    - holding_id (renamed from gsa_hol_id)
    - intervention_type_id (renamed from ua_grp_id)
    - ranking
    - row_id: string of the form "<parcel_id>|<holding_id>|<intervention_type_id>"
    """
    interventions_full_df = pd.read_csv(path)
    interventions_df = interventions_full_df[["gsa_par_id", "gsa_hol_id", "ua_grp_id", "ranking"]]
    interventions_df = interventions_df.sort_values(by="ranking")
    interventions_df = interventions_df.rename(columns={"ua_grp_id": "intervention_type_id",
                                                        "gsa_hol_id": "holding_id",
                                                        "gsa_par_id": "parcel_id"})

    # add a unique row id column that is combination of parcel_id, holding_id and intervention_type_id
    # this will be used to identify which rows were already added to buckets
    # The separator keeps different id combinations apart: without it, ids (1, 23, 4) and
    # (12, 3, 4) would both produce "1234" and one of the rows would be treated as already added.
    # NOTE(review): assumes the ids themselves never contain "|" - confirm against the input data.
    separator = "|"
    interventions_df['row_id'] = (
        interventions_df['parcel_id'].astype(str)
        + separator
        + interventions_df['holding_id'].astype(str)
        + separator
        + interventions_df['intervention_type_id'].astype(str)
    )

    return interventions_df

def extract_buckets(path, max_target=300):
    """
    Extracts the targets from the csv file and returns the (still empty) buckets.

    Parameters:
    - path: location of the targets csv file (anything accepted by pandas.read_csv).
      The file must contain the columns ua_grp_id and target1.
    - max_target: upper limit applied to every target; larger targets are reduced to it.

    Returns a dictionary with the keys being the intervention_type_id (ua_grp_id)
    and the values being a dictionary with the keys:
    - target: the target number of parcels, capped at max_target
    - parcels: a list, initially empty, that is later filled with dictionaries with the keys:
        - parcel_id
        - holding_id
        - ranking

    A file that only contains the header row yields an empty dictionary.
    If the same ua_grp_id is listed more than once, the last row wins.
    """
    targets_full_df = pd.read_csv(path)

    # tolist() hands back plain Python values, so bucket ids and targets
    # are ordinary ints/floats rather than numpy scalars
    bucket_ids = targets_full_df["ua_grp_id"].tolist()
    targets = targets_full_df["target1"].tolist()

    buckets = {}
    for bucket_id, target in zip(bucket_ids, targets):
        if target > max_target:
            target = max_target
        # every bucket gets its own list object
        buckets[bucket_id] = {'target': target, 'parcels': []}
    return buckets

def generate_output(buckets):
    """
    Writes the content of the buckets to a timestamped xlsx file in the "output" directory.

    The file contains one row per parcel stored in a bucket, with the following columns:
    - bucket_id
    - parcel_id
    - holding_id
    - ranking
    - target

    Parameters:
    - buckets: dictionary mapping a bucket id to a dictionary with the keys 'target' and
      'parcels', where 'parcels' is a list of dictionaries with the keys
      parcel_id, holding_id and ranking.

    Returns nothing; the result is the file written to disk.
    The "output" directory is created if it does not exist yet.
    """
    output = []
    for bucket_id, bucket in buckets.items():
        for parcel in bucket['parcels']:
            output.append([bucket_id, parcel["parcel_id"], parcel["holding_id"], parcel["ranking"], bucket['target']])
    output_df = pd.DataFrame(output, columns=["bucket_id", "parcel_id", "holding_id", "ranking", "target"])

    # Writing into a missing directory fails, so make sure the folder is there first
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    # The timestamp keeps earlier results from being overwritten by a later run
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    filename = output_dir + "/output_" + timestamp + ".xlsx"
    output_df.to_excel(filename, index=False)

## Row allocation functions

Functions that distribute intervention DataFrame information into the buckets dictionary.

In [41]:
def buckets_full(buckets):
    """
    Tells whether every bucket holds at least as many parcels as its target.
    Returns True when that is the case (also for an empty dictionary), False otherwise.
    """
    for bucket in buckets.values():
        # keep the ">=" form of the comparison so unusual target values behave as before
        target_reached = len(bucket['parcels']) >= bucket['target']
        if not target_reached:
            return False
    return True


def check_holding_group(holding_group, buckets, added_rows):
    """
    Checks the holding group for parcels that can be added to buckets.
    If possible, adds up to 3 rows from the holding group to buckets.

    Parameters:
    - holding_group: dataframe with the rows of a single holding; it must contain the columns
      parcel_id, holding_id, intervention_type_id, ranking and row_id
    - buckets: dictionary of buckets, keyed by intervention type id; modified in place
    - added_rows: set of row_id values that are already in a bucket; modified in place

    A row is added when a bucket exists for its intervention type, that bucket is below
    its target, and the row was not added before. Rows are visited in the order of the dataframe.
    Keeping track of which holdings were already checked is left to the caller.

    Returns the buckets dictionary that was passed in.
    """
    remaining = 3
    for _, holding_row in holding_group.iterrows():
        if buckets_full(buckets) or remaining == 0:
            break
        if holding_row["row_id"] in added_rows:
            continue

        # Bucket ids are the dictionary keys, so the matching bucket (if any)
        # can be looked up directly instead of scanning all buckets
        bucket = buckets.get(holding_row["intervention_type_id"])
        if bucket is not None and len(bucket['parcels']) < bucket['target']:
            bucket['parcels'].append({"parcel_id": holding_row["parcel_id"],
                                      "holding_id": holding_row["holding_id"],
                                      "ranking": holding_row["ranking"],
                                      })
            added_rows.add(holding_row["row_id"])
            remaining -= 1

    return buckets


def check_individual_row(row, buckets, added_rows):
    """
    Checks an individual row for a parcel that can be added to buckets.
    This is only done outside of the holding group check, for parcels / rows that are part of a holding
    that has already been checked.

    Parameters:
    - row: mapping or dataframe row with the keys parcel_id, holding_id, intervention_type_id,
      ranking and row_id
    - buckets: dictionary of buckets, keyed by intervention type id; modified in place
    - added_rows: set of row_id values that are already in a bucket; modified in place

    The row is added when it was not added before, a bucket exists for its intervention type
    and that bucket is below its target. Otherwise nothing changes.

    Returns the buckets dictionary that was passed in.
    """
    if row["row_id"] in added_rows:
        return buckets

    # Bucket ids are the dictionary keys, so the matching bucket (if any)
    # can be looked up directly instead of scanning all buckets
    bucket = buckets.get(row["intervention_type_id"])
    if bucket is not None and len(bucket['parcels']) < bucket['target']:
        bucket['parcels'].append({"parcel_id": row["parcel_id"],
                                  "holding_id": row["holding_id"],
                                  "ranking": row["ranking"],
                                  })
        added_rows.add(row["row_id"])

    return buckets


def iterate_over_interventions(interventions_df, buckets):
    """
    Main loop of the script.
    Walks through the interventions dataframe row by row, in the order of the dataframe,
    and fills the buckets until all of them are full or the rows run out.

    The first time a holding is encountered, all rows of that holding are handed to
    check_holding_group. Rows of a holding that was encountered before are handed
    to check_individual_row one at a time.

    Returns the buckets dictionary.
    """

    print("Buckets: (\033[92mgreen\033[0m = full, \033[93myellow\033[0m = still looking for parcels)")
    added_rows = set()
    seen_holdings = set()

    for _, row in interventions_df.iterrows():
        if buckets_full(buckets):
            break

        holding_id = row["holding_id"]
        if holding_id in seen_holdings:
            buckets = check_individual_row(row, buckets, added_rows)
        else:
            seen_holdings.add(holding_id)
            # select every row of this holding from the complete dataframe
            belongs_to_holding = interventions_df["holding_id"] == holding_id
            buckets = check_holding_group(interventions_df[belongs_to_holding], buckets, added_rows)

        # progress is reported after every processed row
        cosmetics_tools.print_progress(buckets)

    return buckets

## Execute solution

In [42]:
# Locations of the two input files
interventions_path = "input/MT_ua_grp_tiles.csv"
targets_path = "input/MT_view_target_sample_size.csv"

# Load the ranked interventions and the empty buckets
interventions_df = extract_interventions(interventions_path)
buckets = extract_buckets(targets_path)

# Fill the buckets; the dictionary is updated in place, so the return value is not needed
iterate_over_interventions(interventions_df, buckets)

# Indicate the reason why the run ended
end_message = "\nAll buckets full!" if buckets_full(buckets) else "\nSome buckets not full!"
print(end_message)

# Save the selected parcels
generate_output(buckets)
print("\nOutput file generated.")

Buckets: ([92mgreen[0m = full, [93myellow[0m = still looking for parcels)
[92m1: 70/70[0m | [92m2: 300/300[0m | [92m3: 1/1[0m | [92m4: 24/24[0m | [92m5: 0/0[0m | [92m6: 4/4[0m | [92m7: 0/0[0m | [92m8: 250/250[0m | [92m9: 2/2[0m | [92m10: 50/50[0m | [92m11: 0/0[0m | [92m12: 6/6[0m | [92m13: 20/20[0m | [92m15: 8/8[0m | [92m16: 140/140[0m | [92m17: 0/0[0m | [92m18: 3/3[0m | [92m19: 0/0[0m | [92m20: 14/14[0m | [92m21: 300/300[0m | [92m22: 300/300[0m | 
All buckets full!

Output file generated.
