In [3]:
from google.colab import files
uploaded = files.upload()

Saving CO_U and CO_C Wells with Ra data DNR merged -S&G in MCOAS.xlsx to CO_U and CO_C Wells with Ra data DNR merged -S&G in MCOAS.xlsx


In [4]:
import pandas as pd
from datetime import datetime


def read_excel_file(file_path, sheet_name):
    df = pd.read_excel(file_path, sheet_name=sheet_name, parse_dates=['Year'])
    df['Year'] = df['Sample Date'].dt.year
    print(df.columns)  # Print column names
    return df

def datetime_columns_to_str(df):
    for column in df.columns:
        if pd.api.types.is_datetime64_any_dtype(df[column]):
            df[column] = df[column].astype(str)
    return df

def wells_sampled_once_per_year(df):
    well_ids = df["WI_UNIQUE_"]
    concentrations = df["Measured A"]
    years = df["Sample Date"]

    samples_once_per_year = {}
    concentration_dict = {}

    for well_id, conc, year in zip(well_ids, concentrations, years):
        if (well_id, year) in samples_once_per_year:
            samples_once_per_year[(well_id, year)] += 1
        else:
            samples_once_per_year[(well_id, year)] = 1
            concentration_dict[(well_id, year)] = conc

    wells_sampled_once = [(well_id, year, concentration_dict[(well_id, year)])
                          for (well_id, year), count in samples_once_per_year.items() if count == 1]

    return wells_sampled_once

def wells_sampled_multiple_per_year(df):
    well_ids = df["WI_UNIQUE_"]
    concentrations = df["Measured A"]
    years = df["Sample Date"]

    samples_multiple_per_year = {}
    concentration_dict = {}

    for well_id, conc, year in zip(well_ids, concentrations, years):
        if (well_id, year) in samples_multiple_per_year:
            samples_multiple_per_year[(well_id, year)] += 1
            if concentration_dict[(well_id, year)] < conc:
                concentration_dict[(well_id, year)] = conc
        else:
            samples_multiple_per_year[(well_id, year)] = 1
            concentration_dict[(well_id, year)] = conc

    wells_sampled_multiple = [(well_id, year, concentration_dict[(well_id, year)])
                              for (well_id, year), count in samples_multiple_per_year.items() if count > 1]

    return wells_sampled_multiple

def combine_wells_sampled(df):
    all_wells = wells_sampled_once_per_year(df)
    wells_sampled_multiple = wells_sampled_multiple_per_year(df)
    combined_wells = all_wells + wells_sampled_multiple
    return combined_wells

def main():
    file_path = "CO_U and CO_C Wells with Ra data DNR merged -S&G in MCOAS.xlsx"
    sheet_name = "CO_C"

    df = read_excel_file(file_path, sheet_name)
    print(df)

    combined_wells = combine_wells_sampled(df)
    print(len(combined_wells))
    print(combined_wells)

    df_combined_wells = pd.DataFrame(combined_wells, columns=["WI_UNIQUE_", "Sample Date", "Measured A"])

    merged_df = pd.merge(df, df_combined_wells, how="left", on=["WI_UNIQUE_", "Sample Date"])
    merged_df = datetime_columns_to_str(merged_df)
    merged_df.to_excel("updated_wells_data_final2.xlsx", sheet_name=sheet_name, index=False)

if __name__ == "__main__":
    main()


ValueError: ignored