In [1]:
import pandas as pd
import numpy as np

def cil_process(df, df_ref):
    # Define values to drop 
    doc_type_values_to_drop = [np.nan, 'OTHER']

    # Drop rows 
    df = df[~df["document type"].isin(doc_type_values_to_drop) & df["document type"].notna()]
    df = df[~df["document-url"].isin(doc_type_values_to_drop) & df["document-url"].notna()]
    #df = df[~df["documentation-url"].isin(doc_type_values_to_drop) & df["documentation-url"].notna()]
    #df = df[df["documentation-url"].str.startswith("http", na=False)]
    
    # Drop rows where the value in any column is "No CIL"
    df = df[df["adopted-date"] != "No CIL"]

    # Strip `None` and "Cannot find a page", leaving cells blank
    df = df.replace([None, "Cannot find a page"], "")

    # Set `adopted-date` to blank if it contains "N/A" or is None
    df['adopted-date'] = df['adopted-date'].replace(["N/A", None], "")

    # Copy "On hold" or "In Discussion" to `notes` and set `adopted-date` to blank
    mask = df['adopted-date'].isin(['On hold', 'In Discussion'])
    df.loc[mask, 'notes'] = df.loc[mask, 'adopted-date']
    df.loc[mask, 'adopted-date'] = ""

    # Only select relevant columns from df_ref
    df_ref = df_ref[["local-authority-code", "official-name"]]

    # Extract the codes from both df0 and df1
    df['org_code'] = df['organisation']
    df_ref['org_code'] = df_ref['official-name']

    # Perform a left merge and replace organisation with extracted 3-letter codes
    df = pd.merge(df, df_ref, on='org_code', how='left')
    df['organisation'] = df['local-authority-code']

    # Prepend 'local-authority:' to each entry in the 'organisation' column
    df['organisation'] = "local-authority: " + df['organisation']

    # Drop reduntant columns
    df = df.drop(columns=['local-authority-code', 'official-name', 'org_code'])

    # Create and save CIL dataset
    cil_df = df[df["document type"]=="CIL"]
    cil_df.to_csv('cil_dataset.csv', index=False)

    # Create and save IFS dataset
    ifs_df = df[df["document type"]=="IFS"]
    ifs_df.to_csv('ifs_dataset.csv', index=False)


# Load dataset
df = pd.read_csv("CIL_schedule_documents - Sheet1.csv")
# Load the organisation reference data
df_ref = pd.read_csv("C:/Users/DanielGodden/Documents/planning_data/local_plan_data_collection/documents/uk_local_authorities_future.csv")

cil_process(df, df_ref)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ref['org_code'] = df_ref['official-name']
