#### **Import Libraries and Define Functions**

In [None]:
import pandas as pd
import numpy as np
import os
from pydantic import BaseModel
import openai
from tqdm.notebook import tqdm
# register tqdm's pandas integration; generate_ai_fields relies on DataFrame.progress_apply
tqdm.pandas()

# root folder used for both the input CSV and the exported CSVs
save_path = "..." # path to folder containing subfolders called Flagged Apps in Batches & Randomized Apps in Batches
# NOTE(review): the API key is assigned as a literal here - avoid committing a real key;
# presumably it could be supplied via an environment variable instead (confirm with the team)
openai.api_key = '...'

In [None]:
# create genAI prompt variable
# clu_prompt is the system prompt generate_ai_fields passes to comp() for rows whose
# application type mentions "Lawful development"; it asks the model to report the units
# that already exist as the "Total proposed units" output.
# NOTE: the literal is continued with trailing backslashes, so the leading indentation of every
# continuation line is part of the prompt text; .strip() only trims the two ends of the string.

clu_prompt = "AI Assistant Role: You assist a Planning Officer at the GLA. You specialize in inferring residential units from continued\
             (or lawful) planning applications’ descriptions. Note that changes proposed in these applications have already happened and\
             so your job is to infer the existing number of units and return them as the Total proposed units output. This is important\
             as the current use of these residential units are reported as recent completions. Follow these rules obtaining unit counts\
             from the descriptions:\
             1. Count only 'houses,' 'homes,' 'flats,' 'self-contained units,' or C3-use ‘dwellings’ unless specified in rules 2–4.\
             2. Count HMOs with <7 rooms (or beds or occupants) as 1 unit gained/lost; if ≥7, count HMO rooms gained/lost instead.\
             3. Count co-living, granny annexes (not to be confused with ancillary annexes), and older people’s apartments as residential units.\
             4. Do not count 'communal space', 'garages', 'habitable spaces' or ‘habitable rooms’ or 'outbuildings' as residential units.\
             5. Infer units only from explicit numbers like '5 flats' or ‘2 no. dwellinghouses’; assume 1 unit for phrases like 'a house'\
             or 'ground floor flat' unless otherwise specified.\
             6. Return -1 for vague terms (e.g., 'several dwellings', unquantified HMO) or unclear unit counts.\
             7. Do not infer unit counts from addresses; return -1 if units are unclear.\
             8. If the use from which the property is changing is provided, return this as the Total existing units value, otherwise set this\
             value as 0.\
             Outputs:\
             Generate:\
             1. Total proposed units,\
             2. Total existing units,\
             3. Proposed HMOs rooms (≥7 rooms),\
             4. Existing HMOs rooms (set to 0), and\
             5. A short comment (max. 150 characters) explaining ambiguities or rule applications.".strip()

# General system prompt: the default `prompt` argument of generate_ai_fields, sent to comp()
# for every row that is not a lawful-development application.
# The literal is continued with trailing backslashes, so each continuation line's leading
# indentation is part of the prompt text and no newline characters end up in the string.
# Every physical line inside the literal - including the visual spacer before "Outputs:" -
# must end with a backslash, otherwise the string is unterminated and the cell fails to parse.
prompt = "AI Assistant Role: You assist a Planning Officer at the GLA. You specialize in detecting applications that propose changes in residential\
          units. From these applications, infer the number of existing units (units before proposed changes) as well as proposed units (units after\
          proposed changes) from applications’ descriptions based on these rules:\
          1. Count only 'houses,' 'homes,' 'flats,' 'self-contained units,' or C3-use ‘dwellings’ as residential units unless otherwise specified in\
          rules 3–5.\
          2. Infer units only from explicit numbers like '5 flats' or ‘2 no. dwellinghouses’; assume 1 unit for phrases like 'a house' or 'ground\
          floor flat' unless otherwise specified.\
          3. Count HMOs with <7 rooms (or beds or occupants) as 1 residential unit; if ≥7, count HMO rooms gained/lost instead.\
          4. Count co-living, granny annexes (not ancillary annexes), and older people’s apartments as residential units.\
          5. Do not count 'communal space', 'garages', 'habitable spaces' or ‘habitable rooms’ or 'outbuildings' as residential units.\
          6. 'Buildings' / 'properties' can contain multiple units. Do not assume residential unit counts unless these are explicitly stated (see rule\
          2); if vague, return -1.\
          7. Count C1/C2/assisted living homes as 0 residential units unless self-contained.\
          8. Units gained will usually be predicated by terms like 'erection', 'new build', sometimes (but not always) by 'extension'; unless demolition\
          is stated, units lost are zero.\
          9. 'Change of use' applications should have either gains or losses; if there is also a 'conversion', there should be gain as well as a loss.\
          10. ‘Conversions’ (e.g., reconfigurations) don’t cause unit changes unless losing or adding residential unit(s).\
          11. Return -1 for vague terms (e.g., 'several dwellings', unquantified HMO) or unclear unit counts.\
          12. Do not infer unit counts from addresses; return -1 where units are unclear.\
          13. Units lost may also be predicated by terms like 'demolition', 'amalgamation', 'replacement'.\
          14. 'Alterations' or 'remodellings' usually do not cause units changes. Assume no change unless stated otherwise.\
          15. Applications referencing non-C3/C4 classes (e.g., A1, B2) are not residential units unless stated otherwise.\
          16. For phased applications, consider only units proposed or lost in the phase described; unless total units are mentioned, in which case count\
          total units.\
          17. Ignore any 50+ character text within quotation marks, brackets, or following a colon.\
\
          Outputs:\
          Generate:\
          1. Total proposed units,\
          2. Total existing units,\
          3. Proposed HMOs rooms (≥7 rooms),\
          4. Existing HMOs rooms (≥7 rooms), and\
          5. A short comment (max. 150 characters) explaining ambiguities or rule applications.".strip()

In [None]:
def cleanup(df):
    """Clean a raw applications extract and drop rows that should not be sent to the model.

    Steps, in order:
      1. cast every ``description`` to ``str`` and strip carriage returns / line feeds;
      2. drop rows whose ``application_type_full`` is one of the excluded application types;
      3. drop rows whose ``description`` contains any excluded phrase
         (case-insensitive, literal substring match);
      4. rename ``Total Proposed Units`` -> ``Units Gained`` and ``Total Lost Units`` -> ``Units Lost``;
      5. replace every remaining null with 0.

    The frame passed in is not modified; a new DataFrame is returned.
    The shape before and after cleaning is printed.
    """
    excluded_types = ["Approval of details reserved by a condition (discharge)",
                      "Prior Approval: Development for electronic communications network",
                      "S73 Minor Material Amendment", "Removal/Variation of a condition",
                      "Listed building consent", "Non-Material Amendment",
                      "Consent to display an advertisement", "Householder planning permission",
                      "Householder planning & listed building consent",
                      "Prior Approval: Roof mounted solar PV on non-domestic building",
                      "Tree works: Trees in conservation areas/subject to TPOs",
                      "Prior Approval: Building for agricultural/forestry use",
                      "Prior Approval: Private road for agricultural/forestry use",
                      "Prior Approval: Change of use - agriculture to flexible commercial use",
                      "Prior Approval: Temporary use for commercial film-making"]
    excluded_phrases = ['non-material amendment', 'planning permission', 'variation',
                        'condition', 'pursuant to', 'reference']

    print("Dataframe shape before cleanup: {}".format(df.shape))

    # work on a copy so the caller's frame keeps its original descriptions and column names
    cleaned = df.copy()
    cleaned['description'] = cleaned['description'].apply(
        lambda x: str(x).replace("\r", "").replace("\n", ""))

    # build one keep-mask instead of re-slicing the frame once per filter
    keep = ~cleaned['application_type_full'].isin(excluded_types)
    for phrase in excluded_phrases:
        keep &= ~cleaned['description'].str.contains(phrase, case=False, na=False, regex=False)

    # rename() without inplace avoids modifying a filtered slice in place
    cleaned = cleaned[keep].rename(
        columns={'Total Proposed Units': 'Units Gained', 'Total Lost Units': 'Units Lost'})
    cleaned = cleaned.fillna(0)

    print("Dataframe shape after cleanup: {}".format(cleaned.shape))
    return cleaned

class Units(BaseModel):
    """class to format the genAI model outputs"""
    # Structured-output schema: comp() passes this class as response_format and returns the parsed instance.
    # NOTE(review): pydantic presumably exposes the class docstring and field names in the JSON schema
    # forwarded to the API - confirm before rewording the docstring or renaming fields.
    # The prompts ask the model to return -1 when a count is vague or unclear.
    proposed_units: int  # read into 'AI Units Gained' by generate_ai_fields
    existing_units: int  # read into 'AI Units Lost' by generate_ai_fields
    proposed_hmo_rooms: int  # read into 'AI Units HMO Gained' by generate_ai_fields
    existing_hmo_rooms: int  # read into 'AI Units HMO Lost' by generate_ai_fields
    comments: str  # read into 'AI Comments'; the prompts request a short explanation (max. 150 characters)

def comp(given_prompt, query, model="gpt-4o"):
    """Send one description to an OpenAI chat model and return the structured answer.

    given_prompt -- text used as the system message
    query        -- application description; sent as the user message prefixed with "Description:\\n"
    model        -- OpenAI model name (defaults to "gpt-4o")

    Returns the ``parsed`` attribute of the first choice's message, with ``Units``
    supplied as the response format.
    NOTE(review): presumably ``parsed`` can be None if the model refuses - confirm against the SDK docs.
    """
    system_message = {"role": "system", "content": given_prompt}
    user_message = {"role": "user", "content": "Description:\n" + query}
    response = openai.beta.chat.completions.parse(
        model=model,
        messages=[system_message, user_message],
        response_format=Units,
    )
    first_choice = response.choices[0]
    return first_choice.message.parsed

def generate_ai_fields(df, clean_data = True, save_path = save_path, output_name = "unnamed", prompt = prompt):
    """Run the genAI model over every row of ``df`` and flag rows where it disagrees with the source data.

    Parameters
    ----------
    df : DataFrame with at least ``description``, ``application_type_full``,
        ``Units Gained`` and ``Units Lost`` columns (the latter two are produced by ``cleanup``).
    clean_data : whether ``df`` has already been cleaned; when falsy, ``cleanup(df)`` is applied first.
    save_path : root folder containing the "Flagged Apps in Batches" and
        "Randomized Apps in Batches" subfolders the CSVs are written to.
    output_name : file name (without extension) used for both CSV exports.
    prompt : system prompt for rows that are not lawful-development applications;
        lawful-development rows always use ``clu_prompt``.

    Side effects
    ------------
    * the ``AI ...`` columns are assigned directly onto the frame being processed
      (the caller's frame when ``clean_data`` is truthy);
    * two CSVs are written: every processed row, and the flagged subset.

    Returns
    -------
    The rows where ``Units Gained`` or ``Units Lost`` differs from the AI value.
    """
    if not clean_data:
        df = cleanup(df)
    print("Processing {}".format(output_name))

    def _infer(row):
        # with axis=1 each cell is a plain scalar, so the application type is tested with
        # ordinary string operations (case-insensitive, non-strings simply do not match)
        is_lawful = "lawful development" in str(row['application_type_full']).lower()
        return comp(clu_prompt if is_lawful else prompt, row['description'], "gpt-4o")

    # progress_apply shows how many rows have been processed
    df['AI results'] = df.progress_apply(_infer, axis = 1)

    # unpack the structured model output into one column per field
    extracted_columns = (('AI Units Gained', 'proposed_units'),
                         ('AI Units Lost', 'existing_units'),
                         ('AI Units HMO Gained', 'proposed_hmo_rooms'),
                         ('AI Units HMO Lost', 'existing_hmo_rooms'),
                         ('AI Comments', 'comments'))
    for column, field in extracted_columns:
        df[column] = df['AI results'].apply(lambda result, field=field: getattr(result, field))

    # -1 is the value the prompts ask for when a count cannot be determined
    df = df.replace(-1, "Indecipherable")
    df['Same Units Gained'] = df['Units Gained'] == df['AI Units Gained']
    df['Same Units Lost'] = df['Units Lost'] == df['AI Units Lost']

    # applications where ai disagrees with the source data on gains or losses
    flagged = df[(df['Same Units Gained'] == False) | (df['Same Units Lost'] == False)]

    # save output on the drive (useful if processing multiple batches to prevent data loss in case of code break)
    file_name = "{}.csv".format(output_name)
    flagged.to_csv(os.path.join(save_path, "Flagged Apps in Batches", file_name), index = False)
    df.to_csv(os.path.join(save_path, "Randomized Apps in Batches", file_name), index = False) # export all ai's inferences

    return flagged

#### **Run Model on Decided Applications Data**

In [None]:
# load the decided-applications extract from save_path, then run it through cleanup()
decided_apps_csv = r"{}/2024 AI Description Checks v2.csv".format(save_path)
data = pd.read_csv(decided_apps_csv, encoding='unicode_escape')
data = cleanup(data)

In [None]:
# split the data into 8 for batch processing
# Rows are shuffled once with a fixed seed, then cut at each eighth of the row count.
# Positional slicing is used instead of np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes.
n_batches = 8
shuffled = data.sample(frac=1, random_state=42)
bounds = [int(k / n_batches * len(data)) for k in range(n_batches + 1)]
data1, data2, data3, data4, data5, data6, data7, data8 = [
    shuffled.iloc[start:stop] for start, stop in zip(bounds, bounds[1:])
]
data1.shape, data2.shape, data3.shape, data4.shape, data5.shape, data6.shape, data7.shape, data8.shape

In [None]:
# sanity check on the split: stacking the batches back together should give the same
# number of rows as the initial dataframe (i.e. no application is repeated or lost)
flagged_dfs = [data1, data2, data3, data4, data5, data6, data7, data8]
pd.concat(flagged_dfs).shape

In [None]:
# generate ai fields for applications in batch dfs
# output_name becomes the CSV file name for each batch; the batch index i is 0-based
for i, df in enumerate(flagged_dfs):
    generate_ai_fields(df, True, save_path, "Flagged Apps - Batch {}".format(i))

In [None]:
# stack the eight batch frames back into a single frame
# NOTE(review): this concatenates the batch inputs themselves, not the frames returned by
# generate_ai_fields - confirm that is the intended content of the export below
batch_frames = [data1, data2, data3, data4, data5, data6, data7, data8]
flagged = pd.concat(batch_frames)
# export flagged applications in single file (optional)
combined_csv = r"{}\Flagged Apps in Batches\AI Decription Checks (2024 approved apps) - reprocessed v2.csv".format(save_path)
flagged.to_csv(combined_csv, index = False)
# show distribution of flagged applications per planning authority
flagged['lpa_name'].value_counts()

In [None]:
# !jupyter nbconvert --to html application_units_generator.ipynb