#### **Import Libraries and Define Functions**

In [1]:
import pandas as pd
import numpy as np
import os
from pydantic import BaseModel
import openai
from tqdm.notebook import tqdm
tqdm.pandas()
import warnings
warnings.simplefilter('ignore')

save_path = "..." # path to folder containing subfolders called Flagged Apps in Batches & Randomized Apps in Batches
openai.api_key = '...'

In [9]:
# create genAI prompt variable

clu_prompt = "AI Assistant Role: You assist a Planning Officer at the GLA. You do not make assumptions about development changes, instead you\
              specialize in inferring residential units change from continued (or lawful) planning applications’ descriptions. The interesting thing\
              about these applications is the change has already happened. Therefore, your job is first to determine whether the description is\
              referencing residential changes in the property, and second to count the number of units currently existing in the property using the\
              following rules and finally to return that value as the Total proposed units or Proposed HMO rooms as applicable:\
              1. Count only C3-use units i.e., 'houses,' 'homes,' 'flats,' 'self-contained units,' or  ‘dwellings’ as residential units unless\
              otherwise specified in rules 2–4.\
              2. Count HMOs with <7 rooms (or beds or occupants) or 'small HMOs' as 1 residential unit. If ≥7 rooms (or beds or occupants) or called\
              'large HMOs', return count of HMO rooms (or beds or occupants) as proposed or existing HMO rooms instead; return 'unclear' if vague.\
              3. Count co-living, granny annexes (not ancillary annexes), and older people’s apartments as residential units.\
              4. Do not count 'communal space', 'garages', 'habitable spaces' or ‘habitable rooms’ or 'outbuildings' as residential units.\
              5. Only infer units from explicit numbers like '5 flats' or ‘2 no. dwellinghouses’; assume 1 unit for phrases like 'a house'\
              or 'ground floor flat' unless otherwise specified.\
              6. Return 'unclear' for inexplicit unit counts, addresses or vague terms e.g., 'several dwellings', 'property', 'building', unquantified\
              HMO.\
              7. Sometimes, the prior use of the property will be mentioned in the description. In such a case, also return the count of the prior use\
              units as the total existing units output using rules 1 to 6 above. Return 0 if not mentioned at all.\
              8. Set all numerical output values to zero if the description only references changes to the structure of the property e.g.,'Installation\
              of roof lights on rear dormer of a dwellinghouse'.\
              Outputs:\
              1. Residential units change (True or False)\
              2. Total proposed units: Total number of residential units that will exist after the proposed changes are made,\
              3. Total existing units: Total number of residential units that existed before the proposed changes are made,\
              4. Proposed HMOs rooms (≥7 rooms): Total number of HMO units (rooms/beds/occupants) that will exist in the property after the proposed\
              changes are made,\
              5. Existing HMOs rooms (≥7 rooms): Total number of HMO units (rooms/beds/occupants) that existed in the property before the proposed\
              changes are made, and\
              6. A short comment (max. 150 characters) explaining ambiguities, rule applications or other reason(s) for results.\
              For example,\
              the description 'Use of a dwellinghouse as a 8-bedroom HMO' should return the following outputs: True, 0, 1, 8, 0 & a comment in the\
              respective outputs while the description 'Extension of roof to side to form gable, rear dormer and front roof lights.' should return\
              False, 0, 0, 0, 0 & a comment".strip()

prompt = "AI Assistant Role: You assist a Planning Officer at the GLA. From an application's description, you specialize in determininge whether or not\
          its proposed change will result in additional or fewer residential units in the property. Where it does not, return 0 across all numerical\
          outputs. Where it does, infer the number of existing and proposed residential units based on the following rules:\
          1. Count only C3-use units i.e., 'houses,' 'homes,' 'flats,' 'self-contained units,' or  ‘dwellings’ as residential units unless otherwise\
          specified in 2-4.\
          2. Count HMOs with <7 rooms (or beds or occupants) or 'small HMOs' as 1 residential unit. If ≥7 rooms (or beds or occupants) or called 'large\
          HMOs', return count of HMO rooms (or beds or occupants) as proposed or existing HMO rooms instead; return 'unclear' if vague.\
          3. Count co-living, granny annexes (not ancillary annexes), and older people’s apartments as residential units.\
          4. Do not count 'communal space', 'garages', 'habitable spaces' or ‘habitable rooms’ or 'outbuildings', C1/C2/assisted living homes, non-C3/C4\
          classes (e.g., A1, B2) as residential units.\
          5. Infer units only from explicit numbers like '5 flats' or ‘2 no. dwellinghouses’; assume 1 unit for phrases like 'a house' or 'ground\
          floor flat' unless otherwise specified.\
          6. Return 'unclear' for inexplicit unit counts or vague terms e.g., 'several dwellings', 'property', 'building', unquantified HMO.\
          7. Units gained will usually be predicated by terms like 'erection', 'new build', sometimes (but not always) by 'extension'.\
          9. 'Change of use' applications will either cause additional or fewer residential units unless there is also a 'conversion', in which case\
          there can be both.\
          10. 'Extensions' sometimes do not result in residential unit changes.\
          11. Do not infer unit counts from addresses; return 'unclear' where units are unclear.\
          12. For descriptions that cite 'demolitions', 'amalgamations' or 'replacements' of properties, set exisiting units to the number of\
          residential units that will be demolished, almagamated or replaced; return 'unclear' if units are unclear.\
          13. For phased applications, consider only residential units proposed or to be lost in the phase described.\
          14. Ignore any 50+ character text within quotation marks, brackets, or following a colon.\
          Outputs:\
          1. Residential units change (True or False)\
          2. Total proposed units: Total number of residential units that will exist after the proposed changes are made,\
          3. Total existing units: Total number of residential units that existed before the proposed changes are made,\
          4. Proposed HMOs rooms (≥7 rooms): Total number of HMO units (rooms/beds/occupants) that will exist in the property after the proposed\
          changes are made,\
          5. Existing HMOs rooms (≥7 rooms): Total number of HMO units (rooms/beds/occupants) that existed in the property before the proposed changes\
          are made, and\
          6. A short comment (max. 150 characters) explaining ambiguities, rule applications or other reason(s) for results.".strip()

In [4]:
def cleanup(df):
    """function to clean up datasets obtained from the database, drop non-residential applications, rename relevant columns and fill null values"""
    print("Dataframe shape before cleanup: {}".format(df.shape)) # print dataframe size before cleanup
    df['description'] = df['description'].apply(lambda x: str(x).replace("\r", "").replace("\n", "")) # remove linebreaks
    df['application_type_full'] = df['application_type_full'].replace({0: ""})
    df = df[~(df['application_type_full'].isin(["Approval of details reserved by a condition (discharge)",
                                                "Prior Approval: Development for electronic communications network",
                                                "S73 Minor Material Amendment", "Removal/Variation of a condition",
                                                "Listed building consent", "Non-Material Amendment",
                                                "Consent to display an advertisement", "Householder planning permission",
                                                "Householder planning & listed building consent",
                                                "Prior Approval: Roof mounted solar PV on non-domestic building",
                                                "Tree works: Trees in conservation areas/subject to TPOs",
                                                "Prior Approval: Building for agricultural/forestry use",
                                                "Prior Approval: Private road for agricultural/forestry use",
                                                "Prior Approval: Change of use - agriculture to flexible commercial use",
                                                "Prior Approval: Temporary use for commercial film-making", "nan"]))] # filter out non-res applications using categorical column
    df = df[~(df['description'].str.contains('non-material amendment|environmental impact assessment|planning permission|variation|condition|pursuant to|reference',
                                             case=False, na=False, regex = False))] # filter out non-res applications using free-text column
    df = df.fillna(0) # fill other missing values
    print("Dataframe shape after cleanup: {}".format(df.shape)) # print dataframe size after cleanup
    return df

class Units(BaseModel):
    """class to format the genAI model outputs. Setting all non boolean fields to string as unclear outputs are to return the string value 'unclear'"""
    residential_change: bool
    proposed_units: str
    existing_units: str
    proposed_hmo_rooms: str
    existing_hmo_rooms: str
    comments: str

def comp(given_prompt, query, model="gpt-4o"):
    """Function to take in a prompt, a query (or description) and a preferred openAI model, run the given prompt on the model and return output in
    Units class format"""
    completion = openai.beta.chat.completions.parse(model = model,
                                                    messages = [{"role": "system", "content": given_prompt},
                                                                {"role": "user", "content": "Description:\n" + query}
                                                               ],
                                                    response_format = Units
                                                   )
    event = completion.choices[0].message.parsed
    return event

def clu_or_full(row):
    """Function to run process the description using the clu_prompt if it is a continued or lawful use (clu) development"""
    if "Lawful development" in row['application_type_full']:
        return comp(clu_prompt, row['description'], "gpt-4o")
    else:
        return comp(prompt, row['description'], "gpt-4o")

def generate_ai_fields(df, clean_data = True, save_path = save_path, output_name = "unnamed"):
    """Function to run genAI model on the dataframe and extract units gained, units lost, hmo rooms gained, hmo units lost and comments columns from\
       the outputs. Also to create columns flagging applications where genAI disagrees with the source data"""
    if clean_data == False:
      df = cleanup(df) # cleanup dataframe if not already clean
    print("Processing {}⏳".format(output_name))
    df['AI results'] = df.progress_apply(clu_or_full, axis = 1) # using progress_apply to show how many rows have been processed
    df['Residential Change'] = df['AI results'].apply(lambda x: x.residential_change) # will the app cause a residential units increase/decrease?
    df['AI Units Gained'] = df['AI results'].apply(lambda x: x.proposed_units) # extract units gained
    df['AI Units Lost'] = df['AI results'].apply(lambda x: x.existing_units) # extract units lost
    df['AI Units HMO Gained'] = df['AI results'].apply(lambda x: x.proposed_hmo_rooms) # extract HMO rooms gained
    df['AI Units HMO Lost'] = df['AI results'].apply(lambda x: x.existing_hmo_rooms) # extract HMO rooms lost
    df['AI Comments'] = df['AI results'].apply(lambda x: x.comments) # extract AI comments
    df['Same Units'] = df['DB Units Gained'] == df['AI Units Gained'].replace({'unclear': 0}) | df['DB Units Lost'] == df['AI Units Lost'].replace({'unclear': 0}) # create column to check where AI disagrees with the database

    # save output on the drive (useful if processibg multiple batches to prevent data loss in case of code break)
    output = df[df['Same Units'] == False]
    output.to_csv(r"{}\Flagged Apps in Batches\{}.csv".format(save_path, output_name), index = False) # export applications where ai disagrees with the source data
    df.to_csv(r"{}\AI Inferences in Batches\{}.csv".format(save_path, output_name), index = False) # export all AI's inferences
    print("{} applications were flagged in this batch".format(output.shape[0])) # print number of applications flagged in each processing batch
    return output # return output

In [10]:
query = "Use as a 6-bedroom HMO"
comp(clu_prompt, query)

Units(residential_change=True, proposed_units='1', existing_units='0', proposed_hmo_rooms='0', existing_hmo_rooms='0', comments='6-bedroom HMOs are counted as 1 residential unit.')

#### **Run Model on Decided Applications Data**

In [None]:
# read in decided applications
data = pd.read_excel(r"{}\2024 AI Description Checks v2.xlsx".format(save_path))
data = cleanup(data)

In [None]:
# take out apps not flagged in previous run
processed_data = pd.read_csv(r"{}\Randomized Apps in Batches\Flagged Applications - Batch 0".format(save_path))
flagged_data = pd.read_csv(r"{}\Flagged Apps per Batch\Flagged Applications - Batch 0".format(save_path))
to_exclude = set(processed_data.id.values) - set(flagged_data.id.values)
data = data[~data['id'].isin(to_exclude)]

Dataframe shape before cleanup: (48, 11)
Dataframe shape after cleanup: (48, 11)
Processing Flagged Apps rerun (Havering)⏳


  0%|          | 0/48 [00:00<?, ?it/s]

43 applications were flagged in this batch


Unnamed: 0,Count of id,id,lpa_name,lpa_app_no,decision,decision_date,description,Units Gained,Units Lost,application_type,application_type_full,AI results,AI Units Gained,AI Units Lost,AI Units HMO Gained,AI Units HMO Lost,AI Comments,Same Units Gained,Same Units Lost
12696,1,Havering-E0011.23,Havering,E0011.23,Approved,2024-03-13,The use of the property as three self-containe...,6.0,2.0,All Other,Lawful development: Existing use,proposed_units=3 existing_units=0 proposed_hmo...,3,0,0,0,Counted 3 self-contained units as residential ...,False,False
12701,1,Havering-J0001.24,Havering,J0001.24,Approved,2024-03-04,The proposed development comprises the change ...,0.0,0.0,Prior Approval,,proposed_units=4 existing_units=0 proposed_hmo...,4,0,0,0,4 new self-contained dwellings proposed from f...,False,True
12703,1,Havering-J0003.24,Havering,J0003.24,Approved,2024-04-24,Application for prior notification of developm...,0.0,0.0,Prior Approval,,proposed_units=5 existing_units=0 proposed_hmo...,5,0,0,0,Proposes 5 new self-contained flats; no existi...,False,True
12704,1,Havering-J0004.24,Havering,J0004.24,Approved,2024-03-27,Application for prior notification of developm...,0.0,0.0,Prior Approval,,proposed_units=1 existing_units=0 proposed_hmo...,1,0,0,0,"1 new dwellinghouse proposed, change of use fr...",False,True
12705,1,Havering-J0005.24,Havering,J0005.24,Approved,2024-03-27,"Prior Approval for a Change of Use to 2 Flats,...",0.0,0.0,Prior Approval,,proposed_units=2 existing_units=0 proposed_hmo...,2,0,0,0,Change of use indicates gain; internal reconfi...,False,True
12706,1,Havering-J0006.24,Havering,J0006.24,Approved,2024-03-27,Change of use from use class E to use class C3,0.0,0.0,Prior Approval,,proposed_units=-1 existing_units=-1 proposed_h...,Indecipherable,Indecipherable,0,0,No explicit unit numbers; unclear how many uni...,False,False
12707,1,Havering-J0016.24,Havering,J0016.24,Approved,2024-08-15,Application for prior notification of developm...,0.0,0.0,Prior Approval,,proposed_units=-1 existing_units=0 proposed_hm...,Indecipherable,0,0,0,No explicit number of proposed residential uni...,False,True
12708,1,Havering-J0017.24,Havering,J0017.24,Approved,2024-09-10,Application for prior notification for Change ...,0.0,0.0,Prior Approval,,proposed_units=-1 existing_units=0 proposed_hm...,Indecipherable,0,0,0,Units unclear; no explicit numbers provided in...,False,True
12711,1,Havering-J0045.23,Havering,J0045.23,Approved,2024-02-07,Application for Prior Notification of Change o...,0.0,0.0,Prior Approval,,proposed_units=2 existing_units=0 proposed_hmo...,2,0,0,0,Conversion from retail to 2 dwellings; no exis...,False,True
12757,1,Havering-P0006.24,Havering,P0006.24,Approved,2024-02-29,Addition of first floor to create a chalet bun...,0.0,0.0,All Other,,proposed_units=1 existing_units=1 proposed_hmo...,1,1,0,0,No change in units; the proposal is for altera...,False,False


In [None]:
# split the data into 8 for batch processing
data1, data2, data3, data4, data5, data6, data7, data8 = np.split(data.sample(frac=1, random_state=42),
                                                                  [int(.125*len(data)), int(.25*len(data)), int(.375*len(data)), int(.5*len(data)),
                                                                   int(.625*len(data)), int(.75*len(data)), int(.875*len(data))])
data1.shape, data2.shape, data3.shape, data4.shape, data5.shape, data6.shape, data7.shape, data8.shape

In [None]:
# test that split dfs have no repeated applications (concatenated dataframe rows should equal intial dataframe rows)
flagged_dfs = [data1, data2, data3, data4, data5, data6, data7, data8]
pd.concat([data1, data2, data3, data4, data5, data6, data7, data8]).shape

In [None]:
generate_ai_fields(data, True, save_path, "Flagged Apps - Batch 1")

Processing Flagged Apps - Batch 1⏳


  0%|          | 0/986 [00:00<?, ?it/s]

TypeError: argument of type 'float' is not iterable

In [None]:
# generate ai fields for applications in batch dfs
for i, df in enumerate(flagged_dfs):
    generate_ai_fields(df, True, save_path, "Flagged Apps - Batch {}".format(i+1))

In [None]:
# concatenate
flagged = pd.concat([data1, data2, data3, data4, data5, data6, data7, data8])
# export flagged applications in single file (optional)
flagged.to_csv(r"{}\Flagged Apps in Batches\AI Decription Checks (2024 approved apps) - reprocessed v2.csv".format(save_path),
               index = False)
# show distribution of flagged applications per planning authority
flagged['lpa_name'].value_counts()

In [None]:
# !jupyter nbconvert --to html application_units_generator.ipynb