## Phase 1: Code for Data Loading, Cleaning & Feature Engineering
### Part 1: Setup and Initial Loading

In [8]:
# ---------------------------------------------------------------------------
# 1. SETUP & INITIAL DATA LOADING
# ---------------------------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: apply seaborn's "whitegrid" theme first,
# then set the default matplotlib figure size on top of it.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (14, 8)})

# Read the property listings CSV into the working DataFrame `df`.
df = pd.read_csv('perth_property_data.csv')

# --- Initial Inspection ---

# Column names, non-null counts and dtypes at a glance.
print()
print("DataFrame Info:")
df.info()


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42958 entries, 0 to 42957
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Listing_ID                 42958 non-null  int64  
 1   Price                      42958 non-null  int64  
 2   Agency_Name                42958 non-null  object 
 3   Postcode                   42958 non-null  int64  
 4   Address                    42958 non-null  object 
 5   Suburb                     42958 non-null  object 
 6   Longitude                  42958 non-null  float64
 7   Latitude                   42958 non-null  float64
 8   Property_Type              42958 non-null  object 
 9   Bedrooms                   42958 non-null  int64  
 10  Bathrooms                  42958 non-null  int64  
 11  Parking_Spaces             42958 non-null  int64  
 12  Date_Sold                  42958 non-null  object 
 13  Land_Size                  42

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, printed as plain text under a heading.
print("\nDescriptive Statistics:", df.describe(), sep="\n")


Descriptive Statistics:
         Listing_ID         Price      Postcode     Longitude      Latitude  \
count  4.295800e+04  4.295800e+04  42958.000000  42958.000000  42958.000000   
mean   1.391644e+08  8.610860e+05   6083.335840    115.867264    -31.956828   
std    6.312142e+06  5.956430e+05     54.925532      0.077023      0.087968   
min    1.014242e+08  1.000000e+00   6003.000000    115.732150    -32.133780   
25%    1.375409e+08  5.100000e+05   6026.000000    115.804930    -32.034580   
50%    1.409794e+08  6.875000e+05   6064.000000    115.854050    -31.948990   
75%    1.432736e+08  9.800000e+05   6149.000000    115.923790    -31.886392   
max    1.457244e+08  6.350000e+06   6166.000000    116.064601    -31.774076   

           Bedrooms     Bathrooms  Parking_Spaces     Land_Size  \
count  42958.000000  42958.000000    42958.000000  4.295800e+04   
mean       3.366614      1.731831        2.039643  4.179946e+03   
std        0.878614      0.643763        1.129544  7.392181e+0

In [9]:
# --- Load and Apply Manual Corrections ---
# corrections.csv is read for three columns: 'Listing_ID', 'Column_To_Correct'
# and 'New_Value'. A rule whose New_Value is 'DELETE_ROW' removes that listing;
# every other rule overwrites the named column of that listing with New_Value,
# which must be numeric (pd.to_numeric raises ValueError otherwise).
df_raw = pd.read_csv('perth_property_data.csv')

# Work on a copy so df_raw stays untouched and this cell can be re-run safely.
df = df_raw.copy()

try:
    # Only this read can raise FileNotFoundError, so the try is kept narrow.
    df_corrections = pd.read_csv('corrections.csv')
except FileNotFoundError:
    print("Warning: corrections.csv not found. Proceeding with raw data.")
else:
    print(f"Found and loaded {len(df_corrections)} rules from corrections.csv.")

    is_deletion = df_corrections['New_Value'] == 'DELETE_ROW'

    # Step 1: Handle Deletions
    # Collect every Listing_ID marked for deletion and filter those rows out.
    ids_to_delete = df_corrections.loc[is_deletion, 'Listing_ID'].astype(int).tolist()

    if ids_to_delete:
        initial_rows = len(df)
        # .isin() flags the rows to drop; ~ inverts the mask to keep the rest.
        df = df[~df['Listing_ID'].isin(ids_to_delete)]
        print(f"Deleted {initial_rows - len(df)} rows based on DELETE_ROW rules.")

    # Step 2: Handle Value Updates
    # Everything that is not a deletion is a value overwrite.
    corrections_to_apply = df_corrections.loc[~is_deletion].copy()
    # Ensure data types are correct for matching
    corrections_to_apply['Listing_ID'] = corrections_to_apply['Listing_ID'].astype(int)
    corrections_to_apply['New_Value'] = pd.to_numeric(corrections_to_apply['New_Value'])

    rule_columns = ['Listing_ID', 'Column_To_Correct', 'New_Value']
    for listing_id, column, new_value in corrections_to_apply[rule_columns].itertuples(index=False, name=None):
        # Assigning through .loc to an unknown column label would silently
        # create a brand-new column, so a mistyped rule is skipped instead.
        if column not in df.columns:
            print(f"Warning: Skipped Listing_ID {listing_id}: column '{column}' does not exist.")
            continue

        matches = df['Listing_ID'] == listing_id
        # Do not report success for a rule that touches no row (unknown ID, or
        # a listing already removed by a DELETE_ROW rule above).
        if not matches.any():
            print(f"Warning: Skipped Listing_ID {listing_id}: no matching row found.")
            continue

        # Set the new value on exactly the matching row(s) and column.
        df.loc[matches, column] = new_value
        print(f"Updated Listing_ID {listing_id}: Set column '{column}' to {new_value}.")

# --- Manual Correction Complete ---
# All subsequent cleaning and analysis will be performed on the 'df' DataFrame.
print(f"\nData ready for automated cleaning. Current rows: {len(df)}.")

Found and loaded 8 rules from corrections.csv.
Deleted 4 rows based on DELETE_ROW rules.
Updated Listing_ID 138928707: Set column 'Price' to 730000.
Updated Listing_ID 131703414: Set column 'Price' to 750000.
Updated Listing_ID 142122104: Set column 'Price' to 530000.
Updated Listing_ID 142973724: Set column 'Land_Size' to 404.

Data ready for automated cleaning. Current rows: 42954.


In [10]:
# ---------------------------------------------------------------------------
# 2. AUTOMATED CLEANING & FEATURE ENGINEERING
# ---------------------------------------------------------------------------

# --- 2.1 Data Type Conversion ---
# Parse the text 'Date_Sold' column into datetimes. errors='coerce' turns any
# value that cannot be parsed into NaT instead of raising.
# NOTE(review): no format= / dayfirst= is passed, so pandas infers the date
# layout — confirm the inferred dates are correct for this dataset.
df['Date_Sold'] = pd.to_datetime(df['Date_Sold'], errors='coerce')

# Report when the column holds missing dates after the conversion.
if df['Date_Sold'].isna().any():
    print("Warning: Some dates could not be parsed and have been set to NaT.")

print("Successfully converted 'Date_Sold' to datetime objects.")

Successfully converted 'Date_Sold' to datetime objects.


  df['Date_Sold'] = pd.to_datetime(df['Date_Sold'], errors='coerce')


In [None]:
# ---------------------------------------------------------------------------
# 2. TECHNICAL CLEANING & FEATURE ENGINEERING
#
# Decision: Based on data validation, the extreme values for both 'Price' and
# 'Land_Size' are considered legitimate and will be retained to ensure a
# complete market representation. Automated outlier removal will be skipped.
# ---------------------------------------------------------------------------

# --- 2.1 Data Type Conversion ---
# Convert the 'Date_Sold' column from object (text) to datetime. With
# errors='coerce', values that cannot be parsed become NaT instead of raising.

print("Starting technical cleaning...")
df['Date_Sold'] = pd.to_datetime(df['Date_Sold'], errors='coerce')

# Count missing dates once, then drop those rows by reassignment rather than
# inplace=True so the frame is never mutated behind another reference.
num_errors = int(df['Date_Sold'].isna().sum())
if num_errors:
    print(f"Warning: Found {num_errors} date(s) that could not be parsed. These rows will be dropped.")
    df = df.dropna(subset=['Date_Sold'])

print("Successfully converted 'Date_Sold' to datetime objects.")


# --- 2.2 Feature Engineering ---
# Derive calendar features from 'Date_Sold'; existing columns are not altered.

df['Sale_Year'] = df['Date_Sold'].dt.year
df['Sale_Month'] = df['Date_Sold'].dt.month
df['Sale_DayOfWeek'] = df['Date_Sold'].dt.dayofweek  # Note: Monday=0, Sunday=6

print("Engineered new features: 'Sale_Year', 'Sale_Month', 'Sale_DayOfWeek'.")


# --- 2.3 Final Verification ---
# Snapshot the result as an independent copy named 'df_cleaned', so later
# exploratory cells work on their own object rather than on 'df'.
df_cleaned = df.copy()

print("\n--- Final DataFrame Info ---")
df_cleaned.info()

print("\n--- Final Descriptive Statistics ---")
print(df_cleaned.describe())

print("\nTechnical cleaning and feature engineering complete. The dataset is ready for EDA.")

Starting technical cleaning...
Successfully converted 'Date_Sold' to datetime objects.
Engineered new features: 'Sale_Year', 'Sale_Month', 'Sale_DayOfWeek'.

--- Final DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
Index: 42954 entries, 0 to 42954
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Listing_ID                 42954 non-null  int64         
 1   Price                      42954 non-null  int64         
 2   Agency_Name                42954 non-null  object        
 3   Postcode                   42954 non-null  int64         
 4   Address                    42954 non-null  object        
 5   Suburb                     42954 non-null  object        
 6   Longitude                  42954 non-null  float64       
 7   Latitude                   42954 non-null  float64       
 8   Property_Type              42954 non-null  object        
 9   Bedrooms   