In [2]:
# Import package for reading csv files
import pandas as pd

# Import package numpy for numeric computing
import numpy as np

# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt

#For showing plots directly in the notebook run the command below
%matplotlib inline

In [3]:
def build_dqr_table(df, columns, csv_filename):
    """
    Build and save a Data Quality Report (DQR) table.

    The table has one row per feature. It combines the output of
    ``describe()`` (transposed) with the percentage of missing values
    and the number of distinct non-null values of each feature.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data to summarise.
    columns : Index
        Features to include. May be empty (for example the result of
        ``select_dtypes`` when no column matches); the table then has
        no rows and only the '%missing' and 'cardinality' columns.
    csv_filename : str
        Output filename. The table is written with the index labelled
        'Feature'.

    Returns:
    --------
    pandas.DataFrame
        Complete DQR table
    """
    selected = df[columns]

    # Share of null entries per feature, expressed as a percentage of all rows.
    perc_missing = (100 * (selected.isnull().sum() / len(df))).to_frame('%missing')
    # Number of distinct non-null values per feature.
    cardinality = selected.nunique().to_frame('cardinality')

    parts = [perc_missing, cardinality]
    # describe() raises ValueError for a frame without columns, so the
    # descriptive statistics are only added when something was selected.
    if selected.shape[1] > 0:
        parts.insert(0, selected.describe().T)

    dqr_table = pd.concat(parts, axis=1)

    dqr_table.to_csv(csv_filename, index_label='Feature')
    return dqr_table


# Confirmation message printed when this cell finishes executing
print("✓ Helper function loaded successfully")

✓ Helper function loaded successfully


In [4]:
# Location of the training data, relative to the notebook's working directory
DATA_PATH = '../data/ppr-group-25200353-train.csv'

try:
    # Reading from a csv file, into a data frame
    df = pd.read_csv(DATA_PATH,
                     keep_default_na=True,
                     delimiter=',',
                     skipinitialspace=True)
except FileNotFoundError:
    print(f"ERROR: The file '{DATA_PATH}' was not found.")
    print("Please ensure the file exists at this path, relative to the current working directory.")
    # Re-raise so execution stops here; otherwise `df` stays undefined and
    # every later cell fails with an unrelated NameError.
    raise
except Exception as e:
    print(f"ERROR: An unexpected error occurred: {e}")
    # Same reasoning as above: report the problem, then stop the run.
    raise
else:
    # Only reached when the file was read successfully
    print(f"✓ Successfully loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns")

    display(df.head(3))

✓ Successfully loaded dataset with 54000 rows and 9 columns


Unnamed: 0,Date of Sale (dd/mm/yyyy),Address,County,Eircode,Price (€),Not Full Market Price,VAT Exclusive,Description of Property,Property Size Description
0,29/02/2016,"APT 14, RUSSELL COURT, FELTRIM RD",Dublin,,"€118,487.00",No,No,Second-Hand Dwelling house /Apartment,
1,08/07/2016,"19 PARK VILLAS, DEMESNE RD, DUNDALK",Louth,,"€222,000.00",No,No,Second-Hand Dwelling house /Apartment,
2,15/04/2016,"No 2 The Grove, Cahereen Heights, Castleisland",Kerry,,"€123,348.00",No,Yes,New Dwelling house /Apartment,greater than 125 sq metres


In [5]:
# QUESTION: Does the loaded dataset have the expected structure?
# Sanity checks on the loaded dataframe: rows present, duplicates, column labels

print("Dataset Validation Checks:")
print("="*60)

# Check 1: the dataframe must not be empty
print(
    "⚠ WARNING: Dataset is empty!"
    if df.empty
    else f"✓ Dataset contains {df.shape[0]:,} rows"
)

# Check 2: count rows flagged by duplicated()
duplicate_count = df.duplicated().sum()
print(
    f"⚠ WARNING: Found {duplicate_count} duplicate rows"
    if duplicate_count > 0
    else "✓ No duplicate rows found"
)

# Check 3: collect every column whose label contains 'Unnamed'
unnamed_cols = list(filter(lambda col: 'Unnamed' in str(col), df.columns))
print(
    f"⚠ WARNING: Found {len(unnamed_cols)} unnamed columns: {unnamed_cols}"
    if unnamed_cols
    else "✓ All columns have names"
)

# Check 4: list the column count and the column labels
print(
    f"\n✓ Dataset has {df.shape[1]} columns:",
    f"  - Column names: {list(df.columns)}",
    sep="\n",
)

Dataset Validation Checks:
✓ Dataset contains 54,000 rows
✓ All columns have names

✓ Dataset has 9 columns:
  - Column names: ['Date of Sale (dd/mm/yyyy)', 'Address', 'County', 'Eircode', 'Price (€)', 'Not Full Market Price', 'VAT Exclusive', 'Description of Property', 'Property Size Description']


In [6]:
# QUESTION: What are the dimensions of our dataset? How many rows and columns?
# Unpack the shape tuple once and report both dimensions
n_rows, n_cols = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of rows (instances): {n_rows}")
print(f"Number of columns (features): {n_cols}")

Dataset shape: (54000, 9)
Number of rows (instances): 54000
Number of columns (features): 9


In [7]:
# QUESTION: What are the data types of each feature? Are they assigned correctly?
# Show the data types in each column as assigned by default by pandas when reading the csv file.
# int64 and float64 are numeric columns.
# Text columns are reported as 'object' (or as 'str', as in the output below).
# Columns mixing numbers and characters are read as text.
# Some of the text columns may be more appropriate as continuous or as categorical types.
# We first explore the data, then decide which columns should be treated as 'continuous' and which 
# should be treated as 'categorical'.
# In the output below every column is read as text, including 'Price (€)', whose values are
# formatted amounts such as "€118,487.00".
df.dtypes

Date of Sale (dd/mm/yyyy)    str
Address                      str
County                       str
Eircode                      str
Price (€)                    str
Not Full Market Price        str
VAT Exclusive                str
Description of Property      str
Property Size Description    str
dtype: object

In [8]:
# QUESTION: What are the summary statistics (mean, std, min, max, quartiles) for numeric features?
# Descriptive stats, one row per feature (the describe() output is transposed).
# No column has been converted to a numeric type yet, so the output below shows
# count/unique/top/freq for the text columns instead of mean/std/quartiles.
# Often we need to change the feature data type so it more appropriately reflects whether this is
# a continuous or a categorical feature.
pd.options.display.precision = 2
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
Date of Sale (dd/mm/yyyy),54000,2497,18/12/2020,103
Address,54000,53293,"Broomfield, Midleton",4
County,54000,26,Dublin,16577
Eircode,16897,16840,D12TD52,3
Price (€),54000,6788,"€250,000.00",594
Not Full Market Price,54000,2,No,51150
VAT Exclusive,54000,2,No,44530
Description of Property,54000,3,Second-Hand Dwelling house /Apartment,44366
Property Size Description,2799,4,greater than or equal to 38 sq metres and less...,2088


In [9]:
# QUESTION: What is the complete information about the dataframe including non-null counts and memory usage?
# info() lists every column with its non-null count and dtype, followed by the memory usage
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 54000 entries, 0 to 53999
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Date of Sale (dd/mm/yyyy)  54000 non-null  str  
 1   Address                    54000 non-null  str  
 2   County                     54000 non-null  str  
 3   Eircode                    16897 non-null  str  
 4   Price (€)                  54000 non-null  str  
 5   Not Full Market Price      54000 non-null  str  
 6   VAT Exclusive              54000 non-null  str  
 7   Description of Property    54000 non-null  str  
 8   Property Size Description  2799 non-null   str  
dtypes: str(9)
memory usage: 3.7 MB


In [10]:
# QUESTION: Which features are numeric? Extract only numeric columns for separate analysis.
# Keep the labels of the columns stored as int64 or float64.
numeric_dtypes = ['int64', 'float64']
numeric_columns = df.select_dtypes(include=numeric_dtypes).columns
numeric_columns

Index([], dtype='str')

In [11]:
# QUESTION: How many unique values (cardinality) does each numeric feature have?

for column in numeric_columns:
    # Compute the distinct values once and reuse them for both lines of output
    distinct_values = df[column].unique()
    print(f"\nFeature: {column}")
    print(f"Number of distinct categories: {len(distinct_values)}")
    print(f"Unique values: {distinct_values}")
    

### Correcting Feature Data Types

Some features were incorrectly classified. Specifically:
- **Price (€)**: Should be numerical (remove the price formatting)
- **County**: Low cardinality (categorical)
- **Not Full Market Price/VAT Exclusive**: Yes/No options should be categorical
- **Description of Property/Property Size Description**: Low cardinality, suggesting they could be categorical

| Feature | Type Correction |
|---------|-----------------|
| Date of Sale (dd/mm/yyyy) | object |
|Address                    | object |
|County                     | object -> categorical |
|Eircode                    | object |
|Price (€)                  | object -> numeric |
|Not Full Market Price      | object -> categorical |
|VAT Exclusive              | object -> categorical |
|Description of Property    | object -> categorical |
|Property Size Description  | object -> categorical |

In [12]:
# Display the dataframe; the rendering below is truncated by pandas (note the '...' row)
df

Unnamed: 0,Date of Sale (dd/mm/yyyy),Address,County,Eircode,Price (€),Not Full Market Price,VAT Exclusive,Description of Property,Property Size Description
0,29/02/2016,"APT 14, RUSSELL COURT, FELTRIM RD",Dublin,,"€118,487.00",No,No,Second-Hand Dwelling house /Apartment,
1,08/07/2016,"19 PARK VILLAS, DEMESNE RD, DUNDALK",Louth,,"€222,000.00",No,No,Second-Hand Dwelling house /Apartment,
2,15/04/2016,"No 2 The Grove, Cahereen Heights, Castleisland",Kerry,,"€123,348.00",No,Yes,New Dwelling house /Apartment,greater than 125 sq metres
3,21/12/2016,"APT 4 BLOCK 6, WOODFORD, WHEATON HALL",Louth,,"€115,000.00",No,No,Second-Hand Dwelling house /Apartment,
4,11/10/2016,"18 GERALDINE ST, DUBLIN 7, DUBLIN",Dublin,,"€225,500.00",No,No,Second-Hand Dwelling house /Apartment,
...,...,...,...,...,...,...,...,...,...
53995,05/01/2024,"BROCKA, BALLINDERRY, NENAGH",Tipperary,E45H500,"€497,000.00",No,No,Second-Hand Dwelling house /Apartment,
53996,05/07/2024,"6 BEECHLAWN, SOUTH HILL AVE, BOOTERSTOWN",Dublin,A94P231,"€530,388.00",No,No,Second-Hand Dwelling house /Apartment,
53997,19/07/2024,"2 MAIN ST, WHITEGATE, CORK",Cork,P25E5X4,"€200,000.00",No,No,Second-Hand Dwelling house /Apartment,
53998,22/01/2024,"164 CLUAIN LARACH, KNOCKENDUFF, TRAMORE",Waterford,X91YN9W,"€360,000.00",No,No,Second-Hand Dwelling house /Apartment,
