In [3]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# =============================================================================
# Load and Inspect Raw Data from pp-complete.csv
# =============================================================================

# Column labels applied to the file, in field order (read_csv below is told
# that the file has no header row of its own)
colnames = [
    'Transaction_unique_identifier',
    'price',
    'Date_of_Transfer',
    'postcode',
    'Property_Type',
    'Old/New',
    'Duration',
    'PAON',
    'SAON',
    'Street',
    'Locality',
    'Town/City',
    'District',
    'County',
    'PPDCategory_Type',
    'Record_Status - monthly_file_only',
]

# Location of the complete dataset file under the Kaggle input directory
data_file = '/kaggle/input/price-paid-data-202304/pp-complete.csv'

# Load the CSV with pandas, parsing the transfer date column as datetimes
# (the deprecated 'infer_datetime_format' option is intentionally not passed)
df_raw = pd.read_csv(data_file, names=colnames, header=None,
                     dayfirst=False, parse_dates=["Date_of_Transfer"])

# Keep the raw frame untouched; later processing works on an independent copy
df = df_raw.copy(deep=True)

# -----------------------------------------------------------------------------
# Function to inspect the dataset
# -----------------------------------------------------------------------------
def inspect_data(data: pd.DataFrame) -> pd.DataFrame:
    """
    Build a per-column summary of a DataFrame.

    The result has one row per column of ``data`` and these columns:
    - 'dType': data type of the column
    - 'unique_amount': number of distinct values (a missing value counts
      as one distinct value, as returned by ``Series.unique``)
    - 'uniques': array of the distinct values
    - 'qtd_null': number of missing values
    - 'pct_null': percentage of missing values

    Parameters:
        data (pd.DataFrame): The DataFrame to inspect.

    Returns:
        pd.DataFrame: The summary, indexed by the column labels of ``data``.
    """
    # Compute the distinct values once per column and reuse them for the count.
    # They are gathered in a plain list instead of DataFrame.apply, because
    # apply expands equal-length array results into a DataFrame rather than
    # returning one array per column, which breaks the summary whenever all
    # columns happen to have the same number of distinct values.
    distinct_values = [column.unique() for _, column in data.items()]
    uniques = pd.Series(distinct_values, index=data.columns, dtype=object)
    unique_amount = pd.Series(
        [len(values) for values in distinct_values],
        index=data.columns,
        dtype="int64",
    )

    # Single vectorised pass for the missing-value counts; the percentage is
    # derived from it instead of scanning every column a second time.
    qtd_null = data.isnull().sum()

    return pd.DataFrame({
        "dType": data.dtypes,
        "unique_amount": unique_amount,
        "uniques": uniques,
        "qtd_null": qtd_null,
        "pct_null": qtd_null * 100 / len(data),
    })

# Summarise every column of the working frame, then print the summary table
inspection_summary = inspect_data(data=df)
print(inspection_summary)
