# 1. Cleaning

Some notes about the clean dataset in **data/properties.csv**:

- There are about 76 000 properties, roughly equally spread across houses and apartments
- Each property has a unique identifier **id**
- The target variable is **price**
- Variables prefixed with **fl_** are dummy variables (1/0)
- Variables suffixed with **_sqm** indicate the measurement is in square meters
- All missing categories for the categorical variables are encoded as **MISSING**

## Exploring the dataset

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import csv

In [None]:
# Load the raw property listings into a DataFrame (relative path: ../data from the working directory)
df = pd.read_csv("../data/properties.csv")

In [None]:
# Preview the first rows of the DataFrame to sanity-check the load
df.head()

In [None]:
# Report how many observations (rows) were loaded
print(f"There are {len(df)} rows of data")

In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple
df.shape

In [None]:
# Inspect the row index of the DataFrame
df.index 

In [None]:
# List the column labels of the DataFrame
df.columns

In [None]:
# Concise summary of df: column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) with describe()'s default column selection
df.describe()

In [None]:
# Number of non-NA values in each column
df.count()

In [None]:
# Descriptive statistics for every column in df, numeric and non-numeric (categorical) alike

df.describe(include="all").T  # Transposed: each column of df becomes a row, so the table fits in a cell

In [None]:
# Count the missing (NaN or null) values in each column
# and sort the counts in descending order, so the most incomplete columns come first.
# NOTE: per the notes at the top of this notebook, missing categories are encoded as the
# string "MISSING" at this point, so they are not counted as missing here.

df.isna().sum().sort_values(ascending=False)

In [None]:
# Check whether any row is an exact duplicate of an earlier row (False means there are no duplicates)
df.duplicated().any()

In [None]:
# Print the distinct values found in every column of the DataFrame
for column, values in df.items():
    unique_values = values.unique()
    print(f"Unique values in column '{column}' is : {unique_values}")

## Cleaning the data

In [None]:
# Categorical columns inspected for the "MISSING" placeholder
missing_column = ["region", "province", "locality", "equipped_kitchen", "state_building", "epc", "heating_type"]

# Preview: replace "MISSING" with NaN in those columns only (df itself is left untouched).
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
df_missing_col = df[missing_column].replace('MISSING', np.nan)
display(df_missing_col)

# Same replacement across the whole DataFrame, kept in a separate copy for inspection
df_missing = df.replace('MISSING', np.nan)
display(df_missing)

In [None]:
# Missing values per column in df_missing, most affected columns first
df_missing.isna().sum().sort_values(ascending=False)

In [None]:
# Descriptive statistics for all columns of df_missing, transposed to fit in a cell
df_missing.describe(include="all").T

In [None]:
# Data type of each column in df_missing
df_missing.dtypes

In [None]:
# Apply the replacement to the working DataFrame: every "MISSING" value becomes NaN.
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
df = df.replace('MISSING', np.nan)
display(df.head())

### Drop columns: > 50% NaN and specific columns

In [None]:
# Share of missing values per column, expressed as a percentage
missing_percentage = df.isna().mean().mul(100)

# Labels of the columns in which more than half of the values are missing
columns_to_drop = missing_percentage.loc[missing_percentage > 50].index

# Build a copy of df without those columns.
# NOTE(review): the result is stored in df_drop only; df itself keeps every column - confirm this is intended.
df_drop = df.drop(columns=columns_to_drop)

# Columns present in only one of the two frames, i.e. the ones that were dropped
b = df_drop.columns.symmetric_difference(df.columns)
print("Uncommon Columns:", b)
display(df_drop.head())


In [None]:
# Preview of the frame without the identifier column (df is not modified by this step)
df_dropID = df.drop(columns=["id"])
display(df_dropID.head())

# Remove the identifier together with the latitude/longitude columns from the working frame
df = df.drop(columns=["id", "latitude", "longitude"])
display(df.head())

### Unique values

In [None]:
# Print the distinct values of every column that is left in the DataFrame
for column, values in df.items():
    unique_values = values.unique()
    print(f"Unique values in column '{column}' is : {unique_values}")

In [None]:
# Distinct values of a single column
unique_value = df["subproperty_type"].unique()
print(unique_value)

# Distinct values of several categorical columns at once (one array of values per column)
unique_values_multi = df[
    [
        'property_type',
        'subproperty_type',
        'region',
        'province',
        'locality',
        'equipped_kitchen',
        'state_building',
        'epc',
        'heating_type',
    ]
].apply(lambda x: x.unique())
print("\nUnique values in columns:\n", unique_values_multi)


In [None]:
# Distinct values of the selected categorical columns, printed one column at a time
columns = df[
    [
        'property_type',
        'subproperty_type',
        'region',
        'province',
        'locality',
        'equipped_kitchen',
        'state_building',
        'epc',
        'heating_type',
    ]
]
# `columns` is a DataFrame; walking its column labels gives the names to look up in df
for column in columns.columns:
    multi_columns = df[column].unique()
    print(f"Unique values in column '{column}': {multi_columns}")

### dtypes

In [None]:
# Print the dtypes (together with non-null counts)
df.info()
# Split the columns into three frames by data type: object, float64 and int64
data_object, data_float, data_int = (
    df.select_dtypes(include=dtype) for dtype in ('object', 'float64', 'int64')
)

In [None]:
# Distinct values of every object-typed column
data = df.select_dtypes(include='object')

for column, values in data.items():
    unique_values = values.unique()
    print(f"Unique values in column '{column}' is : {unique_values}")

In [None]:
# Distinct values of every float64 column
data = df.select_dtypes(include='float64')

for column, values in data.items():
    unique_values = values.unique()
    print(f"Unique values in column '{column}' is : {unique_values}")

In [None]:
# Distinct values of every int64 column
data = df.select_dtypes(include='int64')

for column, values in data.items():
    unique_values = values.unique()
    print(f"Unique values in column '{column}' is : {unique_values}")

## Remove rows with missing values

In [None]:
# Function to drop observations with missing values
def drop_missing_values(df, subset=None):
    """
    Drop observations with missing values in specific columns.

    Parameters:
    - df: pandas DataFrame, the DataFrame to be cleaned
    - subset: optional list of column names that must not be missing;
      defaults to the columns 'province', 'region' and 'locality'
    Returns:
    - pandas DataFrame, a new DataFrame without the rows that had a missing
      value in one of those columns (the input DataFrame is not modified)
    Raises:
    - KeyError: if one of the columns is not present in df
    """
    columns = ['province', 'region', 'locality'] if subset is None else list(subset)

    # Drop every observation that misses a value in at least one of the columns
    df_cleaned = df.dropna(subset=columns)

    # Sanity check over all requested columns at once
    if df_cleaned[columns].isna().any().any():
        print("Warning: Missing values still present after dropping.")
    else:
        print("Missing values were successfully dropped.")

    return df_cleaned

# Apply the cleaning step and rebind df to the returned DataFrame
df = drop_missing_values(df)
display(df.head())


## Remove outliers

**Handle outliers per column**

In [None]:
def handle_outliers(df, columns_to_remove_outliers, zscore_threshold=3):
    """
    Remove the rows that hold an outlier in any of the specified columns.

    A value counts as an outlier when its absolute Z-score (distance from the
    column mean, measured in sample standard deviations) is greater than
    zscore_threshold. The Z-scores of every column are computed on the full
    input, so the result does not depend on the order of the columns.
    Missing values are never flagged: rows holding them are kept.

    Parameters:
        df (DataFrame): Input DataFrame. It is not modified.
        columns_to_remove_outliers (list): List of column names to remove outliers.
        zscore_threshold (float): Z-score threshold for identifying outliers. Default is 3.
    Returns:
        DataFrame: Copy of df in which the listed columns are converted to a
        numeric dtype (values that cannot be converted become NaN) and the
        rows containing outliers are removed.
    """
    # Work on a copy so the caller's DataFrame stays untouched
    df_outliers = df.copy()

    # Positional flags: True for every row that holds an outlier in at least one column
    rows_to_drop = np.zeros(len(df_outliers), dtype=bool)

    for column_name in columns_to_remove_outliers:
        # Convert the column to a numeric data type; unparseable values become NaN
        df_outliers[column_name] = pd.to_numeric(df_outliers[column_name], errors='coerce')
        values = df_outliers[column_name]

        # Z-score of each value; NaN inputs give NaN scores, which compare as "not an outlier"
        z_scores = (values - values.mean()) / values.std()
        outlier_mask = np.abs(z_scores) > zscore_threshold
        rows_to_drop |= outlier_mask.fillna(False).to_numpy(dtype=bool)

        # Report the column being processed and how many of its values are missing
        print(f"Removing outliers for column: {column_name}")
        missing_values_count = values.isna().sum()
        print(f"Number of missing values in {column_name}: {missing_values_count}")

    print(f"Removed {int(rows_to_drop.sum())} rows containing outliers")

    # Apply the collected flags: keep only the rows without any outlier
    return df_outliers[~rows_to_drop]

# Select the numeric columns to screen for outliers.
# The fl_* columns are 1/0 dummies (see the notes at the top of this notebook): a Z-score is
# not meaningful for them and a rare category would be treated as an outlier, so they are skipped.
columns_to_remove_outliers = [
    column
    for column in df.select_dtypes(exclude='object').columns
    if not str(column).startswith("fl_")
]
house_filtered = handle_outliers(df, columns_to_remove_outliers)

# Continue with the returned frame so that it is the one reported below and written to csv
df = house_filtered

df.shape


## Write cleaned dataframe to csv

In [None]:
# Save the cleaned dataframe as cleaned_properties.csv (index=False leaves the row index out of the file)
df.to_csv("../data/cleaned_properties.csv", index=False)