# Interesting cleaning
_This notebook was made by Katoo_

### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import LabelEncoder

### Loading the data

In [2]:
# Read 'subset_2016.csv' from the working directory into the frame that every later cell transforms.
df = pd.read_csv(filepath_or_buffer='subset_2016.csv')

### Applying log transformations

In [3]:
# Replace Price with its natural logarithm to correct right-skewness.
# NOTE(review): np.log yields -inf for 0 and NaN for negative values — this assumes
# every Price is strictly positive; confirm against the raw data.
# Re-running this cell on an already-transformed frame applies the log a second time.
df['Price'] = df['Price'].pipe(np.log)

### Extracting year, month and day from the Date of Transfer column and dropping the original column

In [4]:
# Parse the transfer date once instead of splitting the raw text three times.
# Parsing (rather than splitting on '-') keeps any time-of-day suffix out of the
# day component and yields numeric year/month/day columns instead of strings.
# NOTE(review): assumes the values are year-first date strings, as the original
# split order (year, month, day) implied — confirm against the raw file.
transfer_date = pd.to_datetime(df['Date of Transfer'])
df['year'] = transfer_date.dt.year
df['month'] = transfer_date.dt.month
df['day'] = transfer_date.dt.day

# The components above replace the original column, so it is no longer needed.
df = df.drop(columns='Date of Transfer')

### Renaming columns 

In [5]:
# Normalise every header: lower-case it and put underscores where the spaces were.
df.columns = [header.lower().replace(' ', '_') for header in df.columns]

### Converting categorical variables

In [6]:
# Declare the allowed (unordered) levels for each categorical column, then cast
# both columns in a single astype call.
# Values that are not among the listed categories become NaN after the cast.
# NOTE(review): presumably D/S/T/F and Y/N cover every code in the raw data — verify.
property_type = CategoricalDtype(categories=['D', 'S', 'T', 'F'], ordered=False)
old_new = CategoricalDtype(categories=['Y', 'N'], ordered=False)
df = df.astype({"property_type": property_type, "old/new": old_new})


### One-hot encoding for property_type and label encoding for old/new

In [7]:
# One-hot encode 'property_type' directly as integer indicator columns.
# The prefix already ends in '_' and get_dummies inserts its default '_' separator,
# so the generated names contain a double underscore (e.g. 'property_type_is__D').
df = pd.get_dummies(
    df,
    columns=['property_type'],
    prefix='property_type_is_',
    drop_first=False,
    dtype=int,
)

# Integer-encode 'old/new'. LabelEncoder numbers its classes in sorted order,
# so the category order declared on the column does not influence the codes.
encoder = LabelEncoder()
old_new_codes = encoder.fit_transform(df['old/new'])
df['old/new'] = old_new_codes

### Dropping unnecessary columns

In [8]:

# Discard, in a single call, the four columns that are not kept in the cleaned output.
columns_to_drop = [
    'transaction_unique_identifier',
    'duration',
    'ppdcategory_type',
    'record_status_-_monthly_file_only',
]
df = df.drop(columns=columns_to_drop)

### Capping outliers

In [9]:
# Cap extreme prices: build fences 1.5 * IQR beyond the quartiles of 'price'
# and limit every value to that range.
# No rows are dropped — clip() replaces out-of-range values with the nearest fence.
Q1, Q3 = df['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower, higher = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

df['price'] = df['price'].clip(lower, higher)


### Saving the final cleaned data

In [10]:
# Write the cleaned frame to 'cleaned_subset.csv', leaving the row index out of the file.
df.to_csv(path_or_buf='cleaned_subset.csv', index=False)