# 
<h1 style="font-size:300%; color:tomato; font-family:cursive;">Nepali House Price Prediction</h1>

The goal of this project is to learn about data pre-processing, feature engineering, and model building. I will be using the dataset from [Kaggle](https://www.kaggle.com/datasets/sagyamthapa/nepali-housing-price-dataset/data) which i felt is one of the challenging dataset which truly represents real-world noise. This notebook won’t explain the steps I take, as I have prepared a separate notebook for that.









# 1. Data Exploration

In [189]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ast

In [190]:
data = pd.read_csv('2020-4-27.csv') # read data from csv file

In [None]:
data.head(2)

In [None]:
print(f" Columns in dataset: \n{data.columns}. \n Number of columns: {len(data.columns)}")


In [None]:
data.info()

In [None]:
data.shape # shape of the data

In [None]:
data.index # index of the data

In [None]:
data.isna().sum() # check for missing values

In [None]:
data['Year'].corr(data['Price']) #As there was more missing values is "year " so checking how important is this column as our goal is yto predict the price

In [None]:
data.sort_values("Year", ascending=True).head(3)

In [None]:
data[data['Year'].isna() | data['Floors'].isna()][['Year','Floors']] # checking missing values

In [None]:
data['Year'].mean()

# 2. Data Cleaning and Transformations

In [201]:
year_constant = data['Year'].mean().round(0)


In [202]:
floor_constant = data['Floors'].mean().round(0) 

## 2.1. Handling Missing Values

In [203]:
#filling missing value of year wiht mean value

data['Year'] = data['Year'].fillna(year_constant)

In [204]:
# filling missing vlue of floor with mean value
data['Floors'] = data['Floors'].fillna(floor_constant)

In [205]:
# handling road type column by dropping it
data.drop('Road Type', axis=1, inplace=True) # droping column

In [None]:
data.isna().sum()

## 2.2 Dropping Unnecessary Columns

In [None]:
data.head(2)

In [208]:
temp = data.pop('Title') # poping out the title column which might be needed for resolving otyher missing values

In [209]:
columns_to_drop = ['Address', 'Views', 'Posted','Road']

In [210]:
data.drop(columns_to_drop, axis=1, inplace=True)

In [None]:
data.head()

In [212]:
temp = set()

for amenities in data['Amenities']:
    # Remove the brackets and split by comma
    amenities_list = amenities.strip('[]').replace("'", "").split(',')
    # Strip whitespace and add each cleaned item to the set
    temp.update(item.strip() for item in amenities_list)

temp = list(temp)
temp = temp[1:]

## 2.3 Handling Ameneities columnn

In [213]:
data['Amenities'] = data['Amenities'].apply(ast.literal_eval)

# Create dummy variables for each unique amenity
amenities_dummies = pd.get_dummies(data['Amenities'].apply(pd.Series).stack()).groupby(level=0).sum()

# Concatenate the original dataframe with the dummy variables
data = pd.concat([data, amenities_dummies], axis=1)

data = data.drop('Amenities', axis=1)

## 2.4 Convert Road Width column to Meter

In [214]:
def convert_to_meters(value):
    value = value.strip().lower()
    if 'feet' in value:
        # Extract numeric part and convert to meters
        feet = float(value.replace('feet', '').strip())
        return round(feet * 0.3048, 2)  # Convert to meters and round to 2 decimal places
    elif 'meter' in value:
        # Extract numeric part (already in meters)
        meters = float(value.replace('meter', '').strip())
        return round(meters, 2)
    else:
        # Handle cases without units (assuming it's in feet and convert to meters)
        feet = float(value)
        return round(feet * 0.3048, 2)

# Apply the function to the 'Road Width' column
data['Road Width'] = data['Road Width'].apply(convert_to_meters)

## 2.5 Handling Face column

In [215]:
face_dummies= pd.get_dummies(data['Face']).astype(int)
data = pd.concat([data, face_dummies], axis=1)
data = data.drop('Face', axis=1)



## 2.6 Handling Area and Build Area column

In [216]:

import pandas as pd
import numpy as np
import re

def convert_to_aana(value):
    """Convert various area units to Aana"""
    # Constants for conversion
    ROPANI_TO_AANA = 16  # 1 Ropani = 16 Aana
    SQFT_TO_AANA = 0.00795  # 1 Sq. Feet ≈ 0.00795 Aana
    KATTHA_TO_AANA = 13.31  # 1 Kattha ≈ 13.31 Aana
    DHUR_TO_AANA = 0.83  # 1 Dhur ≈ 0.83 Aana
    BIGHA_TO_AANA = 399.3  # 1 Bigha ≈ 399.3 Aana
    
    if pd.isna(value) or value == 'dont know Sq. Feet':
        return np.nan
        
    value = str(value).strip().lower()
    
    # Handle direct Aana measurements
    if 'aana' in value and not any(unit in value for unit in ['ropani', 'sq. feet', 'kattha', 'dhur', 'bigha']):
        
        cleaned = value.replace('aana', '').replace('ana', '').strip()
        try:
            return float(cleaned)
        except ValueError:
            # Handle complex Aana format (e.g., "0-21-0-0 Aana")
            parts = cleaned.split('-')
            if len(parts) == 4:
                try:
                    total = (float(parts[0]) * 16) + float(parts[1]) + (float(parts[2]) / 4) + (float(parts[3]) / 16)
                    return total
                except ValueError:
                    return np.nan
    
    # Handle Ropani
    if 'ropani' in value:
        cleaned = value.replace('ropani', '').strip()
        try:
            return float(cleaned) * ROPANI_TO_AANA
        except ValueError:
            parts = cleaned.split('-')
            if len(parts) == 4:
                try:
                    total = (float(parts[0]) * ROPANI_TO_AANA) + float(parts[1]) + (float(parts[2]) / 4) + (float(parts[3]) / 16)
                    return total
                except ValueError:
                    return np.nan
    
    # Handle Square Feet
    if 'sq. feet' in value:
        try:
            number = float(re.findall(r'\d+', value)[0])
            return number * SQFT_TO_AANA
        except (ValueError, IndexError):
            return np.nan
    
    # Handle Kattha
    if 'kattha' in value:
        cleaned = value.replace('kattha', '').strip()
        try:
            return float(cleaned) * KATTHA_TO_AANA
        except ValueError:
            return np.nan
    
    # Handle Dhur
    if 'dhur' in value:
        cleaned = value.replace('dhur', '').strip()
        try:
            return float(cleaned) * DHUR_TO_AANA
        except ValueError:
            return np.nan
            
    # Handle Bigha
    if 'bigha' in value:
        cleaned = value.replace('bigha', '').strip()
        try:
            return float(cleaned) * BIGHA_TO_AANA
        except ValueError:
            return np.nan
            
    # Handle Haat (convert to sq feet first, then to Aana)
    if 'haat' in value:
        try:
            dimensions = re.findall(r'\d+', value)
            if len(dimensions) == 2:
                sq_feet = float(dimensions[0]) * float(dimensions[1])
                return sq_feet * SQFT_TO_AANA
        except (ValueError, IndexError):
            return np.nan
    
    return np.nan


# Create new columns for cleaned values
data['Area'] = data['Area'].apply(convert_to_aana)
    


In [None]:
data.drop('Build Area', axis=1, inplace=True)

In [226]:
data['Area'] = data['Area'].fillna(0)

In [None]:
data = data[data['Area'] != 0]

## 2.7 Handling Price column

In [241]:
# Calculate Q1, Q3, and IQR
Q1 = data['Price'].quantile(0.25)
Q3 = data['Price'].quantile(0.75)
IQR = Q3 - Q1

# Define outlier range
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = data[(data['Price'] < lower_bound) | (data['Price'] > upper_bound)]
print("Outliers:\n", len(outliers['Price']))



Outliers:
 0


In [None]:
data['Price'] = data['Price'].clip(lower=lower_bound, upper=upper_bound)

In [240]:
data

Unnamed: 0,City,Price,Bedroom,Bathroom,Floors,Parking,Year,Area,Road Width,Air Condition,...,Water Well,Wifi,East,North,North East,North West,South,South East,South West,West
0,Kathmandu,65550000,6,3,2.0,10,2073.0,16.00,6.10,0.0,...,1.0,1.0,0,0,0,0,0,0,0,1
1,Kathmandu,65550000,5,3,2.0,9,2073.0,21.00,6.10,0.0,...,1.0,1.0,1,0,0,0,0,0,0,0
2,Kathmandu,65550000,5,3,2.0,12,2071.0,17.00,6.10,0.0,...,1.0,1.0,1,0,0,0,0,0,0,0
3,Kathmandu,65550000,6,4,3.0,9,2017.0,10.25,6.10,0.0,...,1.0,1.0,0,0,0,0,0,0,0,1
4,Kathmandu,65550000,6,3,2.0,10,2070.0,19.50,6.10,0.0,...,1.0,1.0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2206,Kathmandu,500000,5,5,3.0,5,2058.0,16.00,4.88,0.0,...,0.0,0.0,0,0,1,0,0,0,0,0
2207,Kathmandu,12000000,4,2,2.0,1,2058.0,2.50,3.96,0.0,...,0.0,0.0,0,0,1,0,0,0,0,0
2208,Kathmandu,27000000,5,3,2.0,1,2071.0,8.00,3.66,0.0,...,0.0,0.0,1,0,0,0,0,0,0,0
2209,Kathmandu,300000,9,3,2.0,4,2058.0,51.00,3.96,0.0,...,0.0,0.0,0,1,0,0,0,0,0,0
