In [53]:
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Function to preprocess data
def preprocess_data(df):
    """Reduce the listings frame to the modelling columns and encode it.

    Keeps the five columns the model uses, renumbers the rows from zero,
    rescales ``Price (INR)`` from rupees to lakhs (divides by 100000) and
    one-hot encodes ``City/Locality`` and ``Furnishing``.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings containing at least the five columns selected below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the numeric columns come first, followed by one
        indicator column per locality and per furnishing value.
    """
    model_columns = ['City/Locality', 'BHK', 'Property Size (sqft)', 'Furnishing', 'Price (INR)']
    categorical_columns = ['City/Locality', 'Furnishing']
    rupees_per_lakh = 100000

    # reset_index returns a fresh frame, so the assignment below never
    # touches the caller's data.
    subset = df[model_columns].reset_index(drop=True)
    subset['Price (INR)'] = subset['Price (INR)'] / rupees_per_lakh

    return pd.get_dummies(subset, columns=categorical_columns)

# Function to build and train the XGBoost model
def build_and_train_model(df):
    """Train an XGBoost regressor on the encoded listings and persist it.

    The frame is split 80/20 into train and test sets (``random_state=42``),
    an ``XGBRegressor(random_state=42)`` is fitted on the training part, and
    the R-squared score on the held-out part is printed.

    Side effects
    ------------
    Writes two files into the current working directory:

    * ``xgboost_model.pkl`` - the fitted model.
    * ``feature_columns.pkl`` - the training column index. ``predict_price``
      needs these columns to build its input row, so the model pickle alone
      is not enough to predict in a fresh session.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded listings containing the target column ``Price (INR)``; every
        other column is used as a feature.

    Returns
    -------
    tuple
        ``(model, feature_columns)`` where ``feature_columns`` is the column
        index of the feature frame, in training order.
    """
    # Separate features and target variable
    X = df.drop(columns=['Price (INR)'])
    y = df['Price (INR)']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Build and train the XGBoost model
    model = XGBRegressor(random_state=42)
    model.fit(X_train, y_train)

    # Score the model on the held-out rows only
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(f"R-squared score of the XGBoost model: {r2:.2f}")

    # Persist the model together with its column layout so the pair can be
    # reloaded and used for prediction without retraining.
    joblib.dump(model, 'xgboost_model.pkl')
    joblib.dump(X.columns, 'feature_columns.pkl')

    return model, X.columns

# Function to predict price using the trained model
def predict_price(location, bhk, furnishing, size, model, feature_columns):
    """Predict the price of one listing with a trained model.

    Builds a single-row frame whose columns are exactly ``feature_columns``,
    in that order. ``BHK`` and ``Property Size (sqft)`` receive the given
    values, the indicator columns ``City/Locality_<location>`` and
    ``Furnishing_<furnishing>`` are set to 1, and every other column is 0.

    Parameters
    ----------
    location : str
        Locality name; matched exactly against ``City/Locality_<location>``.
    bhk : int or float
        Value for the ``BHK`` column.
    furnishing : str
        Furnishing label; matched exactly against ``Furnishing_<furnishing>``.
    size : int or float
        Value for the ``Property Size (sqft)`` column.
    model : object
        Anything exposing ``predict(frame)`` that returns an indexable result.
    feature_columns : iterable of str
        Column names the model was trained on, in training order.

    Returns
    -------
    The first element of ``model.predict(...)``, in the unit of the model's
    training target.

    Warns
    -----
    UserWarning
        If ``location`` or ``furnishing`` has no matching column in
        ``feature_columns``. The corresponding one-hot group is then all
        zeros and a prediction is still returned.
    """
    location_column = f'City/Locality_{location}'
    furnishing_column = f'Furnishing_{furnishing}'

    # An unseen category would otherwise be encoded as all zeros without any
    # hint to the caller that the prediction ignores it.
    known_columns = set(feature_columns)
    for label, value, column in (
        ('location', location, location_column),
        ('furnishing', furnishing, furnishing_column),
    ):
        if column not in known_columns:
            warnings.warn(
                f"Unknown {label} {value!r}: no '{column}' column in feature_columns; "
                "its one-hot group is encoded as all zeros.",
                UserWarning,
                stacklevel=2,
            )

    # Exact-name lookup: a column is switched on only when its full name
    # matches, never because a prefix happens to appear inside it.
    assigned = {
        'BHK': bhk,
        'Property Size (sqft)': size,
        location_column: 1,
        furnishing_column: 1,
    }
    input_df = pd.DataFrame(
        {column: assigned.get(column, 0) for column in feature_columns},
        index=[0],
    )

    # Predict the price
    predicted_price = model.predict(input_df)[0]
    return predicted_price

# Load the cleaned listings produced by the data-cleaning step.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs where the file exists at exactly this location.
flats_df = pd.read_csv('/Users/guliaharsh021/Downloads/DA Documents /Projects/Project 1/Data Prepration, Processing and Analysis/Data Exploration and Cleaning/Cleaned Data/flats_data_cleaned.csv')

# Keep only listings with four or more bedrooms.
flats_df = flats_df[flats_df['BHK'].ge(4)]

# Remove outliers with the 1.5 * IQR rule, first on size and then on price.
# The passes run in sequence, so the price quartiles are computed on the
# rows that survived the size filter.
for outlier_column in ('Property Size (sqft)', 'Price (INR)'):
    q1 = flats_df[outlier_column].quantile(0.25)
    q3 = flats_df[outlier_column].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # between() is inclusive on both ends, i.e. lower <= value <= upper.
    flats_df = flats_df[flats_df[outlier_column].between(lower_bound, upper_bound)]

# Encode the filtered listings, then fit and persist the model.
processed_df = preprocess_data(flats_df)
model, feature_columns = build_and_train_model(processed_df)

R-squared score of the XGBoost model: 0.54


In [59]:
# Example: price one hypothetical listing with the trained model.
# Edit the values on the next line to query a different property.
# NOTE(review): if `model` comes from the cell that keeps only BHK >= 4,
# bhk = 3 is outside the training range and the result is an extrapolation.
location, bhk, furnishing, size = 'Dwarka, South West Delhi', 3, 'Unfurnished', 1600
predicted_price = predict_price(
    location=location,
    bhk=bhk,
    furnishing=furnishing,
    size=size,
    model=model,
    feature_columns=feature_columns,
)
print(f"Predicted Price for a {bhk} BHK, {furnishing} furnished property of size {size} sqft in {location}: {predicted_price:.2f} lakhs")

Predicted Price for a 3 BHK, Unfurnished furnished property of size 1600 sqft in Dwarka, South West Delhi: 222.31 lakhs


In [55]:
# Example usage of the prediction function
location = 'New Friends Colony, South East Delhi'  # Change this to the desired location
bhk = 4
furnishing = 'Semi-Furnished'
size = 2500
predicted_price = predict_price(location, bhk, furnishing, size, model, feature_columns)
print(f"Predicted Price for a {bhk} BHK, {furnishing} furnished property of size {size} sqft in {location}: {predicted_price:.2f} lakhs")

Predicted Price for a 4 BHK, Semi-Furnished furnished property of size 2500 sqft in New Friends Colony, South East Delhi: 412.88 lakhs


In [8]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Function to preprocess data
def preprocess_data(df):
    """Select the modelling columns, rescale the price and one-hot encode.

    Steps, in order:

    1. Keep ``City/Locality``, ``BHK``, ``Property Size (sqft)``,
       ``Furnishing`` and ``Price (INR)``; renumber the rows from zero.
    2. Divide ``Price (INR)`` by 100000 so it is expressed in lakhs.
    3. Replace ``City/Locality`` and ``Furnishing`` with indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings containing at least the five columns named above.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame; the input frame is left unchanged.
    """
    wanted = ['City/Locality', 'BHK', 'Property Size (sqft)', 'Furnishing', 'Price (INR)']
    listings = df.loc[:, wanted].reset_index(drop=True)

    # assign() replaces the column in place position-wise, so the column
    # order going into get_dummies is unchanged.
    price_in_lakhs = listings['Price (INR)'].div(100000)
    listings = listings.assign(**{'Price (INR)': price_in_lakhs})

    encoded = pd.get_dummies(listings, columns=['City/Locality', 'Furnishing'])
    return encoded

# Function to build and train the XGBoost model
def build_and_train_model(df):
    """Fit an XGBoost regressor on the encoded listings and save it to disk.

    Uses ``Price (INR)`` as the target and every other column as a feature.
    The rows are split 80/20 (``random_state=2``), an
    ``XGBRegressor(random_state=0)`` is fitted on the training part, and the
    R-squared score on the held-out part is printed.

    Side effects
    ------------
    Writes ``xgboost_model.pkl`` (the fitted model) and
    ``feature_columns.pkl`` (the feature column index) into the current
    working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded listings containing the ``Price (INR)`` column.

    Returns
    -------
    tuple
        ``(model, feature_columns)``.
    """
    target_name = 'Price (INR)'
    features = df.drop(columns=[target_name])
    target = df[target_name]

    # Hold out a fifth of the rows for scoring.
    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size=0.2, random_state=2
    )

    model = XGBRegressor(random_state=0)
    model.fit(train_features, train_target)

    # Report goodness of fit on the rows the model has not seen.
    r2 = r2_score(test_target, model.predict(test_features))
    print(f"R-squared score of the XGBoost model: {r2:.2f}")

    # Persist the model and the column layout it expects.
    feature_columns = features.columns
    joblib.dump(model, 'xgboost_model.pkl')
    joblib.dump(feature_columns, 'feature_columns.pkl')

    return model, feature_columns

# Load the cleaned listings produced by the data-cleaning step.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs where the file exists at exactly this location.
flats_df = pd.read_csv('/Users/guliaharsh021/Downloads/DA Documents /Projects/Project-1/Data Prepration, Processing and Analysis/Data Exploration and Cleaning/Cleaned Data/flats_data_cleaned.csv')

# Keep only listings with five or more bedrooms.
flats_df = flats_df[flats_df['BHK'].ge(5)]

# Remove outliers with the 1.5 * IQR rule, first on size and then on price.
# The passes run in sequence, so the price quartiles are computed on the
# rows that survived the size filter.
for outlier_column in ('Property Size (sqft)', 'Price (INR)'):
    q1 = flats_df[outlier_column].quantile(0.25)
    q3 = flats_df[outlier_column].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # between() is inclusive on both ends, i.e. lower <= value <= upper.
    flats_df = flats_df[flats_df[outlier_column].between(lower_bound, upper_bound)]

# Encode the filtered listings, then fit and persist the model.
processed_df = preprocess_data(flats_df)
model, feature_columns = build_and_train_model(processed_df)

R-squared score of the XGBoost model: 0.74
