In [2]:
import pandas as pd
sale=pd.read_csv('merged_sale_housing_data.csv')
sale.head()

Unnamed: 0,Title (Address),Area (sq-ft),Price (Cr)
0,2 BHK Flat for Sale in Promont By Sheth Realty...,684 sqft,₹\r\n2.07 Cr
1,"2 BHK Flat for Sale in The Dynamix Luma, Andhe...",735 sqft,₹\r\n2.50 Cr
2,"2 BHK Flat for Sale in Dream Aspire, Andheri W...",713 sqft,₹\r\n2.38 Cr
3,"2 BHK Flat for Sale in Vihang Luxuria, Mira Ro...",684 sqft,₹\r\n1.21 Cr
4,"2 BHK Flat for Sale in RNA NG Grand Empire, Na...",580 sqft,₹\r\n1.11 Cr


In [2]:
sale.shape

(4800, 3)

In [3]:
def convert_to_sqft(area):
    try:
        parts = area.split()  
        if len(parts) != 2:
            raise ValueError("Malformed input")  
        
        value, unit = parts
        value = float(value) 
        
        # Conversion logic
        if unit == 'sqm':
            return round(value * 10.7639)  # Convert sqm to sq-ft
        elif unit == 'sqyrd':
            return round(value * 9)  # Convert sqyrd to sq-ft
        elif unit == 'sqft':
            return round(value)  # Already in sq-ft
        else:
            raise ValueError(f"Unknown unit: {unit}")  
    except Exception:
        return None  

sale['Area (sq-ft)'] = sale['Area (sq-ft)'].apply(convert_to_sqft)
sale.dropna(subset=['Area (sq-ft)'], inplace=True)  


In [4]:
sale.isnull().sum()

Title (Address)    0
Area (sq-ft)       0
Price (Cr)         0
dtype: int64

In [6]:
sale.shape

(4789, 3)

In [3]:
sale.head()

Unnamed: 0,Title (Address),Area (sq-ft),Price (Cr)
0,2 BHK Flat for Sale in Promont By Sheth Realty...,684 sqft,₹\r\n2.07 Cr
1,"2 BHK Flat for Sale in The Dynamix Luma, Andhe...",735 sqft,₹\r\n2.50 Cr
2,"2 BHK Flat for Sale in Dream Aspire, Andheri W...",713 sqft,₹\r\n2.38 Cr
3,"2 BHK Flat for Sale in Vihang Luxuria, Mira Ro...",684 sqft,₹\r\n1.21 Cr
4,"2 BHK Flat for Sale in RNA NG Grand Empire, Na...",580 sqft,₹\r\n1.11 Cr


In [8]:
def convert_price(price):
    try:
        price = price.replace('₹\r\n', '').strip()  
        if 'Cr' in price:
            value = float(price.replace('Cr', '').strip()) 
            return int(value * 10000000) 
        elif 'Lac' in price:
            value = float(price.replace('Lac', '').strip())  
            return int(value * 100000)
        else:
            raise ValueError("Unknown format") 
    except Exception as e:
        print(f"Error processing price: {price}, Error: {e}")
        return None  

sale['Price (Cr)'] = sale['Price (Cr)'].apply(convert_price)


In [9]:
sale.head(5)

Unnamed: 0,Title (Address),Area (sq-ft),Price (Cr)
0,2 BHK Flat for Sale in Promont By Sheth Realty...,684.0,20700000
1,"2 BHK Flat for Sale in The Dynamix Luma, Andhe...",735.0,25000000
2,"2 BHK Flat for Sale in Dream Aspire, Andheri W...",713.0,23800000
3,"2 BHK Flat for Sale in Vihang Luxuria, Mira Ro...",684.0,12100000
4,"2 BHK Flat for Sale in RNA NG Grand Empire, Na...",580.0,11100000


In [10]:
sale.isnull().sum()

Title (Address)    0
Area (sq-ft)       0
Price (Cr)         0
dtype: int64

In [11]:
def extract_bhk_and_clean_title(title):
    import re
    bhk_match = re.search(r'(\d+)\s+BHK', title)  
    if bhk_match:
        bhk = int(bhk_match.group(1)) 
        title = re.sub(r'\d+\s+BHK', '', title).strip()  
        return bhk, title
    return None, title 

sale[['bhk', 'Title (Address)']] = sale['Title (Address)'].apply(
    lambda x: pd.Series(extract_bhk_and_clean_title(x))
)

In [12]:
sale.head()

Unnamed: 0,Title (Address),Area (sq-ft),Price (Cr),bhk
0,Flat for Sale in Promont By Sheth Realty and A...,684.0,20700000,2
1,"Flat for Sale in The Dynamix Luma, Andheri Eas...",735.0,25000000,2
2,"Flat for Sale in Dream Aspire, Andheri West, M...",713.0,23800000,2
3,"Flat for Sale in Vihang Luxuria, Mira Road Eas...",684.0,12100000,2
4,"Flat for Sale in RNA NG Grand Empire, Navghar ...",580.0,11100000,2


In [13]:
a=sale['bhk'].unique()
print(a)

[2 3 4 5 1]


In [14]:
def extract_city(address):
    parts = address.split(",")
    city = parts[-1].strip() if parts else ""
    
    if city == "New Delhi":
        return "New Delhi"
    elif city == "Delhi Ncr":
        return "Delhi Ncr"
    
    return city.split()[-1] 

sale['city'] = sale['Title (Address)'].apply(extract_city)



In [15]:
sale.head()

Unnamed: 0,Title (Address),Area (sq-ft),Price (Cr),bhk,city
0,Flat for Sale in Promont By Sheth Realty and A...,684.0,20700000,2,Mumbai
1,"Flat for Sale in The Dynamix Luma, Andheri Eas...",735.0,25000000,2,Mumbai
2,"Flat for Sale in Dream Aspire, Andheri West, M...",713.0,23800000,2,Mumbai
3,"Flat for Sale in Vihang Luxuria, Mira Road Eas...",684.0,12100000,2,Mumbai
4,"Flat for Sale in RNA NG Grand Empire, Navghar ...",580.0,11100000,2,Mumbai


In [16]:
a=sale['city'].unique()
print(a)

['Mumbai' 'Pune' 'New Delhi' 'Delhi' 'Bhopal' 'Chennai' 'Bangalore'
 'Kolkata' 'Hyderabad' 'Ahmedabad' 'Surat' 'Kanpur' 'Goa' 'Indore']


In [17]:
sale.rename(columns={'Title (Address)': 'Address'}, inplace=True)
sale.rename(columns={'Area (sq-ft)': 'Area'}, inplace=True)
sale.rename(columns={'Price (Cr)': 'Price'}, inplace=True)


In [18]:
sale.head()

Unnamed: 0,Address,Area,Price,bhk,city
0,Flat for Sale in Promont By Sheth Realty and A...,684.0,20700000,2,Mumbai
1,"Flat for Sale in The Dynamix Luma, Andheri Eas...",735.0,25000000,2,Mumbai
2,"Flat for Sale in Dream Aspire, Andheri West, M...",713.0,23800000,2,Mumbai
3,"Flat for Sale in Vihang Luxuria, Mira Road Eas...",684.0,12100000,2,Mumbai
4,"Flat for Sale in RNA NG Grand Empire, Navghar ...",580.0,11100000,2,Mumbai


In [19]:
sale['FirstField'] = sale['Address'].str.split(' ').str[0]

unique_first_fields = sale['FirstField'].unique()

print("Unique First Fields:", unique_first_fields)


Unique First Fields: ['Flat' 'Apartment' 'Builder' 'Villa' 'Penthouse' 'House']


In [20]:
sale.rename(columns={'FirstField': 'Property_Type'}, inplace=True)

In [21]:
sale.head()

Unnamed: 0,Address,Area,Price,bhk,city,Property_Type
0,Flat for Sale in Promont By Sheth Realty and A...,684.0,20700000,2,Mumbai,Flat
1,"Flat for Sale in The Dynamix Luma, Andheri Eas...",735.0,25000000,2,Mumbai,Flat
2,"Flat for Sale in Dream Aspire, Andheri West, M...",713.0,23800000,2,Mumbai,Flat
3,"Flat for Sale in Vihang Luxuria, Mira Road Eas...",684.0,12100000,2,Mumbai,Flat
4,"Flat for Sale in RNA NG Grand Empire, Navghar ...",580.0,11100000,2,Mumbai,Flat


In [22]:
# Model Training

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import pickle
from flask import Flask, request, jsonify


In [24]:
label_encoder_city = LabelEncoder()
sale['city'] = label_encoder_city.fit_transform(sale['city'])

label_encoder_property_type = LabelEncoder()
sale['Property_Type'] = label_encoder_property_type.fit_transform(sale['Property_Type'])


In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=100)
address_tfidf = tfidf_vectorizer.fit_transform(sale['Address'])
address_df = pd.DataFrame(address_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
sale = pd.concat([sale, address_df], axis=1)


In [None]:
sale.drop(['Address'], axis=1, inplace=True)

In [25]:
sale.shape

(4800, 105)

In [26]:
sale.isnull().sum()

Area             11
Price            11
bhk              11
city             11
Property_Type    11
                 ..
villa            11
village          11
wadala           11
wakad            11
west             11
Length: 105, dtype: int64

In [27]:
sale=sale.dropna()

In [28]:
sale.shape

(4778, 105)

In [29]:
X = sale.drop('Price', axis=1)
y = sale['Price']
print(f"X shape: {X.shape}, y shape: {y.shape}")  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train models
models = {
    "LinearRegression": LinearRegression(),
    "RandomForestRegressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
}

best_model = None
best_score = -float('inf')  


for name, model in models.items():
    model.fit(X_train, y_train)
    
    r2 = model.score(X_test, y_test)
    print(f"{name} R² score: {r2:.4f}")
    
    
    if r2 > best_score:
        best_score = r2
        best_model = model

# Save the best performing model
with open(f'{best_model.__class__.__name__}_best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Save LabelEncoders and TF-IDF Vectorizer
with open('sale_label_encoder_city.pkl', 'wb') as f:
    pickle.dump(label_encoder_city, f)

with open('sale_label_encoder_property_type.pkl', 'wb') as f:
    pickle.dump(label_encoder_property_type, f)

with open('sale_tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

print(f"Best model ({best_model.__class__.__name__}) saved as pickle file.")
print("Preprocessing and model training complete. Pickle files saved.")


X shape: (4778, 104), y shape: (4778,)
LinearRegression R² score: 0.3424
RandomForestRegressor R² score: 0.5689
DecisionTreeRegressor R² score: 0.4113
Best model (RandomForestRegressor) saved as pickle file.
Preprocessing and model training complete. Pickle files saved.
