In [3]:

import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
import re



In [2]:
rent = pd.read_csv('merged_rent_housing_data.csv')

In [4]:
rent.head()

Unnamed: 0,Title (Address),BathRoom,Price (Cr),Furnished
0,"2 BHK Flat for Rent in Siolim Plot, Siolim, Goa",1.0,"₹\r\n40,000",Furnished
1,"2 BHK Flat for Rent in Caranzalem, Goa",1.0,"₹\r\n35,000",Furnished
2,"2 BHK Flat for Rent in Laguna Azul, Vasco Da G...",1.0,"₹\r\n45,000",Furnished
3,2 BHK Villa for Rent in Revora Goa,1.0,"₹\r\n65,000",Furnished
4,"3 BHK Flat for Rent in Kamat Riviera, Caranzal...",1.0,"₹\r\n90,000",Furnished


In [9]:


def process_price(price):
    if not isinstance(price, str):
        return None
    price = price.replace('₹\r\n', '').strip()
    if 'Lac' in price:
        return float(price.replace('Lac', '').strip()) * 100000
    elif price.replace(',', '').strip().isnumeric():
        return float(price.replace(',', '').strip())
    return None

rent['Price_Numerical'] = rent['Price (Cr)'].apply(process_price)

rent = rent.drop(['Price (Cr)'], axis=1)

def extract_bhk(address):
    match = re.search(r'(\d+)\s*BHK', str(address))
    return int(match.group(1)) if match else None

if 'Title (Address)' in rent.columns:
    rent['bhk'] = rent['Title (Address)'].apply(extract_bhk)
else:
    print("Column 'Title (Address)' not found!")


def extract_city(address):
    parts = address.split(",")
    city = parts[-1].strip() if parts else ""
   
    if city == "New Delhi":
        return "New Delhi"
    elif city == "Delhi Ncr":
        return "Delhi Ncr"
    
    return city.split()[-1]

rent['city'] = rent['Title (Address)'].apply(extract_city)

# Drop rows with missing essential data
rent = rent.dropna()


# Encode the 'Furnished' column
furnished_encoder = LabelEncoder()
rent['Furnished_Numerical'] = furnished_encoder.fit_transform(rent['Furnished'])

# Fill missing Bathroom values
rent['BathRoom'] = rent.apply(
    lambda row: row['bhk'] if pd.isnull(row['BathRoom']) else row['BathRoom'],
    axis=1
)
rent.rename(columns={'Title (Address)': 'Address'}, inplace=True)


# TF-IDF Vectorizer for 'Address'
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
address_tfidf = tfidf_vectorizer.fit_transform(rent['Address'])
address_tfidf_df = pd.DataFrame(
    address_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=rent.index
)

# One-hot encode the 'city' column
city_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
city_encoded = city_encoder.fit_transform(rent[['city']])
city_encoded_df = pd.DataFrame(
    city_encoded,
    columns=city_encoder.get_feature_names_out(['city']),
    index=rent.index
)

# Weightage for city and bhk
CITY_WEIGHT = 8
BHK_WEIGHT = 5


city_encoded_df_weighted = city_encoded_df * CITY_WEIGHT

rent['bhk_weighted'] = rent['bhk'] * BHK_WEIGHT

# Combine weighted and other features
X = pd.concat([
    rent[['bhk_weighted', 'BathRoom', 'Furnished_Numerical']],
    address_tfidf_df,
    city_encoded_df_weighted
], axis=1)
y = rent['Price_Numerical']

# Drop rows with missing data in X or y
X = X.dropna()
y = y.loc[X.index]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model selection and training
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

best_model = None
best_r2 = -float('inf')
best_model_name = ''

for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    r2 = r2_score(y_test, y_pred)
    print(f'{model_name} R²: {r2:.4f}')
    
    if r2 > best_r2:
        best_r2 = r2
        best_model = model
        best_model_name = model_name

print(f'Best model: {best_model_name} with R²: {best_r2:.4f}')

# Save the best model, scaler, vectorizer, and encoders
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

with open('city_encoder.pkl', 'wb') as f:
    pickle.dump(city_encoder, f)

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(furnished_encoder, f)

print("Model and encoders saved successfully.")



Random Forest R²: 0.4393
Linear Regression R²: 0.4023
Decision Tree R²: 0.3331
Gradient Boosting R²: 0.4463
Best model: Gradient Boosting with R²: 0.4463
Model and encoders saved successfully.
