In [4]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load and prepare the dataset
file_path = 'realtor_dataset.csv'
df = pd.read_csv(file_path)

# Clean the Rent column by removing non-numeric characters
df['Rent'] = df['Rent'].astype(str).apply(lambda x: re.sub(r'\D', '', x))
df['Rent'] = pd.to_numeric(df['Rent'], errors='coerce')

# Remove rows where 'Rent' is greater than 10,000,000 in "HSR Layout"
df = df[~((df['Location'] == 'HSR Layout') & (df['Rent'] > 10000000))]

# Define features and target
features = ['Location', 'No. of Bedroom', 'Parking', 'Furnishing Status', 'Gated Security', 'Total SqFt', 'Age of Building']
target = 'Rent'

# Drop rows where target is missing
df = df.dropna(subset=[target])

# Handle non-numeric 'Age of Building' by converting to a numeric value
df['Age of Building'] = df['Age of Building'].replace('Newly Constructed', 0)
df['Age of Building'] = pd.to_numeric(df['Age of Building'], errors='coerce')

# Handle missing values in features
numeric_features = ['Total SqFt', 'Age of Building']
categorical_features = ['Location', 'No. of Bedroom', 'Parking', 'Furnishing Status', 'Gated Security']

# Define preprocessor with imputation and one-hot encoding for categorical features
numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the Gradient Boosting model
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Split the data into training and testing sets
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Function to make predictions
def predict_rent(input_data):
    return pipeline.predict(input_data)

# Example prediction
example_input = pd.DataFrame({
    'Location': ['HSR Layout'],
    'No. of Bedroom': ['2 Bedroom'],
    'Parking': ['Bike and Car'],
    'Furnishing Status': ['Fully Furnished'],
    'Gated Security': ['Yes'],
    'Total SqFt': [1200],
    'Age of Building': [5]
})

predicted_rent = predict_rent(example_input)
print(f'Predicted Rent: {predicted_rent[0]}')


Mean Absolute Error: 7809.854010850651
Predicted Rent: 35433.136530941745


In [51]:
#### final gradient boosting
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load and prepare the dataset
file_path = 'realtor_dataset.csv'
df = pd.read_csv(file_path)

# Clean the Rent column by removing non-numeric characters
df['Rent'] = df['Rent'].astype(str).apply(lambda x: re.sub(r'\D', '', x))
df['Rent'] = pd.to_numeric(df['Rent'], errors='coerce')

# Remove rows where 'Rent' is greater than 10,000,000 in "HSR Layout"
df = df[~((df['Location'] == 'HSR Layout') & (df['Rent'] > 10000000))]

# Define all features except 'Rent' and the target variable 'Rent'
features = df.columns.drop(['Rent'])  # Ensure 'Rent' is not included as a feature
target = 'Rent'

# Drop rows where target is missing
df = df.dropna(subset=[target])

# Handle non-numeric 'Age of Building' by converting to a numeric value
df['Age of Building'] = df['Age of Building'].replace('Newly Constructed', 0)
df['Age of Building'] = pd.to_numeric(df['Age of Building'], errors='coerce')

# Print columns to ensure consistency
print("Columns in the DataFrame:", df.columns)

# Handle missing values in features
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

# Remove 'Rent' from numeric features if it was mistakenly included
if 'Rent' in numeric_features:
    numeric_features.remove('Rent')

# Print out the features being used
print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

# Define preprocessor with imputation and one-hot encoding for categorical features
numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the Gradient Boosting model
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Split the data into training and testing sets
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Function to make predictions with flexible input
def predict_rent(input_data):
    # Convert input data to DataFrame, fill missing columns with NaNs
    input_df = pd.DataFrame(input_data)
    missing_cols = [col for col in features if col not in input_df.columns]
    for col in missing_cols:
        input_df[col] = [None]
    
    # Reorder columns to match the training data
    input_df = input_df[features]
    
    # Predict using the trained pipeline
    return pipeline.predict(input_df)

# Example prediction with flexible input
example_input = {
    'Location': ['HSR Layout'],
    'No. of Bedroom': ['2 Bedroom'],
    'Parking': ['Bike and Car'],
    'Furnishing Status': ['Fully Furnished'],
    'Total SqFt': [1200]
    # You can omit or add any feature; the model will handle it
}

predicted_rent = predict_rent(example_input)
print(f'Predicted Rent: {predicted_rent[0]}')


Columns in the DataFrame: Index(['Age of Building', 'Water Supply', 'Rent', 'Transit Score',
       'Smoking Allowed', 'Location', 'Bathroom', 'Non-Veg Allowed',
       'Posted On', 'Parking', 'Property Type', 'Furnishing Status',
       'No. of Bedroom', 'Facing', 'Gated Security', 'Deposit',
       'Attached Bathroom', 'Livability Score', 'Balcony', 'URL',
       'Drinking Allowed', 'Possession', 'Total SqFt', 'AC', 'Floor',
       'Preferred Tenant', 'Room Type'],
      dtype='object')
Numeric features: ['Age of Building', 'Transit Score', 'Bathroom', 'Deposit', 'Livability Score', 'Total SqFt']
Categorical features: ['Water Supply', 'Smoking Allowed', 'Location', 'Non-Veg Allowed', 'Posted On', 'Parking', 'Property Type', 'Furnishing Status', 'No. of Bedroom', 'Facing', 'Gated Security', 'Attached Bathroom', 'Balcony', 'URL', 'Drinking Allowed', 'Possession', 'AC', 'Floor', 'Preferred Tenant', 'Room Type']




Mean Absolute Error: 6125.241901979707
Predicted Rent: 29749.76631694349




In [54]:
# Calculate the absolute errors
errors = abs(y_pred - y_test)

# Calculate the percentage of predictions with an error margin of less than 1000
percentage_within_1000 = (errors < 6000).mean() * 100

print(f'Percentage of predictions with error < 1000: {percentage_within_1000:.2f}%')


Percentage of predictions with error < 1000: 72.07%


In [45]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Load and prepare the dataset
file_path = 'realtor_dataset.csv'
df = pd.read_csv(file_path)

# Clean the Rent column by removing non-numeric characters
df['Rent'] = df['Rent'].astype(str).apply(lambda x: re.sub(r'\D', '', x))
df['Rent'] = pd.to_numeric(df['Rent'], errors='coerce')

# Remove rows where 'Rent' is greater than 10,000,000 in "HSR Layout"
df = df[~((df['Location'] == 'HSR Layout') & (df['Rent'] > 10000000))]

# Define all features except 'Rent' and the target variable 'Rent'
features = df.columns.drop(['Rent'])  # Ensure 'Rent' is not included as a feature
target = 'Rent'

# Drop rows where target is missing
df = df.dropna(subset=[target])

# Handle non-numeric 'Age of Building' by converting to a numeric value
df['Age of Building'] = df['Age of Building'].replace('Newly Constructed', 0)
df['Age of Building'] = pd.to_numeric(df['Age of Building'], errors='coerce')

# Handle missing values in features
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

# Remove 'Rent' from numeric features if it was mistakenly included
if 'Rent' in numeric_features:
    numeric_features.remove('Rent')

# Define preprocessor with imputation, scaling, and one-hot encoding
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Convert directly to dense
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the data
X = df[features]
y = df[target]
X = preprocessor.fit_transform(X)

# Convert to dense array if it's still sparse
X = X.toarray() if hasattr(X, 'toarray') else X

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Convert y_train to a NumPy array
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Define the neural network model
model = Sequential()
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.05))
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.05))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.05))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss='mean_absolute_error')

# Train the model
history = model.fit(X_train, y_train, epochs=1, validation_split=0.2, batch_size=10, verbose=1)

# Evaluate the model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Function to make predictions with flexible input using the trained model
def predict_rent(input_data):
    input_df = pd.DataFrame(input_data)
    missing_cols = [col for col in features if col not in input_df.columns]
    for col in missing_cols:
        input_df[col] = [None]
    
    input_df = preprocessor.transform(input_df)
    input_df = input_df.toarray() if hasattr(input_df, 'toarray') else input_df
    return model.predict(input_df).flatten()[0]

# Example prediction with flexible input
example_input = {
    'Location': ['HSR Layout'],
    'No. of Bedroom': ['2 Bedroom'],
    'Parking': ['Bike and Car'],
    'Furnishing Status': ['Fully Furnished'],
    'Total SqFt': [1000]
}

predicted_rent = predict_rent(example_input)
print(f'Predicted Rent: {predicted_rent}')


Mean Absolute Error: 8523.488961658226
Predicted Rent: 10980.6396484375


In [22]:
# Example prediction with flexible input
example_input = {
    'Location': ['HSR Layout'],
    'No. of Bedroom': ['2 Bedroom'],
    'Parking': ['Bike and Car'],
    'Furnishing Status': ['Fully Furnished'],
    'Total SqFt': [1000]
}

predicted_rent = predict_rent(example_input)
print(f'Predicted Rent: {predicted_rent}')

Predicted Rent: 19797.6328125


In [46]:
### better neural net approach-currently testing
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load and prepare the dataset
file_path = 'realtor_dataset.csv'
df = pd.read_csv(file_path)

# Clean the Rent column by removing non-numeric characters
df['Rent'] = df['Rent'].astype(str).apply(lambda x: re.sub(r'\D', '', x))
df['Rent'] = pd.to_numeric(df['Rent'], errors='coerce')

# Remove rows where 'Rent' is greater than 10,000,000 in "HSR Layout"
df = df[~((df['Location'] == 'HSR Layout') & (df['Rent'] > 10000000))]

# Define all features except 'Rent' and the target variable 'Rent'
features = df.columns.drop(['Rent'])  # Ensure 'Rent' is not included as a feature
target = 'Rent'

# Drop rows where target is missing
df = df.dropna(subset=[target])

# Handle non-numeric 'Age of Building' by converting to a numeric value
df['Age of Building'] = df['Age of Building'].replace('Newly Constructed', 0)
df['Age of Building'] = pd.to_numeric(df['Age of Building'], errors='coerce')

# Handle missing values in features
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

# Remove 'Rent' from numeric features if it was mistakenly included
if 'Rent' in numeric_features:
    numeric_features.remove('Rent')

# Define preprocessor with imputation, scaling, and one-hot encoding
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Convert directly to dense
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the data
X = df[features]
y = df[target]
X = preprocessor.fit_transform(X)

# Convert to dense array if it's still sparse
X = X.toarray() if hasattr(X, 'toarray') else X

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Convert y_train to a NumPy array
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Define the neural network model with adjusted learning rate
model = Sequential()
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

# Compile the model with a custom learning rate
learning_rate = 0.001  # Adjust this value to optimize training
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='mean_absolute_error')

# Train the model with adjusted batch size and epochs
history = model.fit(X_train, y_train, epochs=15, validation_split=0.2, batch_size=1, verbose=1)

# Evaluate the model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Function to make predictions with flexible input using the trained model
def predict_rent(input_data):
    input_df = pd.DataFrame(input_data)
    missing_cols = [col for col in features if col not in input_df.columns]
    for col in missing_cols:
        input_df[col] = [None]
    
    input_df = preprocessor.transform(input_df)
    input_df = input_df.toarray() if hasattr(input_df, 'toarray') else input_df
    return model.predict(input_df).flatten()[0]

# Example prediction with flexible input
example_input = {
    'Location': ['HSR Layout'],
    'No. of Bedroom': ['2 Bedroom'],
    'Parking': ['Bike and Car'],
    'Furnishing Status': ['Fully Furnished'],
    'Total SqFt': [1000]
}

predicted_rent = predict_rent(example_input)
print(f'Predicted Rent: {predicted_rent}')


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Mean Absolute Error: 6804.783410432992
Predicted Rent: 14925.326171875


In [50]:
# Calculate the absolute errors
errors = abs(y_pred - y_test)

# Calculate the percentage of predictions with an error margin of less than 1000
percentage_within_1000 = (errors > 6000).mean() * 100

print(f'Percentage of predictions with error < 1000: {percentage_within_1000:.2f}%')

Percentage of predictions with error < 1000: 72.21%


In [31]:
# Filter the dataset for HSR Layout
hsr_data = df[df['Location'] == 'HSR Layout']

# Group by 'No. of Bedroom' and calculate the mean rent
average_rent_by_bhk = hsr_data.groupby('No. of Bedroom')['Rent'].mean()

print(average_rent_by_bhk)


No. of Bedroom
1 Bedroom     17532.032051
2 Bedroom     29876.262626
3 Bedroom     58972.222222
4 Bedroom    176200.000000
Name: Rent, dtype: float64


In [55]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load and prepare the dataset
file_path = 'realtor_dataset.csv'
df = pd.read_csv(file_path)

# Clean the Rent column by removing non-numeric characters
df['Rent'] = df['Rent'].astype(str).apply(lambda x: re.sub(r'\D', '', x))
df['Rent'] = pd.to_numeric(df['Rent'], errors='coerce')

# Remove rows where 'Rent' is greater than 10,000,000 in "HSR Layout"
df = df[~((df['Location'] == 'HSR Layout') & (df['Rent'] > 10000000))]

# Define all features except 'Rent' and the target variable 'Rent'
features = df.columns.drop(['Rent'])  # Ensure 'Rent' is not included as a feature
target = 'Rent'

# Drop rows where target is missing
df = df.dropna(subset=[target])

# Handle non-numeric 'Age of Building' by converting to a numeric value
df['Age of Building'] = df['Age of Building'].replace('Newly Constructed', 0)
df['Age of Building'] = pd.to_numeric(df['Age of Building'], errors='coerce')

# Print columns to ensure consistency
print("Columns in the DataFrame:", df.columns)

# Handle missing values in features
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

# Remove 'Rent' from numeric features if it was mistakenly included
if 'Rent' in numeric_features:
    numeric_features.remove('Rent')

# Print out the features being used
print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

# Define preprocessor with imputation and one-hot encoding for categorical features
numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Split the data into training and testing sets
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Function to make predictions with flexible input
def predict_rent(input_data):
    # Convert input data to DataFrame, fill missing columns with NaNs
    input_df = pd.DataFrame(input_data)
    missing_cols = [col for col in features if col not in input_df.columns]
    for col in missing_cols:
        input_df[col] = [None]
    
    # Reorder columns to match the training data
    input_df = input_df[features]
    
    # Predict using the trained pipeline
    return pipeline.predict(input_df)

# Example prediction with flexible input
example_input = {
    'Location': ['HSR Layout'],
    'No. of Bedroom': ['2 Bedroom'],
    'Parking': ['Bike and Car'],
    'Furnishing Status': ['Fully Furnished'],
    'Total SqFt': [1200]
    # You can omit or add any feature; the model will handle it
}

predicted_rent = predict_rent(example_input)
print(f'Predicted Rent: {predicted_rent[0]}')


Columns in the DataFrame: Index(['Age of Building', 'Water Supply', 'Rent', 'Transit Score',
       'Smoking Allowed', 'Location', 'Bathroom', 'Non-Veg Allowed',
       'Posted On', 'Parking', 'Property Type', 'Furnishing Status',
       'No. of Bedroom', 'Facing', 'Gated Security', 'Deposit',
       'Attached Bathroom', 'Livability Score', 'Balcony', 'URL',
       'Drinking Allowed', 'Possession', 'Total SqFt', 'AC', 'Floor',
       'Preferred Tenant', 'Room Type'],
      dtype='object')
Numeric features: ['Age of Building', 'Transit Score', 'Bathroom', 'Deposit', 'Livability Score', 'Total SqFt']
Categorical features: ['Water Supply', 'Smoking Allowed', 'Location', 'Non-Veg Allowed', 'Posted On', 'Parking', 'Property Type', 'Furnishing Status', 'No. of Bedroom', 'Facing', 'Gated Security', 'Attached Bathroom', 'Balcony', 'URL', 'Drinking Allowed', 'Possession', 'AC', 'Floor', 'Preferred Tenant', 'Room Type']




Mean Absolute Error: 6184.012212028542
Predicted Rent: 30060.0




In [116]:
### random forest best
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Load and prepare the dataset
file_path = 'realtor_dataset.csv'
df = pd.read_csv(file_path)

# Clean the Rent column by removing non-numeric characters
df['Rent'] = df['Rent'].astype(str).apply(lambda x: re.sub(r'\D', '', x))
df['Rent'] = pd.to_numeric(df['Rent'], errors='coerce')

# Remove rows where 'Rent' is greater than 10,000,000 in "HSR Layout"
df = df[~((df['Location'] == 'HSR Layout') & (df['Rent'] > 10000000))]

# Drop rows where target is missing
df = df.dropna(subset=['Rent'])

# Handle non-numeric 'Age of Building' by converting to a numeric value
df['Age of Building'] = df['Age of Building'].replace('Newly Constructed', 0)
df['Age of Building'] = pd.to_numeric(df['Age of Building'], errors='coerce')

# Add the Cost_per_SqFt column
df['Cost_per_SqFt'] = df['Rent'] / df['Total SqFt']

# Replace infinite values that might arise from division by zero
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Handle missing values in Cost_per_SqFt
df['Cost_per_SqFt'] = df['Cost_per_SqFt'].fillna(df['Cost_per_SqFt'].median())

# Handle missing values in other features
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

# Remove 'Rent' from numeric features if it was mistakenly included
if 'Rent' in numeric_features:
    numeric_features.remove('Rent')

# Define preprocessor with imputation, scaling, and one-hot encoding
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the data
X = df.drop(columns=['Rent'])  # Features include the new Cost_per_SqFt column
y = df['Rent']
X = preprocessor.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train a Random Forest model
rf_model = RandomForestRegressor(random_state=0)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')


Mean Absolute Error: 1872.6288379204893


In [88]:
### best using gradient boosting
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Load and prepare the dataset
file_path = 'realtor_dataset.csv'
df = pd.read_csv(file_path)

# Clean the Rent column by removing non-numeric characters
df['Rent'] = df['Rent'].astype(str).apply(lambda x: re.sub(r'\D', '', x))
df['Rent'] = pd.to_numeric(df['Rent'], errors='coerce')

# Remove rows where 'Rent' is greater than 10,000,000 in "HSR Layout"
df = df[~((df['Location'] == 'HSR Layout') & (df['Rent'] > 10000000))]

# Drop rows where target is missing
df = df.dropna(subset=['Rent'])

# Handle non-numeric 'Age of Building' by converting to a numeric value
df['Age of Building'] = df['Age of Building'].replace('Newly Constructed', 0)
df['Age of Building'] = pd.to_numeric(df['Age of Building'], errors='coerce')

# Add the Cost_per_SqFt column
df['Cost_per_SqFt'] = df['Rent'] / df['Total SqFt']

# Replace infinite values that might arise from division by zero
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Handle missing values in Cost_per_SqFt
df['Cost_per_SqFt'] = df['Cost_per_SqFt'].fillna(df['Cost_per_SqFt'].median())

# Handle missing values in other features
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

# Remove 'Rent' from numeric features if it was mistakenly included
if 'Rent' in numeric_features:
    numeric_features.remove('Rent')

# Define preprocessor with imputation, scaling, and one-hot encoding
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the data
X = df.drop(columns=['Rent'])  # Features include the new Cost_per_SqFt column
y = df['Rent']
X = preprocessor.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train a Gradient Boosting model
gb_model = GradientBoostingRegressor(random_state=0)
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')


Mean Absolute Error: 2193.2871066459247


In [98]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Load and prepare the dataset
file_path = 'realtor_dataset.csv'
df = pd.read_csv(file_path)

# Clean the Rent column by removing non-numeric characters
df['Rent'] = df['Rent'].astype(str).apply(lambda x: re.sub(r'\D', '', x))
df['Rent'] = pd.to_numeric(df['Rent'], errors='coerce')

# Remove rows where 'Rent' is greater than 10,000,000 in "HSR Layout"
df = df[~((df['Location'] == 'HSR Layout') & (df['Rent'] > 10000000))]

# Drop rows where target is missing
df = df.dropna(subset=['Rent'])

# Handle non-numeric 'Age of Building' by converting to a numeric value
df['Age of Building'] = df['Age of Building'].replace('Newly Constructed', 0)
df['Age of Building'] = pd.to_numeric(df['Age of Building'], errors='coerce')

# Add the Cost_per_SqFt column
df['Cost_per_SqFt'] = df['Rent'] / df['Total SqFt']

# Replace infinite values that might arise from division by zero
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Handle missing values in Cost_per_SqFt
df['Cost_per_SqFt'] = df['Cost_per_SqFt'].fillna(df['Cost_per_SqFt'].median())

# Handle missing values in other features
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

# Remove 'Rent' from numeric features if it was mistakenly included
if 'Rent' in numeric_features:
    numeric_features.remove('Rent')

# Ensure 'Cost_per_SqFt' is included in numeric features
if 'Cost_per_SqFt' not in numeric_features:
    numeric_features.append('Cost_per_SqFt')

# Define preprocessor with imputation, scaling, and one-hot encoding
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the data
X = df.drop(columns=['Rent'])  # Features include the new Cost_per_SqFt column
y = df['Rent']
X = preprocessor.fit_transform(X)

# Convert to dense array if it's still sparse
X = X.toarray() if hasattr(X, 'toarray') else X

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Convert y_train and y_test to NumPy arrays
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Define the neural network model
model = Sequential()
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss='mean_absolute_error')

# Train the model
history = model.fit(X_train, y_train, epochs=16, validation_split=0.2, batch_size=10, verbose=1)

# Evaluate the model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Function to make predictions with flexible input using the trained model
def predict_rent(input_data):
    input_df = pd.DataFrame(input_data)
    missing_cols = [col for col in df.columns if col not in input_df.columns]
    for col in missing_cols:
        input_df[col] = [None]
    
    input_df = preprocessor.transform(input_df)
    input_df = input_df.toarray() if hasattr(input_df, 'toarray') else input_df
    return model.predict(input_df).flatten()[0]

# Example prediction with flexible input
example_input = {
    'Location': ['HSR Layout'],
    'No. of Bedroom': ['2 Bedroom'],
    'Parking': ['Bike and Car'],
    'Furnishing Status': ['Fully Furnished'],
    'Total SqFt': [1000],
    'Cost_per_SqFt': [1000/1000]  # Example value, should match calculated value
}

predicted_rent = predict_rent(example_input)
print(f'Predicted Rent: {predicted_rent}')


Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Mean Absolute Error: 3757.9175723014623
Predicted Rent: 108.45211791992188


In [102]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Load and prepare the dataset
file_path = 'realtor_dataset.csv'
df = pd.read_csv(file_path)

# Clean the Rent column by removing non-numeric characters
df['Rent'] = df['Rent'].astype(str).apply(lambda x: re.sub(r'\D', '', x))
df['Rent'] = pd.to_numeric(df['Rent'], errors='coerce')

# Remove rows where 'Rent' is greater than 10,000,000 in "HSR Layout"
df = df[~((df['Location'] == 'HSR Layout') & (df['Rent'] > 10000000))]

# Drop rows where target is missing
df = df.dropna(subset=['Rent'])

# Handle non-numeric 'Age of Building' by converting to a numeric value
df['Age of Building'] = df['Age of Building'].replace('Newly Constructed', 0)
df['Age of Building'] = pd.to_numeric(df['Age of Building'], errors='coerce')

# Add the Cost_per_SqFt column
df['Cost_per_SqFt'] = df['Rent'] / df['Total SqFt']

# Replace infinite values that might arise from division by zero
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Handle missing values in Cost_per_SqFt
df['Cost_per_SqFt'] = df['Cost_per_SqFt'].fillna(df['Cost_per_SqFt'].median())

# Handle missing values in other features
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

# Remove 'Rent' from numeric features if it was mistakenly included
if 'Rent' in numeric_features:
    numeric_features.remove('Rent')

# Ensure 'Cost_per_SqFt' is included in numeric features
if 'Cost_per_SqFt' not in numeric_features:
    numeric_features.append('Cost_per_SqFt')

# Update the pipeline to handle missing values in categorical features by imputing 'none'
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='none')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the data
X = df.drop(columns=['Rent'])  # Features include the new Cost_per_SqFt column
y = df['Rent']
X = preprocessor.fit_transform(X)

# Convert to dense array if it's still sparse
X = X.toarray() if hasattr(X, 'toarray') else X

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Convert y_train and y_test to NumPy arrays
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Define the neural network model
model = Sequential()
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss='mean_absolute_error')

# Train the model
history = model.fit(X_train, y_train, epochs=20, validation_split=0.2, batch_size=10, verbose=1)

# Evaluate the model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Function to make predictions with flexible input using the trained model
def predict_rent(input_data):
    input_df = pd.DataFrame(input_data)
    missing_cols = [col for col in df.columns if col not in input_df.columns]
    for col in missing_cols:
        input_df[col] = [None]
    
    input_df = preprocessor.transform(input_df)
    input_df = input_df.toarray() if hasattr(input_df, 'toarray') else input_df
    return model.predict(input_df).flatten()[0]

# Example prediction with flexible input
example_input = {
    'Location': ['HSR Layout'],
    'No. of Bedroom': ['2 Bedroom'],
    'Parking': ['Bike and Car'],
    'Furnishing Status': ['Fully Furnished'],
    'Total SqFt': [1000],
    'Cost_per_SqFt': [1000/1000]  # Example value, should match calculated value
}

predicted_rent = predict_rent(example_input)
print(f'Predicted Rent: {predicted_rent}')


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Mean Absolute Error: 3784.0964843252264
Predicted Rent: 9.092270851135254


In [130]:
# Calculate the absolute errors
errors = abs(y_pred - y_test)

# Calculate the percentage of predictions with an error margin of less than 1000
percentage_within_1000 = (errors < 500).mean() * 100

print(f'Percentage of predictions with error < 1000: {percentage_within_1000:.2f}%')

Percentage of predictions with error < 1000: 67.58%


In [75]:
a=df['Property Type'].unique()

In [73]:
df.columns

Index(['Age of Building', 'Water Supply', 'Rent', 'Transit Score',
       'Smoking Allowed', 'Location', 'Bathroom', 'Non-Veg Allowed',
       'Posted On', 'Parking', 'Property Type', 'Furnishing Status',
       'No. of Bedroom', 'Facing', 'Gated Security', 'Deposit',
       'Attached Bathroom', 'Livability Score', 'Balcony', 'URL',
       'Drinking Allowed', 'Possession', 'Total SqFt', 'AC', 'Floor',
       'Preferred Tenant', 'Room Type'],
      dtype='object')

In [76]:
a

array(['Apartment', 'Independent House/villa', 'Gated Community',
       'Independent Floor/builder Floor', 'Standalone Building'],
      dtype=object)

In [77]:
df.isnull().sum()

Age of Building      4026
Water Supply         1868
Rent                    0
Transit Score         189
Smoking Allowed      4903
Location                0
Bathroom                1
Non-Veg Allowed         0
Posted On               0
Parking               444
Property Type           0
Furnishing Status       0
No. of Bedroom          1
Facing               2070
Gated Security          0
Deposit                 0
Attached Bathroom    4903
Livability Score      189
Balcony              2699
URL                     0
Drinking Allowed     4903
Possession              0
Total SqFt              0
AC                   4903
Floor                   0
Preferred Tenant        0
Room Type            4903
dtype: int64

In [80]:
filtered_data = df[(df['Location'] == 'HSR Layout') & (df['No. of Bedroom'] == '2 Bedroom')]

# Print the filtered data
print(filtered_data['Property Type'].unique())

['Independent House/villa' 'Apartment' 'Independent Floor/builder Floor']


In [104]:
df.isnull().sum()

Age of Building      4026
Water Supply         1868
Rent                    0
Transit Score         189
Smoking Allowed      4903
Location                0
Bathroom                1
Non-Veg Allowed         0
Posted On               0
Parking               444
Property Type           0
Furnishing Status       0
No. of Bedroom          1
Facing               2070
Gated Security          0
Deposit                 0
Attached Bathroom    4903
Livability Score      189
Balcony              2699
URL                     0
Drinking Allowed     4903
Possession              0
Total SqFt              0
AC                   4903
Floor                   0
Preferred Tenant        0
Room Type            4903
Cost_per_SqFt           0
dtype: int64

In [108]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

# Load and prepare the dataset
file_path = 'realtor_dataset.csv'
df = pd.read_csv(file_path)

# Clean the Rent column by removing non-numeric characters
df['Rent'] = df['Rent'].astype(str).apply(lambda x: re.sub(r'\D', '', x))
df['Rent'] = pd.to_numeric(df['Rent'], errors='coerce')

# Remove rows where 'Rent' is greater than 10,000,000 in "HSR Layout"
df = df[~((df['Location'] == 'HSR Layout') & (df['Rent'] > 10000000))]

# Drop rows where target or 'No. of Bedroom' is missing
df = df.dropna(subset=['Rent', 'No. of Bedroom'])

# Handle non-numeric 'Age of Building' by converting to a numeric value
df['Age of Building'] = df['Age of Building'].replace('Newly Constructed', 0)
df['Age of Building'] = pd.to_numeric(df['Age of Building'], errors='coerce')

# Add the Cost_per_SqFt column
df['Cost_per_SqFt'] = df['Rent'] / df['Total SqFt']

# Replace infinite values that might arise from division by zero
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Handle missing values in Cost_per_SqFt
df['Cost_per_SqFt'] = df['Cost_per_SqFt'].fillna(df['Cost_per_SqFt'].median())

# Drop columns with all missing values
df = df.dropna(axis=1, how='all')

# Custom Transformer for grouped imputation
class GroupImputer(BaseEstimator, TransformerMixin):
    def __init__(self, group_cols, impute_col):
        self.group_cols = group_cols
        self.impute_col = impute_col
        self.fill_values = {}

    def fit(self, X, y=None):
        self.fill_values = (
            X.groupby(self.group_cols)[self.impute_col]
            .agg(lambda x: x.value_counts().index[0] if not x.isnull().all() else np.nan)
            .to_dict()
        )
        return self

    def transform(self, X):
        for key, value in self.fill_values.items():
            mask = (X[self.group_cols[0]] == key[0]) & (X[self.group_cols[1]] == key[1])
            X.loc[mask, self.impute_col] = X.loc[mask, self.impute_col].fillna(value)
        return X

# Apply grouped imputation for categorical variables
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

for col in categorical_features:
    if col not in ['Location', 'No. of Bedroom']:  # Exclude grouping columns themselves
        imputer = GroupImputer(group_cols=['Location', 'No. of Bedroom'], impute_col=col)
        df = imputer.fit_transform(df)

# Additional step: Impute any remaining missing values in numeric columns
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
imputer_numeric = SimpleImputer(strategy='mean')
df[numeric_features] = imputer_numeric.fit_transform(df[numeric_features])

# Prepare features and target
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

# Remove 'Rent' from numeric features if it was mistakenly included
if 'Rent' in numeric_features:
    numeric_features.remove('Rent')

# Ensure 'Cost_per_SqFt' is included in numeric features
if 'Cost_per_SqFt' not in numeric_features:
    numeric_features.append('Cost_per_SqFt')

# Define preprocessor with scaling and one-hot encoding
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the data
X = df.drop(columns=['Rent'])  # Features include the new Cost_per_SqFt column
y = df['Rent']
X = preprocessor.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [143]:
#### Best version 1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

# Load and prepare the dataset
file_path = 'realtor_dataset.csv'
df = pd.read_csv(file_path)

# Clean the Rent column by removing non-numeric characters
df['Rent'] = df['Rent'].astype(str).apply(lambda x: re.sub(r'\D', '', x))
df['Rent'] = pd.to_numeric(df['Rent'], errors='coerce')

# Remove rows where 'Rent' is greater than 10,000,000 in "HSR Layout"
df = df[~((df['Location'] == 'HSR Layout') & (df['Rent'] > 10000000))]

# Drop rows where target or 'No. of Bedroom' is missing
df = df.dropna(subset=['Rent', 'No. of Bedroom'])

# Handle non-numeric 'Age of Building' by converting to a numeric value
df['Age of Building'] = df['Age of Building'].replace('Newly Constructed', 0)
df['Age of Building'] = pd.to_numeric(df['Age of Building'], errors='coerce')

# Add the Cost_per_SqFt column
df['Cost_per_SqFt'] = df['Rent'] / df['Total SqFt']

# Replace infinite values that might arise from division by zero
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Handle missing values in Cost_per_SqFt
df['Cost_per_SqFt'] = df['Cost_per_SqFt'].fillna(df['Cost_per_SqFt'].median())

# Drop columns with all missing values
df = df.dropna(axis=1, how='all')

# Custom Transformer for grouped imputation
class GroupImputer(BaseEstimator, TransformerMixin):
    def __init__(self, group_cols, impute_col):
        self.group_cols = group_cols
        self.impute_col = impute_col
        self.fill_values = {}

    def fit(self, X, y=None):
        self.fill_values = (
            X.groupby(self.group_cols)[self.impute_col]
            .agg(lambda x: x.value_counts().index[0] if not x.isnull().all() else np.nan)
            .to_dict()
        )
        return self

    def transform(self, X):
        for key, value in self.fill_values.items():
            mask = (X[self.group_cols[0]] == key[0]) & (X[self.group_cols[1]] == key[1])
            X.loc[mask, self.impute_col] = X.loc[mask, self.impute_col].fillna(value)
        return X

# Apply grouped imputation for categorical variables
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

for col in categorical_features:
    if col not in ['Location', 'No. of Bedroom']:  # Exclude grouping columns themselves
        imputer = GroupImputer(group_cols=['Location', 'No. of Bedroom'], impute_col=col)
        df = imputer.fit_transform(df)

# Additional step: Impute any remaining missing values in numeric columns
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
imputer_numeric = SimpleImputer(strategy='mean')
df[numeric_features] = imputer_numeric.fit_transform(df[numeric_features])

# Prepare features and target
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

# Remove 'Rent' from numeric features if it was mistakenly included
if 'Rent' in numeric_features:
    numeric_features.remove('Rent')

# Ensure 'Cost_per_SqFt' is included in numeric features
if 'Cost_per_SqFt' not in numeric_features:
    numeric_features.append('Cost_per_SqFt')

# Define preprocessor with scaling and one-hot encoding
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the data
X = df.drop(columns=['Rent'])  # Features include the new Cost_per_SqFt column
y = df['Rent']
X = preprocessor.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [151]:
df.shape

(4903, 23)

In [144]:
# Random Forest Model
rf_model = RandomForestRegressor(random_state=0)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f'Random Forest - Mean Absolute Error: {mae_rf}')


Random Forest - Mean Absolute Error: 1676.3486646279307


In [150]:
# Calculate the absolute errors
errors = abs(y_pred_rf - y_test)

# Calculate the percentage of predictions with an error margin of less than 1000
percentage_within_1000 = (errors < 2000).mean() * 100

print(f'Percentage of predictions with error < 1000: {percentage_within_1000:.2f}%')

Percentage of predictions with error < 1000: 92.76%


In [112]:
# Gradient Boosting Model
gb_model = GradientBoostingRegressor(random_state=0)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
print(f'Gradient Boosting - Mean Absolute Error: {mae_gb}')

Gradient Boosting - Mean Absolute Error: 1983.5221368649113


In [115]:
# Calculate the absolute errors
errors = abs(y_pred_gb - y_test)

# Calculate the percentage of predictions with an error margin of less than 1000
percentage_within_1000 = (errors < 1000).mean() * 100

print(f'Percentage of predictions with error < 1000: {percentage_within_1000:.2f}%')

Percentage of predictions with error < 1000: 59.94%
