In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model
from sklearn.preprocessing import  PolynomialFeatures
from sklearn.model_selection import train_test_split

## Loading the Data

In [None]:
# Load the rent listings; the relative path means the CSV must sit in the
# notebook's working directory.
data = pd.read_csv("House_Rent_Dataset.csv")

## Basic Inspection

In [None]:
# Preview the first ten rows to get a feel for the columns and their raw values
data.head(10)

In [None]:
# Column dtypes, non-null counts and memory usage
data.info()

In [None]:
# Number of missing values per column
data.isnull().sum()

## Simple Exploratory Data Analysis

In [None]:
# Distribution of listings by BHK value.
# The counts are computed once and reused for both the bar positions and heights.
bhk_counts = data['BHK'].value_counts().sort_index()

fig, ax = plt.subplots()
ax.bar(bhk_counts.index, bhk_counts.values)
ax.set(title="Listings per BHK", xlabel="BHK", ylabel="Number of listings")
plt.show()

In [None]:
# Distribution of the target. `sns.distplot` is deprecated; `histplot` with a
# KDE overlay and density normalisation is its documented replacement.
sns.histplot(data['Rent'], kde=True, stat='density')

In [None]:
# Distribution of the 'Size' column. `sns.distplot` is deprecated; `histplot`
# with a KDE overlay and density normalisation is its documented replacement.
sns.histplot(data['Size'], kde=True, stat='density')

In [None]:
# Frequency of each raw 'Floor' string (parsed into numeric columns later)
data['Floor'].value_counts()

In [None]:
# Frequency of each furnishing category
data['Furnishing Status'].value_counts()

In [None]:
# Frequency of each locality name
data['Area Locality'].value_counts()

In [None]:
# Number of listings per city
data['City'].value_counts()

## Preprocessing

### Addition and Modification of Columns

In [None]:
# Floor & Total Floors Columns

# Rewrites the textual "Ground" floor label as the number 1
def update_floor_value(value):
    """Return ``value`` with every occurrence of 'Ground' replaced by '1'.

    ``value`` is a raw floor description string such as 'Ground out of 2';
    the result for that input is '1 out of 2'. Strings that do not contain
    'Ground' are returned unchanged.
    """
    ground_label, ground_number = 'Ground', '1'
    return value.replace(ground_label, ground_number)

# Exclude rows containing 'Upper Basement' or 'Lower Basement'.
# .copy() makes the result an independent frame so the column assignments
# below do not operate on a slice (SettingWithCopyWarning).
data = data[~data['Floor'].str.contains('Upper Basement|Lower Basement')].copy()

# Update floor values ("Ground" -> "1")
# NOTE(review): this makes the ground floor indistinguishable from floor 1 —
# confirm that is intended rather than mapping it to 0.
data['Floor'] = data['Floor'].apply(update_floor_value)

# Parse "<floor> out of <total>". The first number is the floor the unit is
# on, the second is the height of the building. Entries without "out of"
# carry only the floor number, so their total is NaN.
floor_parts = data['Floor'].str.extract(r'(?P<floor>\d+)(?: out of (?P<total>\d+))?')

# Store both columns as numbers: comparing the raw strings would be
# lexicographic ('10' < '9') and fails outright when a NaN meets a str.
data['Floor'] = pd.to_numeric(floor_parts['floor'])
data['Total Floors'] = pd.to_numeric(floor_parts['total'])

# Drop rows where either value could not be parsed
data = data.dropna(subset=['Floor', 'Total Floors'])

# Drop inconsistent rows where the unit's floor exceeds the building height
data = data[data['Floor'] <= data['Total Floors']].copy()

# No NaN is left, so the columns can be stored as plain integers
data = data.astype({'Floor': int, 'Total Floors': int})

# Display the modified DataFrame
data.head()

### Conversion of Categorical Variables to One-Hot Encoding

In [None]:
# Keep only the modelling columns: the target 'Rent' plus its predictors.
# Every name appears exactly once — listing a column twice in the selector
# would produce two identically named columns in the resulting frame.
data = data[['BHK',
            'Bathroom',
            'Furnishing Status',
            'Rent',
            'Size',
            'Area Type',
            'City',
            'Point of Contact',
            'Floor',
            'Total Floors',
            'Tenant Preferred'
            ]]

In [None]:
def one_hot_encode(data, column, prefix=None):
    """Replace a categorical column with one-hot indicator columns.

    The first category level is dropped (``drop_first=True``), so a column
    with k distinct values yields k-1 indicator columns, appended after the
    remaining columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : label
        Name of the categorical column to encode.
    prefix : str, optional
        Forwarded to ``pandas.get_dummies``. ``None`` (the default) names the
        indicator columns after the bare category values; pass a string to
        get ``"<prefix>_<value>"`` names and avoid clashes between features.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``column`` removed and the indicator columns added.
        Row count, row order and index are the same as in ``data``.

    Raises
    ------
    ValueError
        If an indicator column name is already present among the remaining
        columns.
    """
    encoded = pd.get_dummies(data[column], drop_first=True, prefix=prefix)
    result = data.drop(column, axis=1)

    clashes = [name for name in encoded.columns if name in result.columns]
    if clashes:
        raise ValueError(
            f"one-hot columns {clashes} already exist in the frame; "
            "pass a prefix to disambiguate"
        )

    # Attach the indicators by position rather than by index label: `encoded`
    # has exactly the rows of `data` in the same order, whereas a label-based
    # join multiplies rows whenever the index contains repeated labels.
    for name in encoded.columns:
        result[name] = encoded[name].to_numpy()
    return result

In [None]:
# One-hot encode every categorical predictor, in a fixed order so the
# resulting column layout is deterministic.
for categorical_column in (
    'Furnishing Status',
    'Area Type',
    'City',
    'Point of Contact',
    'Tenant Preferred',
):
    data = one_hot_encode(data, categorical_column)

In [None]:
# Preview the encoded frame; a few rows are enough to check the new columns
data.head()

In [None]:
# Dealing with Outliers
#
# Tukey's rule on the target: a rent is an outlier when it lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
# NOTE(review): this filter runs before the train/test split, so the test set
# is trimmed as well and the reported scores will look better than they would
# on unfiltered listings.

# First and third quartiles (25th and 75th percentiles) of the rent
Q1 = data['Rent'].quantile(0.25)
Q3 = data['Rent'].quantile(0.75)

# Interquartile Range (IQR)
IQR = Q3 - Q1

# Lower and upper fences for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean mask that is True for rows outside the fences
outliers = (data['Rent'] < lower_bound) | (data['Rent'] > upper_bound)

# Remove outliers; .copy() detaches the result from the original frame
data = data[~outliers].copy()


### Training and Test Split

In [None]:
# Separate the regression target from the predictor matrix
y = data['Rent']
X = data.drop(columns='Rent')

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 42)

### Polynomial Feature Expansion

In [None]:
# PolynomialFeatures is used here in place of StandardScaler. It does not
# rescale anything: it expands the predictors with their squares and pairwise
# products (degree 2) so the linear model can fit non-linear relationships.
#
# include_bias=False: LinearRegression already fits its own intercept, so the
# constant column PolynomialFeatures adds by default would be redundant.
#
# The transformer is fitted on the training split only and then applied to the
# test split. Because X_train / X_test are overwritten, re-running this cell
# without re-running the split would expand the features a second time.
pf = PolynomialFeatures(degree=2, include_bias=False)
X_train = pf.fit_transform(X_train)
X_test = pf.transform(X_test)

## Modelling

In [None]:
# Ordinary least-squares regression fitted on the transformed training features
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

## Evaluation

### Quantitative Evaluation

In [None]:
# Predicted rents for the held-out test rows
y_preds = model.predict(X_test)

In [None]:
# Test-set error metrics, computed once and then reported alongside the
# fitted coefficients.
test_mse = mean_squared_error(y_test, y_preds)
test_r2 = r2_score(y_test, y_preds)

print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % test_mse)
print("Coefficient of determination (Polynomial): %.2f" % test_r2)

In [None]:
# R^2 on both splits; a large gap between the two indicates overfitting
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Regression-Training set score: {train_score:.2f}")
print(f"Regression-Test set score: {test_score:.2f}")

In [None]:
# Residuals (actual minus predicted) against the predictions. A well-behaved
# model scatters them evenly around the zero line with no visible trend.
residuals = y_test - y_preds

ax = sns.scatterplot(x=y_preds, y=residuals)
ax.axhline(0, color='red', linestyle='--', linewidth=1)
ax.set(
    title="Residuals vs. Predicted Rent",
    xlabel="Predicted Rent",
    ylabel="Residual (Actual - Predicted)",
)
plt.show()

### Qualitative Evaluation

In [None]:
# First row of the (untransformed) feature frame, used as a worked example.
# NOTE(review): X is the full feature set, so this row may belong to the
# training split rather than the held-out test split.
sample_data = X.iloc[0]
sample_data

In [None]:
# Apply the fitted polynomial expansion to the sample row. `X.iloc[[0]]`
# (list indexer) keeps a one-row DataFrame, so the column names still match
# those the transformer was fitted with; a bare NumPy row would drop them and
# make scikit-learn warn about missing feature names.
# Despite the variable name, the values are expanded, not standardized.
sample_data_standardized = pf.transform(X.iloc[[0]])
sample_data_standardized


In [None]:
# Predict the rent for the single sample row; predict() returns an array, so
# take its only element. (The bare `model_rent_forecast` expression that used
# to follow was a no-op: only a cell's last expression is displayed.)
model_rent_forecast = model.predict(sample_data_standardized)[0]

print(f'Predicted rent by the model: {model_rent_forecast}')
print(f'Actual rent: {y.iloc[0]}')

In [None]:
# Actual vs. predicted rents on the test split. The green points plot the
# actual values against themselves, tracing the y = x diagonal that perfect
# predictions would fall on; the blue points are the model's predictions.
fig, ax = plt.subplots(figsize=(8, 6))

ax.plot(y_test, y_test, 'o', alpha=0.5, color='green', label='Actual Rent Prices')
ax.plot(y_test, y_preds, 'o', alpha=0.5, color='skyblue', label='Predicted Rent Prices')

ax.set_title("Actual vs. Predicted Rent Prices")
ax.set_xlabel("Actual Rent Prices")
ax.set_ylabel("Predicted Rent Prices")

ax.legend()
ax.grid(True)
plt.show()