In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(style="whitegrid")
from sklearn.model_selection import train_test_split

In [2]:
# Load the rental listings from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='dubai_properties.csv')

In [3]:
# Report how many missing values each column contains
print(df.isna().sum())

Address                     0
Rent                        0
Beds                        0
Baths                       0
Type                        0
Area_in_sqft                0
Rent_per_sqft               0
Rent_category               0
Frequency                   0
Furnishing                  0
Purpose                     0
Posted_date                 0
Age_of_listing_in_days      0
Location                    0
City                        0
Latitude                  719
Longitude                 719
dtype: int64


In [4]:
# Drop rows where 'Latitude' or 'Longitude' is missing.
# The result is rebound instead of using inplace=True, so the step is an
# ordinary expression that can be chained and does not mutate the loaded frame object.
df = df.dropna(subset=['Latitude', 'Longitude'])

# Verify the shape of the DataFrame after removing null values
print("Data shape after removing null values in Latitude and Longitude columns:", df.shape)


Data shape after removing null values in Latitude and Longitude columns: (73023, 17)


In [6]:
# Correlation of each numeric column with the target column 'Rent'.
# 'Rent' is part of the list, so its own entry is the trivial self-correlation.
corr_columns = [
    'Rent',
    'Beds',
    'Baths',
    'Area_in_sqft',
    'Rent_per_sqft',
    'Age_of_listing_in_days',
    'Latitude',
    'Longitude',
]
numeric_subset = df[corr_columns]
correlation = numeric_subset.corrwith(df['Rent'])

print("Correlation with 'Rents':")
print(correlation)

Correlation with 'Rents':
Rent                      1.000000
Beds                      0.311179
Baths                     0.080013
Area_in_sqft              0.410185
Rent_per_sqft             0.348614
Age_of_listing_in_days    0.002431
Latitude                  0.027837
Longitude                 0.007521
dtype: float64


In [8]:
import pandas as pd

# Columns that are screened for outliers
columns = [
    'Rent',
    'Beds',
    'Baths',
    'Area_in_sqft',
    'Rent_per_sqft',
]

# Function to remove outliers in a dataframe
def remove_outliers(df, columns, factor=1.5):
    """Drop rows whose values fall outside IQR-based fences.

    Columns are handled one after another. The quartiles of each column are
    computed on the rows that survived the filtering of the previous columns,
    so the result can depend on the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. The frame passed in is not modified.
    columns : iterable of str
        Names of the columns to screen.
    factor : float, default 1.5
        Multiple of the interquartile range added above Q3 and subtracted
        below Q1 to obtain the fences.

    Returns
    -------
    pandas.DataFrame
        The rows that passed every column's check. When ``columns`` is empty
        the input frame is returned unchanged.
    """
    filtered = df
    for column in columns:
        q1 = filtered[column].quantile(0.25)
        q3 = filtered[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        # Keep rows inside [lower_bound, upper_bound] (both ends inclusive).
        # This is wider than the interquartile range itself; missing values
        # fail the comparison and are dropped.
        filtered = filtered[filtered[column].between(lower_bound, upper_bound)]

    return filtered

# Screen the selected columns and store the filtered rows under a new name
df_clean = remove_outliers(df=df, columns=columns)

# Compare the frame dimensions before and after outlier removal
print("Original DataFrame shape:", df.shape)
print("Cleaned DataFrame shape:", df_clean.shape)


Original DataFrame shape: (73023, 17)
Cleaned DataFrame shape: (56776, 17)


In [9]:
# Predictors and target for the rent models.
# NOTE(review): 'Rent_per_sqft' looks like it may be derived from 'Rent'
# (rent divided by area) -- if so it leaks the target into the features;
# confirm before trusting the scores below.
feature_columns = ['Beds', 'Baths', 'Area_in_sqft', 'Rent_per_sqft']
x = df_clean[feature_columns]
y = df_clean['Rent']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each partition
print("Shape of X_train:", x_train.shape)
print("Shape of X_test:", x_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (45420, 4)
Shape of X_test: (11356, 4)
Shape of y_train: (45420,)
Shape of y_test: (11356,)


In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Linear regression baseline.
# x_train / y_train (training split) and x_test / y_test (held-out split)
# are expected to exist already.

# Create the model and train it in one step (fit returns the estimator itself)
model = LinearRegression().fit(x_train, y_train)

# Predictions for the held-out features
y_pred = model.predict(x_test)

# Error metrics on the held-out split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 524474821.4641857
R^2 Score: 0.8808222226092947


In [13]:
# Cross-validated comparison of the linear models

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
import warnings

linreg = LinearRegression()
ridge = Ridge()
lasso = Lasso()

models = [linreg, ridge, lasso]
model_name = ["LinearRegression", "Ridge", "Lasso"]

# Metrics evaluated on every fold. Passing both to a single cross_validate
# call fits each estimator once per fold instead of once per fold per metric.
scores = ['neg_root_mean_squared_error', "r2"]

average_rmse = []
average_r2_score = []
# The loop variable is deliberately not named `model`, so a model fitted in
# an earlier cell is not overwritten by an unfitted estimator.
for estimator, name in zip(models, model_name):
    cv_results = cross_validate(estimator, x_train, y_train, cv=5, scoring=scores)
    # The RMSE scorer reports negated values; multiply by -1 to show plain RMSE.
    cv_score_rmse = cv_results['test_neg_root_mean_squared_error']
    cv_score_r2 = cv_results['test_r2']
    average_rmse.append(np.mean(-1 * cv_score_rmse))
    average_r2_score.append(np.mean(cv_score_r2))
    print("RMSE scores of {} are {}".format(name, -1 * cv_score_rmse))
    print("R2 scores of {} are {}".format(name, cv_score_r2))
    print("-" * 100)
    

RMSE scores of LinearRegression are [23097.98418442 23379.83299032 23302.01538419 22875.38726464
 22809.78310681]
R2 scores of LinearRegression are [0.88193341 0.87832261 0.87776111 0.88073267 0.87992311]
----------------------------------------------------------------------------------------------------
RMSE scores of Ridge are [23097.99406146 23379.84113813 23302.00840692 22875.3821682
 22809.7766647 ]
R2 scores of Ridge are [0.88193331 0.87832253 0.87776118 0.88073272 0.87992318]
----------------------------------------------------------------------------------------------------
RMSE scores of Lasso are [23098.01713695 23379.86243033 23301.99153028 22875.36910533
 22809.76338374]
R2 scores of Lasso are [0.88193307 0.8783223  0.87776136 0.88073286 0.87992332]
----------------------------------------------------------------------------------------------------


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Gradient boosting regressor with fixed hyperparameters and seed.
# NOTE(review): an identical cell appears later in this notebook but shows
# different recorded metrics, so one of the two was presumably run against
# other data -- confirm and drop the stale copy.
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=0,
)

# Train on the training split
model.fit(x_train, y_train)

# Predict the target for the held-out rows
predictions = model.predict(x_test)

# Score the predictions against the true values
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)


Mean Squared Error: 2806846335.588547
R^2 Score: 0.9640235249024395


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Build and train the gradient boosting model in one step
# (fit returns the estimator itself)
model = GradientBoostingRegressor(
    random_state=0, max_depth=3, learning_rate=0.1, n_estimators=100
).fit(x_train, y_train)

# Held-out predictions and their error metrics
predictions = model.predict(x_test)
mse, r2 = mean_squared_error(y_test, predictions), r2_score(y_test, predictions)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)


Mean Squared Error: 15650147.46399403
R^2 Score: 0.9964437762991394
