In [5]:
# Loading the required libraries 
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Importing the csv file 
flats_df = pd.read_csv('/Users/guliaharsh021/Downloads/DA Documents /Projects/Project 1/Data Prepration, Processing and Analysis/Data Exploration and Cleaning/Cleaned Data/flats_data_cleaned.csv')

# Selecting the required columns
flats_df = flats_df[['Property Type', 'City/Locality', 'BHK', 'Property Size (sqft)', 'Furnishing', 'Price (INR)', 'Price (per sqft)']].reset_index(drop=True)

# Handling categorical data and missing values
flats_df = flats_df.dropna()

# Handling categorical data and missing values
flats_df = flats_df.dropna()

# removing the outliers 
q1 = flats_df['Property Size (sqft)'].quantile(0.25)
q3 = flats_df['Property Size (sqft)'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
flats_df = flats_df[(flats_df['Property Size (sqft)'] >= lower_bound) & (flats_df['Property Size (sqft)'] <= upper_bound)]



# Splitting the features and target variable
X = flats_df[['Property Type', 'City/Locality', 'BHK', 'Furnishing']]
y = flats_df['Price (INR)']

# Preprocessing the data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['BHK']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Property Type', 'City/Locality', 'Furnishing'])
    ])

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Multiple Linear Regression
linear_model = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', LinearRegression())])

linear_model.fit(X_train, y_train)
y_train_pred = linear_model.predict(X_train)
y_test_pred = linear_model.predict(X_test)

print("Multiple Linear Regression:")
print(f"Train R^2: {r2_score(y_train, y_train_pred)}")
print(f"Train MSE: {mean_squared_error(y_train, y_train_pred)}")
print(f"Test R^2: {r2_score(y_test, y_test_pred)}")
print(f"Test MSE: {mean_squared_error(y_test, y_test_pred)}")

# Polynomial Regression
poly_features = PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(preprocessor.fit_transform(X_train))
X_test_poly = poly_features.transform(preprocessor.transform(X_test))

poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)
y_train_poly_pred = poly_model.predict(X_train_poly)
y_test_poly_pred = poly_model.predict(X_test_poly)

print("Polynomial Regression:")
print(f"Train R^2: {r2_score(y_train, y_train_poly_pred)}")
print(f"Train MSE: {mean_squared_error(y_train, y_train_poly_pred)}")
print(f"Test R^2: {r2_score(y_test, y_test_poly_pred)}")
print(f"Test MSE: {mean_squared_error(y_test, y_test_poly_pred)}")

# Ridge Regression with Grid Search
ridge_model = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', Ridge())])

param_grid = {'regressor__alpha': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_ridge_model = grid_search.best_estimator_
y_train_ridge_pred = best_ridge_model.predict(X_train)
y_test_ridge_pred = best_ridge_model.predict(X_test)

print("Ridge Regression with Grid Search:")
print(f"Best alpha: {grid_search.best_params_['regressor__alpha']}")
print(f"Train R^2: {r2_score(y_train, y_train_ridge_pred)}")
print(f"Train MSE: {mean_squared_error(y_train, y_train_ridge_pred)}")
print(f"Test R^2: {r2_score(y_test, y_test_ridge_pred)}")
print(f"Test MSE: {mean_squared_error(y_test, y_test_ridge_pred)}")

# Cross-Validation Scores
linear_cv_scores = cross_val_score(linear_model, X, y, cv=5, scoring='neg_mean_squared_error')
ridge_cv_scores = cross_val_score(best_ridge_model, X, y, cv=5, scoring='neg_mean_squared_error')

print("Cross-Validation Scores:")
print(f"Linear Regression CV MSE: {-np.mean(linear_cv_scores)}")
print(f"Ridge Regression CV MSE: {-np.mean(ridge_cv_scores)}")

Multiple Linear Regression:
Train R^2: 0.28863248601563085
Train MSE: 441526928871719.25
Test R^2: 0.2607011235488309
Test MSE: 456583057522859.8
Polynomial Regression:
Train R^2: 0.3754436787906006
Train MSE: 387645526384079.94
Test R^2: 0.3189051856884484
Test MSE: 420636852951950.6
Ridge Regression with Grid Search:
Best alpha: 1
Train R^2: 0.2880890183829007
Train MSE: 441864244802097.56
Test R^2: 0.2619903525394979
Test MSE: 455786843524495.0
Cross-Validation Scores:
Linear Regression CV MSE: 530422166123366.1
Ridge Regression CV MSE: 529387290826396.3


In [None]:
# One-hot encode categorical variables
encoded_df = pd.get_dummies(flats_df, columns=['Property Type', 'City/Locality', 'Furnishing'])

# Select numerical columns and append the encoded categorical columns
numerical_df = encoded_df[['Property Size (sqft)', 'Price (INR)', 'Price (per sqft)', 'BHK']]
flats_df = pd.concat([numerical_df, encoded_df.drop(columns=['Property Size (sqft)', 'Price (INR)', 'Price (per sqft)', 'BHK'])], axis=1)

# printing the final dataframe 
flats_df.head()