## Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Data Cleaning And Preprocessing

### Loading the dataset

In [2]:
df = pd.read_csv('csvdata.csv')

### Displaying Data and info

In [4]:
print(df.head())
print(df.info())

   Unnamed: 0       City     Price  Area                         Location  \
0           0  Bangalore  30000000  3340                 JP Nagar Phase 1   
1           1  Bangalore   7888000  1045       Dasarahalli on Tumkur Road   
2           2  Bangalore   4866000  1179  Kannur on Thanisandra Main Road   
3           3  Bangalore   8358000  1675                     Doddanekundi   
4           4  Bangalore   6845000  1670                          Kengeri   

   No. of Bedrooms  
0                4  
1                2  
2                2  
3                3  
4                3  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29135 entries, 0 to 29134
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       29135 non-null  int64 
 1   City             29135 non-null  object
 2   Price            29135 non-null  int64 
 3   Area             29135 non-null  int64 
 4   Location         29135 non-n

### Checking for null values

In [5]:
# Check for missing values
print(df.isnull().sum())

Unnamed: 0         0
City               0
Price              0
Area               0
Location           0
No. of Bedrooms    0
dtype: int64


### Spliting dataset into X and Y

In [7]:
# Separate features and target variable
X = df.drop(['Price'], axis=1)
y = df['Price']
print ('Values of X:\n',X)
print ('\n\n Values of Y:\n',y)

Values of X:
        Unnamed: 0       City  Area                         Location  \
0               0  Bangalore  3340                 JP Nagar Phase 1   
1               1  Bangalore  1045       Dasarahalli on Tumkur Road   
2               2  Bangalore  1179  Kannur on Thanisandra Main Road   
3               3  Bangalore  1675                     Doddanekundi   
4               4  Bangalore  1670                          Kengeri   
...           ...        ...   ...                              ...   
29130        7714     Mumbai  1180                   Mira Road East   
29131        7715     Mumbai   530                     Naigaon East   
29132        7716     Mumbai   700                         Shirgaon   
29133        7717     Mumbai   995                   Mira Road East   
29134        7718     Mumbai  1020                   Mira Road East   

       No. of Bedrooms  
0                    4  
1                    2  
2                    2  
3                    3  
4       

### Data Preprocessing

In [9]:
# Handle missing values and preprocess data
numeric_features = ['Area', 'No. of Bedrooms']
categorical_features = ['City', 'Location']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Apply transformations to numeric features
X[numeric_features] = numeric_transformer.fit_transform(X[numeric_features])

# Label encoding for categorical features
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    X[feature] = le.fit_transform(X[feature])
    label_encoders[feature] = le

# Verify the transformations
print(X.head())

   Unnamed: 0  City      Area  Location  No. of Bedrooms
0           0     0  2.654406       595         1.923170
1           1     0 -0.334462       374        -0.512879
2           2     0 -0.159949       698        -0.512879
3           3     0  0.486012       398         0.705146
4           4     0  0.479500       731         0.705146


## Model Training

### Spliting data into train set and test set

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Linear Regression

In [11]:
# Train the model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predict on the test set
y_pred_linear = linear_model.predict(X_test)

### Random Forest Regressor

In [12]:
# Train a Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set with the Random Forest model
y_pred_rf = rf_model.predict(X_test)

### Gradient Boosting Regressor

In [13]:
# Train a Gradient Boosting model
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

# Predict on the test set with the Gradient Boosting model
y_pred_gb = gb_model.predict(X_test)

## Model Evaluation And Fine Tunning

### Evaluating Linear Regression

In [14]:
# Evaluate Linear Regression model
mae_linear = mean_absolute_error(y_test, y_pred_linear)
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

print(f"Linear Regression - Mean Absolute Error: {mae_linear}")
print(f"Linear Regression - Mean Squared Error: {mse_linear}")
print(f"Linear Regression - R-squared: {r2_linear}")

Linear Regression - Mean Absolute Error: 8579466.740569286
Linear Regression - Mean Squared Error: 703275791086038.1
Linear Regression - R-squared: 0.052475605527863345


### Evaluating Random Forest Regressor

In [15]:
# Evaluate Random Forest model
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest - Mean Absolute Error: {mae_rf}")
print(f"Random Forest - Mean Squared Error: {mse_rf}")
print(f"Random Forest - R-squared: {r2_rf}")

Random Forest - Mean Absolute Error: 7454872.957259311
Random Forest - Mean Squared Error: 669742602080774.2
Random Forest - R-squared: 0.09765491499601087


### Evaluating Gradient Boosting Regressor

In [16]:
# Evaluate Gradient Boosting model
mae_gb = mean_absolute_error(y_test, y_pred_gb)
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print(f"Gradient Boosting - Mean Absolute Error: {mae_gb}")
print(f"Gradient Boosting - Mean Squared Error: {mse_gb}")
print(f"Gradient Boosting - R-squared: {r2_gb}")

Gradient Boosting - Mean Absolute Error: 7959380.515222927
Gradient Boosting - Mean Squared Error: 715306314890397.2
Gradient Boosting - R-squared: 0.03626686504881926
