# **Machine Learning Project -  House Price Prediction :**
- ### Write a program that takes in data on houses as input and uses machine-learning algorithms to predict the retail price of each house. The program should be able to handle a variety of features, such as the number of rooms, size, age, location, etc., and use these features to train multiple machine learning models.
- ### Your task is to compare the accuracy of the different models and identify which model performs best for predicting house prices. You can use metrics such as mean squared error or R-squared value to evaluate the performance of each model.

---
# Importing all required modules :

In [None]:
!pip install scikit-learn
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score



# Reading a CSV file named **"Housing.csv"** and creating a Pandas DataFrame named **'data'** :

In [None]:
# Load the housing dataset into the DataFrame `data`.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm it
# exists in the environment where this notebook is run.
data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Data_Science/Projects/ML Projects/Datasets/Housing.csv',
)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


# Checking more details about our dataframe :

In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
data.shape

(545, 13)

In [None]:
# Column labels of the DataFrame.
data.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [None]:
# Per-column non-null counts and dtypes, plus overall memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


# Removing rows with missing values from the DataFrame **'data'** :

In [None]:
# Discard every row that contains at least one missing value, modifying
# `data` in place.
data.dropna(axis=0, how='any', inplace=True)
# Total count of missing cells left in the whole frame (0 means none remain).
data.isna().sum().sum()

0

# Converting text columns to numerical using one-hot encoding :

In [None]:
# Text-valued columns that must be turned into numeric indicator columns.
categorical_columns = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
    'furnishingstatus',
]
# One-hot encode them as 0/1 integers; drop_first=True omits the first level
# of each column so the remaining indicators are not redundant.
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)
data.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,1,0,0,0,1,1,0,0
1,12250000,8960,4,4,4,3,1,0,0,0,1,0,0,0
2,12250000,9960,3,2,2,2,1,0,1,0,0,1,1,0
3,12215000,7500,4,2,2,3,1,0,1,0,1,1,0,0
4,11410000,7420,4,1,2,2,1,1,1,0,1,0,0,0


# Separating features and target variable :

In [None]:
# Target variable: the house price.
y = data['price']
# Feature matrix: every remaining column except the target and two of the
# encoded indicator columns.
# NOTE(review): 'hotwaterheating_yes' and 'airconditioning_yes' are excluded
# without a stated reason — confirm this is intentional.
x = data.drop(columns=['price', 'hotwaterheating_yes', 'airconditioning_yes'])

# Splitting **'data'** into training and testing sets :


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Creating a dictionary named **'models'** which stores machine learning models with their names as keys and model instances as values :

In [None]:
# Candidate regressors, keyed by the display name used when reporting results.
# The tree-based estimators draw random numbers while fitting, so they are
# seeded with the same random_state as the train/test split; without a seed
# their metrics (and the resulting model ranking) change on every run.
models = {
    'Linear Regression': LinearRegression(),
    # A decision tree regressor predicts a continuous output (here, price).
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
}

 ## Iterating through each machine learning model in the **'models'** dictionary, training it on the training data (x_train, y_train), and then evaluating its performance on the test data (x_test, y_test) :

In [None]:
# Fit each candidate model on the training split and report its metrics on
# the held-out test split.
for model_name, estimator in models.items():
    estimator.fit(x_train, y_train)
    predictions = estimator.predict(x_test)

    # Error and goodness-of-fit measured on the test split.
    test_mse = mean_squared_error(y_test, predictions)
    test_r2 = r2_score(y_test, predictions)
    # For scikit-learn regressors, score() is also R², so this is expected
    # to match test_r2.
    test_score = estimator.score(x_test, y_test)

    print("Model:", model_name)

    # Print whichever fitted attributes the estimator exposes: coefficients
    # for the linear model, feature importances for the tree-based models.
    for label, attribute in (
        ('Coefficients:', 'coef_'),
        ('Feature Importances:', 'feature_importances_'),
    ):
        if hasattr(estimator, attribute):
            print(label, getattr(estimator, attribute))

    print("Score:", test_score)
    print("Mean Squared Error:", test_mse)
    print("R-squared:", test_r2)
    print("---")

Model: Linear Regression
Coefficients: [ 2.54530913e+02  8.24080792e+04  1.14195862e+06  5.36375800e+05
  2.82148773e+05  3.66389442e+05  2.81002998e+05  4.72199464e+05
  6.24141372e+05 -1.95164135e+05 -4.80230164e+05]
Score: 0.5998365952541895
Mean Squared Error: 2022654039310.1807
R-squared: 0.5998365952541895
---
Model: Decision Tree
Feature Importances: [0.51298899 0.04707894 0.17845852 0.06991322 0.0554404  0.0068582
 0.02133838 0.02489106 0.03446936 0.00878138 0.03978155]
Score: 0.45664810053378924
Mean Squared Error: 2746410344344.954
R-squared: 0.45664810053378924
---
Model: Random Forest
Feature Importances: [0.48985952 0.05080551 0.17424428 0.06598262 0.06439444 0.00861611
 0.01937974 0.03741471 0.0330508  0.01781828 0.03843399]
Score: 0.5841594405509738
Mean Squared Error: 2101895319020.6338
R-squared: 0.5841594405509738
---


## The output shows that, in this run, Linear Regression performs best, with the highest R-squared score (≈ 0.60) and the lowest mean squared error, followed by Random Forest and then Decision Tree.
---
- ## Taking a new input as a NumPy array, reshaping it into a single sample for prediction, and then iterating through each model in the **'models'** dictionary to predict the price for that input :

In [None]:
# One house to price, as a single row (shape (1, n_features)). The values
# must follow the column order of the training features `x`.
new_input = np.array([7420, 4, 2, 3, 2, 1, 0, 0, 1, 0, 0]).reshape(1, -1)

# The models were fitted on a DataFrame, so label the sample with the same
# columns. Passing a bare array makes scikit-learn warn that X has no valid
# feature names, and a wrong number of values would go unnoticed until
# predict(); the DataFrame constructor raises immediately on a length mismatch.
new_input = pd.DataFrame(new_input, columns=x.columns)

# Ask every fitted model for its price estimate of this house.
for name, model in models.items():
    price = model.predict(new_input)
    print('Model:', name)
    print('Price:', price)
    print('---')

Model: Linear Regression
Price: [7777310.92242832]
---
Model: Decision Tree
Price: [8400000.]
---
Model: Random Forest
Price: [8044783.6]
---


