In [1]:
# Import Pandas
# To work with DataFrames we use Pandas
# We use pandas for data manipulation, data cleaning and data analysis
# Pandas is utilised to format the data structure. It means it can convert any data structure into a Pandas DataFrame
import pandas as pd

# Import Numpy
# To find Statistical Summaries we use Numpy
import numpy as np

# Import subpackage of matplotlib 
import matplotlib.pyplot as plt

# Import Seaborn
import seaborn as sns

# To Supress Warnings
from warnings import filterwarnings
filterwarnings('ignore')



### Load the dataset

In [2]:
# Load the data and name it as df_house_price
df_house_price = pd.read_csv('house_price.csv')

#display the first five observations using head()
df_house_price.head()

Unnamed: 0,price,resid_area,air_qual,room_num,age,distance,teachers,poor_prop,airport,n_hos_beds,n_hot_rooms,waterbody,rainfall,bus_ter,parks,Sold
0,24.0,32.31,0.538,6.575,65.2,4.35,24.7,4.98,YES,5.48,11.192,River,23,YES,0.049347,0
1,21.6,37.07,0.469,6.421,78.9,4.99,22.2,9.14,NO,7.332,12.1728,Lake,42,YES,0.046146,1
2,34.7,37.07,0.469,7.185,61.1,5.03,22.2,4.03,NO,7.394,101.12,,38,YES,0.045764,0
3,33.4,32.18,0.458,6.998,45.8,6.21,21.3,2.94,YES,9.268,11.2672,Lake,45,YES,0.047151,0
4,36.2,32.18,0.458,7.147,54.2,6.16,21.3,5.33,NO,8.824,11.2896,Lake,55,YES,0.039474,0


## Data Overview 

In [3]:
# Name the Coloumns
df_house_price.columns

Index(['price', 'resid_area', 'air_qual', 'room_num', 'age', 'distance',
       'teachers', 'poor_prop', 'airport', 'n_hos_beds', 'n_hot_rooms',
       'waterbody', 'rainfall', 'bus_ter', 'parks', 'Sold'],
      dtype='object')

**Interpretation**

- `price` : Price of the House. 
- `resid_area` : Resedential Area where my house is located
- `air_qual` : Air Quality near the house
- ``
- ``
  

In [4]:
# Basic Info of the Dataset
df_house_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        506 non-null    float64
 1   resid_area   506 non-null    float64
 2   air_qual     506 non-null    float64
 3   room_num     506 non-null    float64
 4   age          506 non-null    float64
 5   distance     506 non-null    float64
 6   teachers     506 non-null    float64
 7   poor_prop    506 non-null    float64
 8   airport      506 non-null    object 
 9   n_hos_beds   498 non-null    float64
 10  n_hot_rooms  506 non-null    float64
 11  waterbody    351 non-null    object 
 12  rainfall     506 non-null    int64  
 13  bus_ter      506 non-null    object 
 14  parks        506 non-null    float64
 15  Sold         506 non-null    int64  
dtypes: float64(11), int64(2), object(3)
memory usage: 63.4+ KB


**Interpretation**
 - We have 13 Numerical columns & 3 Categorical Columns
 - We have null values in the data
 - We have total 16 columns in the data
  

In [5]:
# Shape of the data
df_house_price.shape

(506, 16)

**Interpretations** 
- We have total 16 columns 
- We have 506 Observations

**Interpretation**
 - We have outliers in the data , most of the outliers are in age coloumn

## Data Preprocessing

### A . Data Cleaning

In [6]:
# Show the First 5 Observations
df_house_price.head()

Unnamed: 0,price,resid_area,air_qual,room_num,age,distance,teachers,poor_prop,airport,n_hos_beds,n_hot_rooms,waterbody,rainfall,bus_ter,parks,Sold
0,24.0,32.31,0.538,6.575,65.2,4.35,24.7,4.98,YES,5.48,11.192,River,23,YES,0.049347,0
1,21.6,37.07,0.469,6.421,78.9,4.99,22.2,9.14,NO,7.332,12.1728,Lake,42,YES,0.046146,1
2,34.7,37.07,0.469,7.185,61.1,5.03,22.2,4.03,NO,7.394,101.12,,38,YES,0.045764,0
3,33.4,32.18,0.458,6.998,45.8,6.21,21.3,2.94,YES,9.268,11.2672,Lake,45,YES,0.047151,0
4,36.2,32.18,0.458,7.147,54.2,6.16,21.3,5.33,NO,8.824,11.2896,Lake,55,YES,0.039474,0


In [7]:
# Rename the Columns
# Step 1 : Show the Column Names
df_house_price.columns

Index(['price', 'resid_area', 'air_qual', 'room_num', 'age', 'distance',
       'teachers', 'poor_prop', 'airport', 'n_hos_beds', 'n_hot_rooms',
       'waterbody', 'rainfall', 'bus_ter', 'parks', 'Sold'],
      dtype='object')

In [8]:
# Step 2 : Allocate ew column names to Old Column Names
# Make the Dictionary {'Old Column Name' : 'New Column Name'}
d1 = {'price' : 'Price',
      'resid_area' : 'Residential Area',
      'air_qual' : 'Air Quality',
      'room_num' : 'Room Numbers',
      'age' : 'Age',
      'distance' : 'Distance',
      'teachers' : 'Teachers',
      'poor_prop' : 'Proportion of Poor Population',
      'airport' : 'Airport',
      'n_hos_beds' : 'Number of Hospital Beds',
      'n_hot_rooms' : 'Number of Hotel Rooms',
      'waterbody' : 'Waterbody',
      'rainfall' : 'Rainfall',
      'bus_ter' : 'Bus Terminal',
      'parks' : 'Parks',
      'Sold' : 'Sold'}


In [9]:
df_house_price = df_house_price.rename(columns = d1)

In [10]:
# Authenticate the Results
df_house_price.columns

Index(['Price', 'Residential Area', 'Air Quality', 'Room Numbers', 'Age',
       'Distance', 'Teachers', 'Proportion of Poor Population', 'Airport',
       'Number of Hospital Beds', 'Number of Hotel Rooms', 'Waterbody',
       'Rainfall', 'Bus Terminal', 'Parks', 'Sold'],
      dtype='object')

### B. Null Value Handling

```
Laws that we need to follow while processing the null values (Specially for Manufacturing)
 - When we have less than 15% data missing -> Remove the observation
 - Between 20% to 70% data missing -> Impute the null values
 - Greater than 75% Values missing -> Remove the Column
```

In [11]:
# Find the Valid Null Values
df_house_price.isnull().sum()

Price                              0
Residential Area                   0
Air Quality                        0
Room Numbers                       0
Age                                0
Distance                           0
Teachers                           0
Proportion of Poor Population      0
Airport                            0
Number of Hospital Beds            8
Number of Hotel Rooms              0
Waterbody                        155
Rainfall                           0
Bus Terminal                       0
Parks                              0
Sold                               0
dtype: int64

In [12]:
# Percentage of Missing Values
round(df_house_price.isnull().sum() / df_house_price.shape[0] * 100),3

(Price                             0.0
 Residential Area                  0.0
 Air Quality                       0.0
 Room Numbers                      0.0
 Age                               0.0
 Distance                          0.0
 Teachers                          0.0
 Proportion of Poor Population     0.0
 Airport                           0.0
 Number of Hospital Beds           2.0
 Number of Hotel Rooms             0.0
 Waterbody                        31.0
 Rainfall                          0.0
 Bus Terminal                      0.0
 Parks                             0.0
 Sold                              0.0
 dtype: float64,
 3)

In [13]:
# Find the Nature of the Column 
df_house_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Price                          506 non-null    float64
 1   Residential Area               506 non-null    float64
 2   Air Quality                    506 non-null    float64
 3   Room Numbers                   506 non-null    float64
 4   Age                            506 non-null    float64
 5   Distance                       506 non-null    float64
 6   Teachers                       506 non-null    float64
 7   Proportion of Poor Population  506 non-null    float64
 8   Airport                        506 non-null    object 
 9   Number of Hospital Beds        498 non-null    float64
 10  Number of Hotel Rooms          506 non-null    float64
 11  Waterbody                      351 non-null    object 
 12  Rainfall                       506 non-null    int

In [14]:
# This is a Categorical Column so we can replace the null with the mode
df_house_price.Waterbody.mode()


0    River
Name: Waterbody, dtype: object

In [15]:
# Replace the Null Values
df_house_price.Waterbody =  df_house_price.Waterbody.fillna('River')

In [16]:
round(df_house_price.isnull().sum() / df_house_price.shape[0] * 100),3

(Price                            0.0
 Residential Area                 0.0
 Air Quality                      0.0
 Room Numbers                     0.0
 Age                              0.0
 Distance                         0.0
 Teachers                         0.0
 Proportion of Poor Population    0.0
 Airport                          0.0
 Number of Hospital Beds          2.0
 Number of Hotel Rooms            0.0
 Waterbody                        0.0
 Rainfall                         0.0
 Bus Terminal                     0.0
 Parks                            0.0
 Sold                             0.0
 dtype: float64,
 3)

In [17]:
# In the Column 'Number of Hospital Beds' so we can drop the null values
df_house_price.dropna(inplace=True)

In [18]:
round(df_house_price.isnull().sum() / df_house_price.shape[0] * 100),3

(Price                            0.0
 Residential Area                 0.0
 Air Quality                      0.0
 Room Numbers                     0.0
 Age                              0.0
 Distance                         0.0
 Teachers                         0.0
 Proportion of Poor Population    0.0
 Airport                          0.0
 Number of Hospital Beds          0.0
 Number of Hotel Rooms            0.0
 Waterbody                        0.0
 Rainfall                         0.0
 Bus Terminal                     0.0
 Parks                            0.0
 Sold                             0.0
 dtype: float64,
 3)

In [19]:
# Seperate the Target Variable and the Independent Variables
target = df_house_price['Sold']



In [20]:
# Independent Variables
df_indpendent = df_house_price.drop('Sold', axis = 1)