# Linear Regression

# Housing Price Dataset

### Load Data

In [2]:
# import the needed libraries
import pandas as pd     # Data manipulation and analysis
import numpy as np      # Numerical operations and working with arrays
import matplotlib.pyplot as plt  # Data visualization with basic plots
import seaborn as sns   # Statistical data visualization, built on top of matplotlib
from sklearn.model_selection import train_test_split  # Splitting datasets into training and testing sets
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # Scaling features and encoding categorical variables
from sklearn.linear_model import LinearRegression  # Implementing linear regression model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score  # Evaluating regression model performance

In [3]:
# import the 'os' module to read the csv file 
import os
print(os.getcwd())

C:\Users\jahan


In [4]:
# Load the Housing data (csv file)
# Use a raw string: Prefix the file path with r to treat the backslashes as literal characters.
data = pd.read_csv(r'C:\Users\jahan\OneDrive\Documents\TeachingMachineLearning_CityColleges\data sets\Housing.csv')

### Initialize Exploration 

In [5]:
# step 1:
# checking the data structure with data.info()
print("Data Information:")
print(data.info())

Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
None


In [6]:
# step 2: Understanding Data Statistics with data.describe()
print("Data Description:")
print(data.describe())

Data Description:
              price          area    bedrooms   bathrooms     stories  \
count  5.450000e+02    545.000000  545.000000  545.000000  545.000000   
mean   4.766729e+06   5150.541284    2.965138    1.286239    1.805505   
std    1.870440e+06   2170.141023    0.738064    0.502470    0.867492   
min    1.750000e+06   1650.000000    1.000000    1.000000    1.000000   
25%    3.430000e+06   3600.000000    2.000000    1.000000    1.000000   
50%    4.340000e+06   4600.000000    3.000000    1.000000    2.000000   
75%    5.740000e+06   6360.000000    3.000000    2.000000    2.000000   
max    1.330000e+07  16200.000000    6.000000    4.000000    4.000000   

          parking  
count  545.000000  
mean     0.693578  
std      0.861586  
min      0.000000  
25%      0.000000  
50%      0.000000  
75%      1.000000  
max      3.000000  


In [7]:
data.describe(include=["O"])

Unnamed: 0,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,furnishingstatus
count,545,545,545,545,545,545,545
unique,2,2,2,2,2,2,3
top,yes,no,no,no,no,no,semi-furnished
freq,468,448,354,520,373,417,227


### Handling Missing Values

In [7]:
# step 1. introduce the problem of missing values: the missing values can lead to biased results or errors in analysis if not addressed properly.
# Handling missing values are essential in data preprocessing.

In [8]:
# step 2: Detecting missing values by using .isnull.sum()
# .isnull().sum(): checks missing values in each column

print("Missing values in each column:")
print(data.isnull().sum())

Missing values in each column:
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [9]:
# if there is any missing value: 
# step 3.
 #use .dropna() function to remove all rows with missing values.
   # 1. data = data.dropna()
   # 2. we use this method when the number of missing values is small.
# fillna(); Imputing missing values: 
         # 1. Filling Missing values;
         # 2. Filling with median or mode as .median() or mode()[0].

### Encoding categorical variables (if necessary)

In [10]:
# encoding categorical variables into a numerical format
# In this dataset, we have some categorical variables like: mainroad, guestroom, basement, hotwaterheating, airconditioning, prefarea, furnishingstatus

In [11]:
# step 1: why we are using One-Hot encoding:
#One-Hot Encoding: Best for binary and nominal categorical variables, which covers all the categorical variables in this dataset.
# Label/Ordinal Encoding: Not needed, as no ordinal variables are present in Housing dataset.

In [10]:
# step 2: Using One-Hot Encoding with pd.get_dummies()
# using that function creates new binary (0 or 1)columns for each category in a categorical variables.

data = pd.get_dummies(data, drop_first=True)
print(data)

        price  area  bedrooms  bathrooms  stories  parking  mainroad_yes  \
0    13300000  7420         4          2        3        2          True   
1    12250000  8960         4          4        4        3          True   
2    12250000  9960         3          2        2        2          True   
3    12215000  7500         4          2        2        3          True   
4    11410000  7420         4          1        2        2          True   
..        ...   ...       ...        ...      ...      ...           ...   
540   1820000  3000         2          1        1        2          True   
541   1767150  2400         3          1        1        0         False   
542   1750000  3620         2          1        1        0          True   
543   1750000  2910         3          1        1        0         False   
544   1750000  3850         3          1        2        0          True   

     guestroom_yes  basement_yes  hotwaterheating_yes  airconditioning_yes  \
0        

In [13]:
# what is drop_first=True? using this code avoids multicollinearity.

### Seperating features and target

In [14]:
# step 1: Seperating features and target
  # features(X): the input data to make predictions. (independent variables)
    # target(y): the output data that you are trying to predict. (dependent variable)

In [11]:
# step 2: separate the features(x)
 # syntax: data.drop('target_column', axis =1): 
    # the drop() function is used to remove the target column from the dataset.
    # the argument axis = 1 specifies that we want to drop a column not raw. 
    
X = data.drop('price', axis =1)

In [12]:
 # step 3: seperating the target variable(y)
    #syntax: y = data['target_column']
    
 y = data['price']   

In [13]:
print(X)

     area  bedrooms  bathrooms  stories  parking  mainroad_yes  guestroom_yes  \
0    7420         4          2        3        2          True          False   
1    8960         4          4        4        3          True          False   
2    9960         3          2        2        2          True          False   
3    7500         4          2        2        3          True          False   
4    7420         4          1        2        2          True           True   
..    ...       ...        ...      ...      ...           ...            ...   
540  3000         2          1        1        2          True          False   
541  2400         3          1        1        0         False          False   
542  3620         2          1        1        0          True          False   
543  2910         3          1        1        0         False          False   
544  3850         3          1        2        0          True          False   

     basement_yes  hotwater

In [14]:
print(y) # this is the output that our machine learning model will try to predict based on features.

0      13300000
1      12250000
2      12250000
3      12215000
4      11410000
         ...   
540     1820000
541     1767150
542     1750000
543     1750000
544     1750000
Name: price, Length: 545, dtype: int64


In [18]:
# step 4: understand why we do this

In [15]:
data.columns # name of columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking',
       'mainroad_yes', 'guestroom_yes', 'basement_yes', 'hotwaterheating_yes',
       'airconditioning_yes', 'prefarea_yes',
       'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished'],
      dtype='object')

In [16]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   price                            545 non-null    int64
 1   area                             545 non-null    int64
 2   bedrooms                         545 non-null    int64
 3   bathrooms                        545 non-null    int64
 4   stories                          545 non-null    int64
 5   parking                          545 non-null    int64
 6   mainroad_yes                     545 non-null    bool 
 7   guestroom_yes                    545 non-null    bool 
 8   basement_yes                     545 non-null    bool 
 9   hotwaterheating_yes              545 non-null    bool 
 10  airconditioning_yes              545 non-null    bool 
 11  prefarea_yes                     545 non-null    bool 
 12  furnishingstatus_semi-furnished  545 non-null    b

In [17]:
data['furnishingstatus_semi-furnished'].sum() # number of furnishingstatu_semifurnished

227

In [18]:
data['furnishingstatus_semi-furnished'].value_counts()
# this displays how many times each furnishing status appears in the dataset

furnishingstatus_semi-furnished
False    318
True     227
Name: count, dtype: int64

#### step 5: Accessing a single column or multiple columns from the dataframe:

In [19]:
# syntax for single column: column_data = data['column_name']
bedrooms_column_data = data['bedrooms']
print('bedrooms_column_data')
print(bedrooms_column_data)

bedrooms_column_data
0      4
1      4
2      3
3      4
4      4
      ..
540    2
541    3
542    2
543    3
544    3
Name: bedrooms, Length: 545, dtype: int64


In [20]:
#syntax for multiple columns: 
selected_columns = data[['price', 'area', 'bedrooms']]
print('selected_columns:')
print(selected_columns)

selected_columns:
        price  area  bedrooms
0    13300000  7420         4
1    12250000  8960         4
2    12250000  9960         3
3    12215000  7500         4
4    11410000  7420         4
..        ...   ...       ...
540   1820000  3000         2
541   1767150  2400         3
542   1750000  3620         2
543   1750000  2910         3
544   1750000  3850         3

[545 rows x 3 columns]


#### step 6: Dropping specific columns

In [21]:
# dropping a single column
 # axis = 1: it means you are removing a column not a row

data_without_price = data.drop('price', axis =1)
print('data_without_price:')
print(data_without_price)

data_without_price:
     area  bedrooms  bathrooms  stories  parking  mainroad_yes  guestroom_yes  \
0    7420         4          2        3        2          True          False   
1    8960         4          4        4        3          True          False   
2    9960         3          2        2        2          True          False   
3    7500         4          2        2        3          True          False   
4    7420         4          1        2        2          True           True   
..    ...       ...        ...      ...      ...           ...            ...   
540  3000         2          1        1        2          True          False   
541  2400         3          1        1        0         False          False   
542  3620         2          1        1        0          True          False   
543  2910         3          1        1        0         False          False   
544  3850         3          1        2        0          True          False   

     ba

In [22]:
# droping multiple columns:
data_without_columns = data.drop(['price', 'area'], axis = 1)

In [23]:
print('data_without_columns(price and area):')
print(data_without_columns)

data_without_columns(price and area):
     bedrooms  bathrooms  stories  parking  mainroad_yes  guestroom_yes  \
0           4          2        3        2          True          False   
1           4          4        4        3          True          False   
2           3          2        2        2          True          False   
3           4          2        2        3          True          False   
4           4          1        2        2          True           True   
..        ...        ...      ...      ...           ...            ...   
540         2          1        1        2          True          False   
541         3          1        1        0         False          False   
542         2          1        1        0          True          False   
543         3          1        1        0         False          False   
544         3          1        2        0          True          False   

     basement_yes  hotwaterheating_yes  airconditioning_yes  

#### Renaming columns

In [24]:
# syntax: data = data.rename(columns = {'old_column_name': 'new_column_name'})
data = data.rename(columns = {'price': 'house_price'})

In [25]:
data

Unnamed: 0,house_price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,True,False,False,False,True,True,False,False
1,12250000,8960,4,4,4,3,True,False,False,False,True,False,False,False
2,12250000,9960,3,2,2,2,True,False,True,False,False,True,True,False
3,12215000,7500,4,2,2,3,True,False,True,False,True,True,False,False
4,11410000,7420,4,1,2,2,True,True,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,2,True,False,True,False,False,False,False,True
541,1767150,2400,3,1,1,0,False,False,False,False,False,False,True,False
542,1750000,3620,2,1,1,0,True,False,False,False,False,False,False,True
543,1750000,2910,3,1,1,0,False,False,False,False,False,False,False,False


In [26]:
data = data.rename(columns = {'price': 'house_price', 'bedrooms': 'number of bedrooms'})
data

Unnamed: 0,house_price,area,number of bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,True,False,False,False,True,True,False,False
1,12250000,8960,4,4,4,3,True,False,False,False,True,False,False,False
2,12250000,9960,3,2,2,2,True,False,True,False,False,True,True,False
3,12215000,7500,4,2,2,3,True,False,True,False,True,True,False,False
4,11410000,7420,4,1,2,2,True,True,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,2,True,False,True,False,False,False,False,True
541,1767150,2400,3,1,1,0,False,False,False,False,False,False,True,False
542,1750000,3620,2,1,1,0,True,False,False,False,False,False,False,True
543,1750000,2910,3,1,1,0,False,False,False,False,False,False,False,False


#### Selecting columns based on data type

In [27]:
# For example suppose that you want to select only the numerical columns or categorical columns
# use the function select_dtypes()

In [17]:
# 1. selecting numerical columns:
numerical_columns = data.select_dtypes(include = ['int64', 'float64'])
numerical_columns

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
0,13300000,7420,4,2,3,2
1,12250000,8960,4,4,4,3
2,12250000,9960,3,2,2,2
3,12215000,7500,4,2,2,3
4,11410000,7420,4,1,2,2
...,...,...,...,...,...,...
540,1820000,3000,2,1,1,2
541,1767150,2400,3,1,1,0
542,1750000,3620,2,1,1,0
543,1750000,2910,3,1,1,0


In [18]:
# 2. selecting categorical columns:
categorical_columns = data.select_dtypes(include=['object']) 
categorical_columns
# Data types that are typically considered object in pandas including:
  # strings (text data)
  # mixed datatypes (columns containing a mix of numbers and strings)
  #Python objects (like lists, dictionaries, or other complex types)

0
1
2
3
4
...
540
541
542
543
544


In [19]:
# check the data types of each column 
data.dtypes

price                              int64
area                               int64
bedrooms                           int64
bathrooms                          int64
stories                            int64
parking                            int64
mainroad_yes                        bool
guestroom_yes                       bool
basement_yes                        bool
hotwaterheating_yes                 bool
airconditioning_yes                 bool
prefarea_yes                        bool
furnishingstatus_semi-furnished     bool
furnishingstatus_unfurnished        bool
dtype: object

#### Reordering Columns

In [39]:
# new_order
new_order = ['area', 'bedrooms', 'price', 'bathrooms']
data_new_order = data[new_order]

In [40]:
print(data_new_order.columns) 
# check the columns after Encoding

Index(['area', 'bedrooms', 'price', 'bathrooms'], dtype='object')


In [41]:
data.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking',
       'mainroad_yes', 'guestroom_yes', 'basement_yes', 'hotwaterheating_yes',
       'airconditioning_yes', 'prefarea_yes',
       'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished'],
      dtype='object')

In [43]:
data_new_order

Unnamed: 0,area,bedrooms,price,bathrooms
0,7420,4,13300000,2
1,8960,4,12250000,4
2,9960,3,12250000,2
3,7500,4,12215000,2
4,7420,4,11410000,1
...,...,...,...,...
540,3000,2,1820000,1
541,2400,3,1767150,1
542,3620,2,1750000,1
543,2910,3,1750000,1


#### Filtering rows based on column values

In [79]:
# Filtering rows where a column meet a condition 
filtered_data = data[data['price'] > 500000]

In [80]:
print(filtered_data)

     area  bedrooms     price  bathrooms
0    7420         4  13300000          2
1    8960         4  12250000          4
2    9960         3  12250000          2
3    7500         4  12215000          2
4    7420         4  11410000          1
..    ...       ...       ...        ...
540  3000         2   1820000          1
541  2400         3   1767150          1
542  3620         2   1750000          1
543  2910         3   1750000          1
544  3850         3   1750000          1

[545 rows x 4 columns]


### Feature Scaling:

In [36]:
# By scaling our features, we ensure that our linear regression model works more efficiently and gives us more accurate predictions. 
# Without scaling, features with larger ranges, like 'area', would dominate the model, and we might end up with skewed results. 
# After scaling, each feature has an equal chance of influencing the model’s predictions.

# two common method to scale the features: Normalization vs. Standardization

#### Normalization

In [20]:
# Normalization rescales the values of the features so that they fall within a specific range, typically between 0 and 1.
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler to the data and transform it
X_normalized = scaler.fit_transform(X)
# I do not run normalization cell, because I have already considered Linear Regression, and in this machine learning, standardization is prefered.
X_normalized

array([[0.39656357, 0.6       , 0.33333333, ..., 1.        , 0.        ,
        0.        ],
       [0.5024055 , 0.6       , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.57113402, 0.4       , 0.33333333, ..., 1.        , 1.        ,
        0.        ],
       ...,
       [0.13539519, 0.2       , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.08659794, 0.4       , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.15120275, 0.4       , 0.        , ..., 0.        , 0.        ,
        1.        ]])

#### Standardization

In [38]:
# Standardization scales the data to have a mean of 0 and a standard deviation of 1. It transforms the data based on the z-score.

In [21]:
#Implementing Standardization in the code
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler to the data and transform it
X_scaled =scaler.fit_transform(X)

In [22]:
print(X_scaled)

[[ 1.04672629  1.40341936  1.42181174 ...  1.80494113 -0.84488844
  -0.6964292 ]
 [ 1.75700953  1.40341936  5.40580863 ... -0.55403469 -0.84488844
  -0.6964292 ]
 [ 2.21823241  0.04727831  1.42181174 ...  1.80494113  1.18358821
  -0.6964292 ]
 ...
 [-0.70592066 -1.30886273 -0.57018671 ... -0.55403469 -0.84488844
   1.43589615]
 [-1.03338891  0.04727831 -0.57018671 ... -0.55403469 -0.84488844
  -0.6964292 ]
 [-0.5998394   0.04727831 -0.57018671 ... -0.55403469 -0.84488844
   1.43589615]]


In [None]:
# Positive values are above the mean, negative values are below the mean, and values close to 0 are near the mean.

In [None]:
# Standardization doesn't limit values to between -1 and 1; it just scales them based on how they compare to the average.

## Split the data into training and testing sets

In [23]:
#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [17]:
# Parameters:
# X_scaled: The scaled features (input data) after standardization.
# y: The target variable (what you are trying to predict, like house prices).
# test_size=0.2: This means 20% of the data will be used for testing, and 80% will be used for training the model.
# random_state=42: This is a seed for randomization. Setting it ensures that the split is the same every time you run the code (for reproducibility).

In [None]:
# Output Variables:
# X_train: The training portion of the input data (80% of X_scaled).
# X_test: The testing portion of the input data (20% of X_scaled).
# y_train: The training portion of the target data (80% of y).
# y_test: The testing portion of the target data (20% of y).

In [24]:
print('x_train:\n',X_train, '\nX_test:\n', X_test, '\ny_train:\n', y_train , '\ny_test:\n', y_test)

x_train:
 [[ 0.3917898   0.04727831  1.42181174 ... -0.55403469 -0.84488844
  -0.6964292 ]
 [ 0.94525725  0.04727831  1.42181174 ... -0.55403469  1.18358821
  -0.6964292 ]
 [-0.61552098 -1.30886273 -0.57018671 ... -0.55403469 -0.84488844
  -0.6964292 ]
 ...
 [-0.30004453  0.04727831  1.42181174 ... -0.55403469 -0.84488844
  -0.6964292 ]
 [-0.51220705 -1.30886273 -0.57018671 ... -0.55403469 -0.84488844
   1.43589615]
 [ 0.16117836  0.04727831  1.42181174 ... -0.55403469  1.18358821
  -0.6964292 ]] 
X_test:
 [[ 0.34566751  1.40341936  1.42181174 ... -0.55403469 -0.84488844
   1.43589615]
 [ 0.62240124  0.04727831  1.42181174 ...  1.80494113 -0.84488844
  -0.6964292 ]
 [-0.51220705 -1.30886273 -0.57018671 ... -0.55403469  1.18358821
  -0.6964292 ]
 ...
 [ 0.3917898   1.40341936  1.42181174 ... -0.55403469  1.18358821
  -0.6964292 ]
 [ 0.3917898   0.04727831  1.42181174 ... -0.55403469  1.18358821
  -0.6964292 ]
 [ 0.43791208  0.04727831  1.42181174 ...  1.80494113 -0.84488844
  -0.6964292

## Train the model

In [25]:
# Linear Regression Model
model = LinearRegression() # the model learns the relationship between the input features and the target, allowing it to make predictions later.
# training the model
model.fit(X_train, y_train) # we are trianing the model on the training data(X_train, y_train)

In [102]:
#from sklearn.tree import DecisionTreeRegressor
#model = DecisionTreeRegressor(max_depth=5, min_samples_split=10)
#model.fit(X_train, y_train) 

## Intercept vs. Coefficients in Linear Regresion

In [26]:
print('Intercept:', model.intercept_) 
#when all the features are 0, the base price is 4737518.175380117.

Intercept: 4737518.175380117


In [None]:
# explanation: when all features (like area, bedrooms, etc) are zero, the base value of the house price is 4,766,729.25 units of currency (e.g., dollars, euros).

In [27]:
print('Coefficient:\n', model.coef_)

# positive coefficient
# negative coefficient

Coefficient:
 [ 511615.56377666   56615.57245779  549420.50124098  353158.42985604
  193542.78167455  128151.92129533   88590.21346152  186194.15050566
  143233.20624958  367817.89491558  267018.66081239  -62550.29721128
 -193987.7810882 ]


## Predict on the test data

Do not predict y_test, if you predict y_test, you will ger an error:

The following error says that the structure of y_test has to get reshaped because: 

        Expected 2D array, got 1D array instead:\narray={}.\n"
        
        "Reshape your data either using array.reshape(-1, 1) if "
        
        "your data has a single feature or array.reshape(1, -1) "
        
         "if it contains a single sample.".format(array)

Generally, y_test shouldn't be predicted. 

y_test is used to evaluate how accurate the predictions are by comparing them to the actual values.

You predict y_pred (the predicted values)
and then compare them with y_test to evaluate your model's accuracy.

In [122]:
#prediction on y_test data causes an error
y_pred = model.predict(y_test) #incorrect

ValueError: Expected 2D array, got 1D array instead:
array=[ 4060000  6650000  3710000  6440000  2800000  4900000  5250000  4543000
  2450000  3353000 10150000  2660000  3360000  3360000  2275000  2660000
  2660000  7350000  2940000  2870000  6720000  5425000  1890000  5250000
  4193000 12250000  3080000  5110000  9800000  2520000  6790000  3500000
  6650000  2940000  3325000  4200000  4900000  3290000  3500000  2380000
  5495000  3675000  6650000  4907000  3150000  4480000  6580000  5740000
  3003000  1820000  8400000  2450000  4270000  4007500  3234000  1750000
  9800000  2100000  4340000  3045000  3850000  3500000  4753000  3080000
  4550000  6510000  6685000  5110000  4550000  6650000  3640000  5600000
  3780000  6615000  3220000  6650000  4690000  4830000  6860000  2233000
  7035000  4165000  6195000  6510000  1890000  8190000  2660000  4193000
 13300000  9681000  4340000  8645000  3703000  5145000  6440000  5950000
  5810000  5740000  6230000  5600000  3010000  8890000  4900000  5530000
  9100000  3773000  7910000  7350000  5530000].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [27]:
# a very important explanation:
  # The model applies the learned relationship between features and the target (from training) to predict the house prices for these unseen test data points.

In [28]:
y_pred = model.predict(X_test) # For model evaluation and testing

In [29]:
y_pred # on X_test

array([5164653.90033967, 7224722.29802166, 3109863.24240338,
       4612075.3272256 , 3294646.25725956, 3532275.09556558,
       5611774.56836474, 6368145.98732718, 2722856.95689986,
       2629405.61585784, 9617039.50315578, 2798087.30447888,
       3171096.76847064, 3394639.09125529, 3681088.65424275,
       5263187.74621486, 3035963.47612386, 4786122.8004005 ,
       4349551.9200572 , 3572362.09930451, 5774875.2139565 ,
       5886993.57919883, 2730836.19518459, 4727316.47323636,
       5244847.52716799, 7555324.21605601, 3220790.84680269,
       5191898.79934207, 8143726.91009782, 3398814.09825036,
       6490693.05027925, 3315105.90747811, 6708457.36761325,
       4201738.21071676, 3557571.06735186, 5836974.50478626,
       4808660.67448475, 4362878.73613262, 3191242.95701509,
       4596554.93225239, 4566042.86048409, 3517779.52374149,
       7205844.79365835, 3983597.27861103, 3749338.70271055,
       4274731.09125895, 6757442.10783741, 4037320.43665851,
       3769334.90397125,

In [61]:
y_test.array.reshape(-1, 1) #the actual value for comparing with y_pred = model.predict(X_test)

<PandasArray>
[
[4060000],
[6650000],
[3710000],
[6440000],
[2800000],
[4900000],
[5250000],
[4543000],
[2450000],
[3353000],
[10150000],
[2660000],
[3360000],
[3360000],
[2275000],
[2660000],
[2660000],
[7350000],
[2940000],
[2870000],
[6720000],
[5425000],
[1890000],
[5250000],
[4193000],
[12250000],
[3080000],
[5110000],
[9800000],
[2520000],
[6790000],
[3500000],
[6650000],
[2940000],
[3325000],
[4200000],
[4900000],
[3290000],
[3500000],
[2380000],
[5495000],
[3675000],
[6650000],
[4907000],
[3150000],
[4480000],
[6580000],
[5740000],
[3003000],
[1820000],
[8400000],
[2450000],
[4270000],
[4007500],
[3234000],
[1750000],
[9800000],
[2100000],
[4340000],
[3045000],
[3850000],
[3500000],
[4753000],
[3080000],
[4550000],
[6510000],
[6685000],
[5110000],
[4550000],
[6650000],
[3640000],
[5600000],
[3780000],
[6615000],
[3220000],
[6650000],
[4690000],
[4830000],
[6860000],
[2233000],
[7035000],
[4165000],
[6195000],
[6510000],
[1890000],
[8190000],
[2660000],
[4193000],
[13300000],
[9

In [126]:
print(X_test.shape)
print(y_test.shape)

(109, 13)
(109,)


In [125]:
print(y_pred.shape)
print(X_test.shape)

(109,)
(109, 13)


# Model Evaluation

In [30]:
y_test

316    4060000
77     6650000
360    3710000
90     6440000
493    2800000
        ...   
15     9100000
357    3773000
39     7910000
54     7350000
155    5530000
Name: price, Length: 109, dtype: int64

In [31]:
y_train

46     7525000
93     6300000
335    3920000
412    3430000
471    3010000
        ...   
71     6755000
106    6160000
270    4340000
435    3290000
102    6195000
Name: price, Length: 436, dtype: int64

In [32]:
# Calculate MSE, MAE, R2, RMSE for training data
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")

MSE: 1754318687330.6682


In [33]:
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")

MAE: 970043.4039201641


In [34]:
r2 = r2_score(y_test, y_pred)
print("R2:", r2)

R2: 0.6529242642153175


In [35]:
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")

RMSE: 1324506.9600914402


In [36]:
# Adjusted R-squared Calculation
n = X_test.shape[0]  # number of samples
p = X_test.shape[1]  # number of predictors (features)
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"Adjusted R-squared: {adjusted_r2}")

Adjusted R-squared: 0.605429689844782


In [38]:
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
r2 = r2_score(y_test, y_pred)
print("R2:", r2)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")
n = X_test.shape[0]  # number of samples
p = X_test.shape[1]  # number of predictors (features)
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(f"Adjusted R-squared: {adjusted_r2}")

MSE: 1754318687330.6682
MAE: 970043.4039201641
R2: 0.6529242642153175
RMSE: 1324506.9600914402
Adjusted R-squared: 0.605429689844782


In [39]:
print("Test Data Metrics:")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")
print(f"RMSE: {rmse}")
print(f"Adjusted R-squared: {adjusted_r2}")

Test Data Metrics:
MSE: 1754318687330.6682
MAE: 970043.4039201641
R2: 0.6529242642153175
RMSE: 1324506.9600914402
Adjusted R-squared: 0.605429689844782


In [95]:
data_dropdup=data.drop_duplicates(inplace=True)
data_dropdup

To check for overfitting and underfitting in the housing dataset (or any other dataset), you need to evaluate the model's performance on both the training data and the testing data.

In [40]:
# Predictions on training data
y_pred_train = model.predict(X_train) # For checking training performance

In [41]:
y_pred_train

array([ 7109165.83869914,  6882951.80434331,  4815249.78634875,
        3990061.64537982,  3238924.34106256,  4213259.45560891,
        3262521.2215167 ,  2776422.43321996,  2329025.57980948,
        3653860.17702972,  4084008.35268491,  3686745.82238176,
        4686517.16289527,  3873464.37706931,  4190240.83409354,
        5869519.40313553,  2431733.15691031,  5789740.18200832,
        3330477.18628315,  7871347.11276757,  6464175.05153336,
        3348710.98229963,  5066745.68311879,  6829041.62324657,
        6441622.72524018,  3215327.46060842,  2359612.68424144,
        3177203.1918561 ,  5577561.12144359,  3119493.38199004,
        4101305.32597451,  2612119.22569568,  3901239.29674703,
        5228986.06476602,  4122403.48334852,  4005975.25747494,
        2429223.48158115,  6896328.29436496,  5159484.34408524,
        6662050.04903443,  6939841.61654542,  5455918.56569496,
        4382276.61245577,  3950285.29106802,  3778024.21928121,
        3318897.97407655,  7075245.51564

In [42]:
mse_train = mean_squared_error(y_train, y_pred_train)
print(f"MSE_train: {mse_train}")
mae_train = mean_absolute_error(y_train, y_pred_train)
print(f"MAE_train: {mae_train}")
r2_train = r2_score(y_train, y_pred_train)
print("R2_train:", r2_train)
rmse_train = np.sqrt(mse_train)
print(f"RMSE_train: {rmse_train}")
n = X_train.shape[0]  # number of samples
p = X_train.shape[1]  # number of predictors (features)
adjusted_r2_train = 1 - (1 - r2_train) * (n - 1) / (n - p - 1)
print(f"Adjusted R-squared_train: {adjusted_r2_train}")

MSE_train: 968358188440.7242
MAE_train: 719242.8936724712
R2_train: 0.6859438988560158
RMSE_train: 984051.9236507412
Adjusted R-squared_train: 0.676269184839732


In [43]:
# Check for overfitting or underfitting
if mse_train < mse:
    print("The model may be overfitting (lower training error, higher test error).")
elif mse_train > mse:
    print("The model may be underfitting (higher training error, lower test error).")
else:
    print("The model generalizes well (similar performance on training and test data).")

The model may be overfitting (lower training error, higher test error).


#### Notice: the code above, correctly detects overfitting based on the difference between training and test MSE values (which is common).

Since the difference is not severe, so the model is not performing poorly.

result shows well generalization of linear regression model in the housing dataset.

In [None]:
# Now I want to provide a program that calculates the difference between the training data metrics and test data metrics to assess overfitting more precisely.

In [44]:
print("Test Data Metrics:")
print(f"MSE: {mse}")
print(f"MSE_train: {mse_train}")
print(f"MAE: {mae}")
print(f"MAE_train: {mae_train}")
print(f"R2: {r2}")
print("R2_train:", r2_train)
print(f"RMSE: {rmse}")
print(f"RMSE_train: {rmse_train}")
print(f"Adjusted R-squared: {adjusted_r2}")
print(f"Adjusted R-squared_train: {adjusted_r2_train}")

Test Data Metrics:
MSE: 1754318687330.6682
MSE_train: 968358188440.7242
MAE: 970043.4039201641
MAE_train: 719242.8936724712
R2: 0.6529242642153175
R2_train: 0.6859438988560158
RMSE: 1324506.9600914402
RMSE_train: 984051.9236507412
Adjusted R-squared: 0.605429689844782
Adjusted R-squared_train: 0.676269184839732


In [47]:
# Assuming these are your metrics from the training and test datasets:
mse_train = 968358188440.7242
mse = 1754318687330.6682

mae_train = 719242.8936724712
mae = 970043.4039201641

rmse_train = 984051.9236507412
rmse = 1324506.9600914402

# Set threshold for acceptable difference (you can adjust this value)
mse_threshold = 0.2  # 20% difference
mae_threshold = 0.2  # 20% difference
rmse_threshold = 0.2  # 20% difference

# This part calculates the percentage difference between the test and training errors for each metric (MSE, MAE, RMSE).
mse_diff = (mse - mse_train) / mse_train
mae_diff = (mae - mae_train) / mae_train
rmse_diff = (rmse - rmse_train) / rmse_train

# Check if the difference exceeds the threshold
if mse_diff > mse_threshold:
    print(f"Overfitting detected based on MSE. MSE difference is {mse_diff*100:.2f}%")
else:
    print(f"MSE difference is within acceptable range. MSE difference is {mse_diff*100:.2f}%")

if mae_diff > mae_threshold:
    print(f"Overfitting detected based on MAE. MAE difference is {mae_diff*100:.2f}%")
else:
    print(f"MAE difference is within acceptable range. MAE difference is {mae_diff*100:.2f}%")

if rmse_diff > rmse_threshold:
    print(f"Overfitting detected based on RMSE. RMSE difference is {rmse_diff*100:.2f}%")
else:
    print(f"RMSE difference is within acceptable range. RMSE difference is {rmse_diff*100:.2f}%")


Overfitting detected based on MSE. MSE difference is 81.16%
Overfitting detected based on MAE. MAE difference is 34.87%
Overfitting detected based on RMSE. RMSE difference is 34.60%


In [None]:
# interpretting: The result of the code shows that your model is indeed overfitting based on the MSE, MAE, and RMSE differences between the training and test sets.
# The high percentage differences (81% for MSE, 34% for MAE, and 34% for RMSE) indicate that overfitting is occurring.

In [None]:
# to reduce overfitting and improve the model's generalization to unseen data, you can apply the following strategies:
# 1. Regularization (L1 or L2)
# 2. Cross-Validation
# 3. Reduce Model Complexity
# 4. Increase Training Data
# 5. Early Stopping (for Iterative Models)

## To address overfitting, first use feature selection method

#### 1. Use techniques like recursive feature elimination (RFE) to reduce the number of features.

Recursive feature elimination (RFE) technique helps us to select the most important features in the dataset, in this case house price dataset.

In other words, we are transforming our dataset using Recursive Feature Elimination (RFE) to reduce the number of features based on their importance.

In [49]:
from sklearn.feature_selection import RFE
selector = RFE(model, n_features_to_select=5)
selector.fit(X_train, y_train)

#### 2. Access the Selected Features:

After running RFE, the selector object will contain information about which features were selected. You can use the support_ attribute to see which features were selected.

In [50]:
# Get the boolean mask indicating which features were selected
selected_features = selector.support_
print("Selected features:", selected_features)

# Alternatively, get the ranking of all features (1 = selected)
feature_ranking = selector.ranking_
print("Feature ranking:", feature_ranking)


Selected features: [ True False  True  True False False False False False  True  True False
 False]
Feature ranking: [1 9 1 1 3 6 7 2 5 1 1 8 4]


### Let's interpret this output:

High rank like 9 means less importance meaning it contributes the less to the model's predictive performance.

Rank 1 means it contributes the high to the model's predictive performance.

In selected features, True means the feature is selected, and False meanse the feature is not selected. 

As you see in the outout, all feature with rank 1 are selected to be eliminated because they have high contribution to the model's predictive performance.

#### 3. Transform the Training and Test Data:

You need to transform both your training and test data to include only the selected features. The selector.transform() method will reduce your datasets to include only the top 5 features selected by RFE.

In [51]:
# Transform X_train and X_test to include only the selected features
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

In [52]:
X_train_selected

array([[ 0.3917898 ,  1.42181174,  2.53202371,  1.4726183 , -0.55403469],
       [ 0.94525725,  1.42181174, -0.92939666,  1.4726183 , -0.55403469],
       [-0.61552098, -0.57018671, -0.92939666,  1.4726183 , -0.55403469],
       ...,
       [-0.30004453,  1.42181174,  1.37821692, -0.67906259, -0.55403469],
       [-0.51220705, -0.57018671, -0.92939666, -0.67906259, -0.55403469],
       [ 0.16117836,  1.42181174,  2.53202371,  1.4726183 , -0.55403469]])

In [53]:
X_test_selected

array([[ 0.34566751,  1.42181174,  0.22441013, -0.67906259, -0.55403469],
       [ 0.62240124,  1.42181174,  1.37821692,  1.4726183 ,  1.80494113],
       [-0.51220705, -0.57018671, -0.92939666, -0.67906259, -0.55403469],
       [-0.06943308, -0.57018671,  0.22441013,  1.4726183 , -0.55403469],
       [-0.54910488, -0.57018671, -0.92939666, -0.67906259, -0.55403469],
       [ 0.72387027, -0.57018671, -0.92939666, -0.67906259, -0.55403469],
       [ 1.55407146, -0.57018671, -0.92939666,  1.4726183 , -0.55403469],
       [-0.07404531,  1.42181174,  0.22441013, -0.67906259,  1.80494113],
       [-0.88118536, -0.57018671, -0.92939666, -0.67906259, -0.55403469],
       [-1.13024571, -0.57018671, -0.92939666, -0.67906259, -0.55403469],
       [ 1.58174483,  3.41381019,  2.53202371,  1.4726183 ,  1.80494113],
       [-0.56109668, -0.57018671, -0.92939666, -0.67906259, -0.55403469],
       [-0.65979837, -0.57018671, -0.92939666,  1.4726183 , -0.55403469],
       [-0.94575656, -0.57018671,  0.2

Now, X_train_selected and X_test_selected will contain only the top 5 features chosen by RFE.

#### 4. Train the Model on the Selected Features:

Use the transformed training data (X_train_selected) to re-train your model.

In [54]:
# Re-train the model using only the selected features
model.fit(X_train_selected, y_train)

#### 5. Predict Using the Test Set:

Use the transformed test set (X_test_selected) to make predictions and evaluate the model’s performance.

In [56]:
# Make predictions using the test data with selected features
y_test_pred = model.predict(X_test_selected)

#### 6. Evaluate the Model:

After making predictions on the test set, evaluate the model using metrics like MSE, MAE, R², etc., to see how well the model performs with the reduced feature set.

In [57]:
from sklearn.metrics import mean_squared_error, r2_score

# Calculate and print evaluation metrics
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Test MSE: {mse_test}")
print(f"Test R²: {r2_test}")

Test MSE: 2153301845033.9512
Test R²: 0.5739891345689093


In [45]:
from sklearn.linear_model import Lasso
# Initialize Lasso with alpha (regularization strength)
lasso = Lasso(alpha=0.1)
# Fit the model
lasso.fit(X_train, y_train)
# Make predictions
y_pred = lasso.predict(X_test)


In [42]:
y_pred

array([5164653.95732913, 7224721.81130069, 3109863.61335579,
       4612075.54582379, 3294646.15787928, 3532275.36045977,
       5611774.30152193, 6368145.37467925, 2722857.25557328,
       2629405.62913828, 9617039.19490542, 2798087.6292849 ,
       3171097.14434603, 3394639.36500698, 3681088.77166169,
       5263187.84354423, 3035963.8089764 , 4786122.92558858,
       4349551.9834689 , 3572362.45318035, 5774875.14460856,
       5886993.49701932, 2730836.52247368, 4727316.32931929,
       5244847.59735763, 7555324.05211551, 3220790.96716643,
       5191898.36862588, 8143726.30209052, 3398813.98699938,
       6490692.96104275, 3315106.11023633, 6708456.91796928,
       4201738.46772225, 3557571.2576798 , 5836974.61721096,
       4808660.66159423, 4362878.77231771, 3191243.29934278,
       4596554.67954893, 4566043.08621509, 3517779.79871808,
       7205844.30763438, 3983597.08363864, 3749339.050052  ,
       4274731.30896111, 6757442.13394536, 4037320.6376049 ,
       3769335.19291648,

In [46]:
train_mse = mean_squared_error(y_train, y_pred)
train_mae = mean_absolute_error(y_train, y_pred)
train_r2 = r2_score(y_train, y_pred)
train_rmse = np.sqrt(train_mse)

ValueError: Found input variables with inconsistent numbers of samples: [436, 109]

In [44]:
train_mse
train_mae
train_r2 
train_rmse 

984051.9236507412