In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, ElasticNetCV
from sklearn.metrics import classification_report, f1_score

In [2]:
# Loading and reading the dataset
data = pd.read_csv("Energy_Efficiency_Overfit_Dataset_Updated.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Wall_Area                 200 non-null    float64
 1   Roof_Area                 200 non-null    float64
 2   Window_Area               200 non-null    float64
 3   Overall_Height            200 non-null    float64
 4   Outdoor_Temperature       200 non-null    float64
 5   Humidity                  200 non-null    float64
 6   Energy_Efficiency_Rating  200 non-null    float64
 7   Noise_Feature_1           200 non-null    float64
 8   Noise_Feature_2           200 non-null    float64
 9   Noise_Feature_3           200 non-null    float64
 10  Noise_Feature_4           200 non-null    float64
 11  Noise_Feature_5           200 non-null    float64
 12  Noise_Feature_6           200 non-null    float64
 13  Noise_Feature_7           200 non-null    float64
 14  Noise_Feat

### Prepare the Target Variable



Our target variable is 'Energy Efficiency Rating', which is a floating point. We'll convert this into a binary target variable using the median of the 'Energy Efficiency Rating' as a splitting criteria. The splitting criteria sets the boundary between the classes. The median of the 'Energy efficiency rating' will categorize buildings as energy-efficient or not. 


In [4]:
# Calculate and print the median
rating_split = data['Energy_Efficiency_Rating'].median()

In [5]:
print("Threshold (Median):", rating_split )

Threshold (Median): 146.310167005691


In [6]:
data["Target"] = data['Energy_Efficiency_Rating'].apply(lambda x: 1 if x >= rating_split else 0)
data.drop(columns = ["Energy_Efficiency_Rating"], inplace = True) # Dropping the reduntant column which has information on target
data.head()

Unnamed: 0,Wall_Area,Roof_Area,Window_Area,Overall_Height,Outdoor_Temperature,Humidity,Noise_Feature_1,Noise_Feature_2,Noise_Feature_3,Noise_Feature_4,...,Noise_Feature_9,Noise_Feature_10,Orientation_East,Orientation_North,Orientation_South,Orientation_West,Glazing_Type_Type_A,Glazing_Type_Type_B,Glazing_Type_Type_C,Target
0,388.202617,188.924545,44.013461,3.449571,13.102177,51.125928,0.989088,0.904051,0.791454,0.339118,...,0.288525,0.518964,0,1,0,0,0,1,0,1
1,320.00786,192.818625,38.84103,5.417319,14.283884,53.690759,0.818101,0.03522,0.623867,0.277418,...,0.571153,0.052572,1,0,0,0,0,0,1,1
2,348.936899,232.989788,57.666632,4.055632,8.869296,48.129687,0.340605,0.180661,0.439745,0.96228,...,0.914694,0.682054,0,0,1,0,0,0,1,1
3,412.04466,219.657912,53.562928,5.238103,9.242672,50.771139,0.152047,0.338514,0.010586,0.352407,...,0.738639,0.268888,0,0,1,0,0,1,0,1
4,393.3779,219.203946,32.314615,3.594037,13.584071,79.116164,0.784059,0.577496,0.964928,0.894173,...,0.856666,0.106768,0,1,0,0,0,1,0,1


### Splitting the Dataset into Train and Test Sets


In [7]:
X = data.drop('Target', axis = 1)
y = data['Target']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25)

In [9]:
# We would like to select the random_state in which the difference between the mean of y_train and y_test is minimal.
# Experiment with the values of random_state.
y_train.mean(), y_test.mean()

(0.4928571428571429, 0.5166666666666667)

### Training and Evaluating the Base Model (No Regularization)

In [10]:
# Logistic regression without regularization
logistic_no_reg = LogisticRegression(penalty=None,
                                     max_iter=10000,
                                     n_jobs=-1)
logistic_no_reg.fit(X_train, y_train)

# keeping the value of max_iter as 10000, to avoid the Convergence Warning

### Inspecting model coefficients 

In [11]:
logistic_no_reg.coef_

array([[   604.79436967,    178.75221199,    516.37813663,
         -4348.76419445,   -496.53822731,    246.93292561,
         -7708.57922324,  -5772.27753482,   8021.51709964,
         -9900.44587459,   -307.75237776,   5662.30275687,
        -17232.74546316,  21004.62160308, -14978.49756665,
        -12035.9669258 , -26451.94190708, -26389.26360348,
        -29442.00136835, -52742.4524461 , -44693.88245923,
        -45695.53492406, -44636.24194224]])

### Model Performance

In [12]:
# Evaluate the model without regularization for train set
y_train_pred_no_reg = logistic_no_reg.predict(X_train)
f1_no_reg = f1_score(y_train, y_train_pred_no_reg)
print("F1 Score without Regularization on train set:", f1_no_reg)

F1 Score without Regularization on train set: 1.0


In [13]:
# Evaluate the model without regularization for test data
y_test_pred_no_reg = logistic_no_reg.predict(X_test)
f1_no_reg = f1_score(y_test, y_test_pred_no_reg)
print("F1 Score without Regularization on test set:", f1_no_reg)

F1 Score without Regularization on test set: 0.8421052631578947


# Apply Logistic Regression with L1 Regularization 

In [14]:
# Defining a range for Cs
Cs = np.linspace(0.001,10,20)

In [15]:
# Logistic regression with L1 regularization using cross-validation to find the best C
logistic_l1_cv = LogisticRegressionCV(Cs=Cs, 
                                      penalty='l1', 
                                      solver='liblinear', 
                                      cv=5,
                                      max_iter=10000,
                                      scoring = 'f1',
                                      n_jobs=-1)
logistic_l1_cv.fit(X_train, y_train)

#### Let's understand this function: 
1. The _Cs_ parameter specifies the range of values for the parameter C. In this case, we have kept the range from 0.001 to 10 with evenly spaced 20 samples. 

2. __penalty__ : To the parameter ‘penalty’ we specify l1. 

3. __solver:__ The solver parameter specifies the algorithm to use in the optimization problem. It's important to choose the right solver based on the size of your data and the type of regularization you are using. 
For instance, liblinear works for btth L1 and L2 penalty and that is what we will be using when we are applying these panalties. 

4. __cv__ i.e., We are using 5 folds cross validaiton here to select the hyperparameter. 
5. __max_iter=10000__: Sets the maximum number of iterations for the solver.

6. __n_jobs=-1__: The primary purpose of n_jobs is to enable parallel computing. Setting it to -1 tells the function to use all available cores on the machine. This means the computation will utilize as much of the CPU resource as possible to perform tasks in parallel, thereby speeding up the training process.


### Examine different vlaues of C

In [16]:
logistic_l1_cv.Cs

array([1.00000000e-03, 5.27263158e-01, 1.05352632e+00, 1.57978947e+00,
       2.10605263e+00, 2.63231579e+00, 3.15857895e+00, 3.68484211e+00,
       4.21110526e+00, 4.73736842e+00, 5.26363158e+00, 5.78989474e+00,
       6.31615789e+00, 6.84242105e+00, 7.36868421e+00, 7.89494737e+00,
       8.42121053e+00, 8.94747368e+00, 9.47373684e+00, 1.00000000e+01])

### Best value of C

In [17]:
best_C = logistic_l1_cv.C_
print(f"The best Cs value is: {best_C}")

The best Cs value is: [2.63231579]


The best C gets automatically saved in the model instance.

### Model Performance

In [18]:
# Evaluate the model with L1 regularization on train set
y_train_pred_l1 = logistic_l1_cv.predict(X_train)
f1_l1 = f1_score(y_train, y_train_pred_l1)
print("F1 Score with L1 Regularization on train set:", f1_l1)

F1 Score with L1 Regularization on train set: 0.9705882352941176


In [19]:
# Evaluate the model with L1 regularization on test set
y_test_pred_l1 = logistic_l1_cv.predict(X_test)
f1_l1 = f1_score(y_test, y_test_pred_l1)
print("F1 Score with L1 Regularization on test set:", f1_l1)

F1 Score with L1 Regularization on test set: 0.8813559322033899


- Overfitting has reduced with L1 regularization. 
- Significant difference between Train and Test set scores, indicating Overfitting.

### Name of all features with coefficients 

In [20]:
coefficients_after_l1 = pd.DataFrame({
    'Feature': X_train.columns,
    'After L1 Regularization': logistic_l1_cv.coef_.flatten()
})
print(coefficients_after_l1.sort_values(by = 'After L1 Regularization' ))

                Feature  After L1 Regularization
19     Orientation_West                -2.252477
7       Noise_Feature_2                -1.181892
14      Noise_Feature_9                -0.824651
6       Noise_Feature_1                -0.473116
3        Overall_Height                -0.394922
21  Glazing_Type_Type_B                -0.362134
20  Glazing_Type_Type_A                -0.181435
16     Orientation_East                -0.162041
4   Outdoor_Temperature                -0.074879
5              Humidity                -0.036872
11      Noise_Feature_6                 0.000000
18    Orientation_South                 0.000000
17    Orientation_North                 0.000000
15     Noise_Feature_10                 0.000000
22  Glazing_Type_Type_C                 0.000000
12      Noise_Feature_7                 0.000000
9       Noise_Feature_4                 0.000000
1             Roof_Area                 0.033673
2           Window_Area                 0.036372
0             Wall_A

### Features with 0 coefficients 

In [21]:
# Identify zero features from logisitc regression L1 model
zero_mask = (logistic_l1_cv.coef_ == 0).flatten()
zero_features = X_train.columns[zero_mask]
# Print the features with zero coefficients 
zero_features.tolist()

['Noise_Feature_4',
 'Noise_Feature_6',
 'Noise_Feature_7',
 'Noise_Feature_10',
 'Orientation_North',
 'Orientation_South',
 'Glazing_Type_Type_C']

- Coefficients of 7 Features have been reduced to 0

# Apply Logistic Regression with L2 Regularization 



### RUN ALL ABOVE

In [22]:
# Defining a range for Cs 
Cs= np.linspace(0.001,10,20)

In [23]:
# Logistic regression with L2 regularization using cross-validation to find the best C
logistic_l2_cv = LogisticRegressionCV(Cs=Cs, 
                                      penalty='l2', 
                                      solver='liblinear',
                                      cv=5, 
                                      max_iter=10000,
                                      n_jobs=-1)
logistic_l2_cv.fit(X_train, y_train)

### Examine different vlaues of C

In [24]:
logistic_l2_cv.Cs_

array([1.00000000e-03, 5.27263158e-01, 1.05352632e+00, 1.57978947e+00,
       2.10605263e+00, 2.63231579e+00, 3.15857895e+00, 3.68484211e+00,
       4.21110526e+00, 4.73736842e+00, 5.26363158e+00, 5.78989474e+00,
       6.31615789e+00, 6.84242105e+00, 7.36868421e+00, 7.89494737e+00,
       8.42121053e+00, 8.94747368e+00, 9.47373684e+00, 1.00000000e+01])

### Best value of C

In [25]:
best_C = logistic_l2_cv.C_
print(f"The best Cs value is: {best_C}")

The best Cs value is: [5.78989474]


### Model Performance

In [26]:
# Evaluate the model with L2 regularization on train data
y_train_pred_l2 = logistic_l2_cv.predict(X_train)
f1_l2 = f1_score(y_train, y_train_pred_l2)
print("F1 Score with L2 Regularization on train data:", f1_l2)

F1 Score with L2 Regularization on train data: 0.9154929577464788


In [27]:
# Evaluate the model with L2 regularization on test data
y_test_pred_l2 = logistic_l2_cv.predict(X_test)
f1_l2 = f1_score(y_test, y_test_pred_l2)
print("F1 Score with L2 Regularization on test data:", f1_l2)

F1 Score with L2 Regularization on test data: 0.819672131147541


- L2 regularization further reduces the F1 score on train set, indicating a stronger constraint on the model complexity.
- F1 score on the test set is lower with L2 regularization
- In this specific case, L1 regularization outperforms L2.

### Name of all features with coefficients


In [28]:
coefficients_after_l2 = pd.DataFrame({
    'Feature': X_train.columns,
    'After L2 Regularization': logistic_l2_cv.coef_.flatten()
})
print(coefficients_after_l2.sort_values(by = 'After L2 Regularization' ))

                Feature  After L2 Regularization
19     Orientation_West                -2.832268
21  Glazing_Type_Type_B                -2.541632
20  Glazing_Type_Type_A                -2.176545
7       Noise_Feature_2                -2.064258
14      Noise_Feature_9                -2.054356
22  Glazing_Type_Type_C                -1.374682
12      Noise_Feature_7                -1.198039
17    Orientation_North                -1.184977
18    Orientation_South                -1.074118
9       Noise_Feature_4                -1.004238
16     Orientation_East                -1.001495
3        Overall_Height                -0.667936
6       Noise_Feature_1                -0.598426
11      Noise_Feature_6                -0.233957
15     Noise_Feature_10                -0.197125
4   Outdoor_Temperature                -0.092914
5              Humidity                -0.060982
2           Window_Area                 0.010112
1             Roof_Area                 0.016002
0             Wall_A

### Features with 0 coefficients

In [29]:
# Identify zero features from logisitc regression L2 model
zero_mask = (logistic_l2_cv.coef_ == 0).flatten()
zero_features = X_train.columns[zero_mask]
# Print the features with zero coefficients 
zero_features.tolist()

[]


### Since L1 regularization provides the better balance between training and test performance, it might be the preferred model in this scenario.


# Elastic Net 

### Run  All Above

In [30]:
Cs=np.linspace(0.001,10,20)

In [31]:
# Logistic regression with Elastic Net regularization using cross-validation
logistic_en_cv = LogisticRegressionCV(penalty='elasticnet',
                                      Cs = Cs,
                                      l1_ratios= [0.0001, 0.001, 0.01, 0.05, 0.1, 0.4, 0.5, 0.7, 1], 
                                      solver='saga', 
                                      cv=5,
                                      max_iter=1000000,
                                      n_jobs=-1)
logistic_en_cv.fit(X_train, y_train)

## Lets understand this function in detail:
1. penalty='elasticnet' specifies the use of Elastic Net regularization,where both L1 and L2 penalty terms are added.
2. l1_ratios specifies the ratio between L1 and L2 regularization in the Elastic Net penalty
3. solver='saga' is the optimization algorithm used for solving the regression problem. Elastic-Net penalty is only supported by the saga solver.


### We are trying more ratios closer to 0 becuase we know L1 performed better


### All coefficient values

In [32]:
logistic_en_cv.Cs_

array([1.00000000e-03, 5.27263158e-01, 1.05352632e+00, 1.57978947e+00,
       2.10605263e+00, 2.63231579e+00, 3.15857895e+00, 3.68484211e+00,
       4.21110526e+00, 4.73736842e+00, 5.26363158e+00, 5.78989474e+00,
       6.31615789e+00, 6.84242105e+00, 7.36868421e+00, 7.89494737e+00,
       8.42121053e+00, 8.94747368e+00, 9.47373684e+00, 1.00000000e+01])

### Best Value of C

In [33]:
logistic_en_cv.C_

array([0.001])

### Best Value of l1_ratio

Lets find the best l1 ratio. 

In [34]:
logistic_en_cv.l1_ratio_

array([0.0001])

- Ratio suggested is very close to 0 which means that it is drawing us to L2 penalty. 

### Model Performance

In [35]:
# Evaluate the model
y_train_pred_elastic = logistic_en_cv.predict(X_train)
f1_elastic_train = f1_score(y_train, y_train_pred_elastic)
print("F1 Score with Elastic Net Regularization on Train Set:", f1_elastic_train)

F1 Score with Elastic Net Regularization on Train Set: 0.821917808219178


In [36]:
y_test_pred_elastic = logistic_en_cv.predict(X_test)
f1_elastic_test = f1_score(y_test, y_test_pred_elastic)
print("F1 Score with Elastic Net Regularization on Test Set:", f1_elastic_test)

F1 Score with Elastic Net Regularization on Test Set: 0.7540983606557377


- Elastic Net did not perform as good as either L1 or L2 regularization.
- The F1 score on the train set is 0.82 and on test set is 0.75. 

### It indicates that while Elastic Net is versatile, its effectiveness may vary depending on the specific dataset and problem at hand. 