# Week 3 – Forward and backward selection, PCR, and PLSR.

This notebook applies forward and backward feature selection, principal component regression (PCR), and partial least squares regression (PLSR) to predict `time_in_hospital`.

In [1]:
pip install mlxtend


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression


In [3]:
# Read the encounter table and the ID lookup table from the working directory,
# then preview the first rows of the encounter table.
df, df_map = (
    pd.read_csv(csv_name)
    for csv_name in ('diabetic_data.csv', 'IDS_mapping.csv')
)
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
# First nine rows of the lookup table (the admission_type_id -> description part).
df_map.iloc[:9]

Unnamed: 0,admission_type_id,description
0,1.0,Emergency
1,2.0,Urgent
2,3.0,Elective
3,4.0,Newborn
4,5.0,Not Available
5,6.0,
6,7.0,Trauma Center
7,8.0,Not Mapped
8,,


In [5]:
# Column names, non-null counts and dtypes of the raw encounter table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [6]:
# Summary statistics for the integer-typed columns. Identifier/code columns
# (encounter_id, patient_nbr, the *_id fields) are summarised as well, but
# their mean/std are presumably not meaningful quantities.
df.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


Check for the unique values of the categorical columns

In [7]:
# Show the distinct levels of each column that will be filtered or encoded.
columns_to_inspect = {
    "Gender values:": "gender",
    "Age values:": "age",
    "Admission Type ID values:": "admission_type_id",
}
for heading, column in columns_to_inspect.items():
    print(heading, df[column].unique())

Gender values: ['Female' 'Male' 'Unknown/Invalid']
Age values: ['[0-10)' '[10-20)' '[20-30)' '[30-40)' '[40-50)' '[50-60)' '[60-70)'
 '[70-80)' '[80-90)' '[90-100)']
Admission Type ID values: [6 1 2 3 4 5 8 7]


Drop unknown gender and only include 1-5 admission types.

In [8]:
# Keep only rows with a known gender AND an admission type in 1-5,
# applying both conditions as a single boolean mask.
valid_ids = [1, 2, 3, 4, 5]

known_gender = df["gender"] != "Unknown/Invalid"
allowed_admission = df["admission_type_id"].isin(valid_ids)
df = df[known_gender & allowed_admission]

Select columns to work with and check for nulls.

In [9]:
# Work on an independent copy holding only the target (time_in_hospital)
# and the candidate predictors, then count missing values per column.
df_model = df.loc[
    :,
    [
        'time_in_hospital',
        'num_lab_procedures',
        'num_procedures',
        'num_medications',
        'number_diagnoses',
        'age',
        'gender',
        'admission_type_id',
    ],
].copy()

df_model.isna().sum()

time_in_hospital      0
num_lab_procedures    0
num_procedures        0
num_medications       0
number_diagnoses      0
age                   0
gender                0
admission_type_id     0
dtype: int64

Encode gender and age.

In [10]:
# One-hot encode gender and age; drop_first=True omits one level per column so
# the dummy columns are not perfectly collinear.
# NOTE(review): admission_type_id is left as a raw integer code, so the models
# below treat it as a numeric quantity. It looks like a category label (see
# IDS_mapping.csv) - confirm whether it should be one-hot encoded too.
df_model = pd.get_dummies(df_model, columns=["gender", "age"], drop_first=True)

Define the feature matrix X and the target y (`time_in_hospital`), then split them into training and test sets.

In [11]:
# Target: length of stay. Predictors: every remaining column.
y = df_model['time_in_hospital']
X = df_model.drop(columns=['time_in_hospital'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Number of columns after encoding (target included).
print(df_model.shape[1])

16


Linear regression with forward selection

In [13]:
# Forward sequential selection (forward=True, no floating step) around an OLS
# estimator, scored by 5-fold cross-validated R² on the training split.
# k_features='best' keeps the subset size with the best CV score.
model = LinearRegression()
sfs = SFS(
    model,
    k_features='best',
    forward=True,
    floating=False,
    scoring='r2',
    cv=5,
)
sfs.fit(X_train, y_train)

# k_feature_names_ is a tuple of column names; store it as a list for indexing.
selected_features = [*sfs.k_feature_names_]
print("Selected Features:", selected_features)

Selected Features: ['num_lab_procedures', 'num_procedures', 'num_medications', 'number_diagnoses', 'admission_type_id', 'gender_Male', 'age_[10-20)', 'age_[20-30)', 'age_[30-40)', 'age_[40-50)', 'age_[50-60)', 'age_[60-70)', 'age_[70-80)', 'age_[80-90)', 'age_[90-100)']


In [14]:
# Restrict both splits to the columns chosen by forward selection.
X_train_for, X_test_for = (
    split[selected_features] for split in (X_train, X_test)
)

In [15]:
# Ordinary least squares refit on the forward-selected columns of the training split.
model_for = LinearRegression()
model_for.fit(X_train_for, y_train)

In [16]:
# Score the forward-selection model on the held-out test split.
y_pred_for = model_for.predict(X_test_for)

rmse_for = np.sqrt(mean_squared_error(y_test, y_pred_for))
r2_for = r2_score(y_test, y_pred_for)

# These metrics belong to the forward-selection model, so label them "Forward"
# (they were previously printed under a copy-pasted "Backward" label).
print("Forward RMSE:", rmse_for)
print("Forward R²:", r2_for)

Backward RMSE: 2.526463107969636
Backward R²: 0.27989781564681515


Linear regression with backward selection

In [17]:
# Backward sequential selection (forward=False, no floating step) around an OLS
# estimator, scored by 5-fold cross-validated R² on the training split.
# k_features='best' keeps the subset size with the best CV score.
model = LinearRegression()
sbs = SFS(
    model,
    k_features='best',
    forward=False,
    floating=False,
    scoring='r2',
    cv=5,
)
sbs.fit(X_train, y_train)

# k_feature_names_ is a tuple of column names; store it as a list for indexing.
selected_features_backward = [*sbs.k_feature_names_]
print("Selected Features Backward:", selected_features_backward)

Selected Features Backward: ['num_lab_procedures', 'num_procedures', 'num_medications', 'number_diagnoses', 'gender_Male', 'age_[20-30)', 'age_[30-40)', 'age_[40-50)', 'age_[50-60)', 'age_[60-70)', 'age_[70-80)', 'age_[90-100)']


In [18]:
# Restrict both splits to the backward-selected columns.
X_train_back, X_test_back = (
    split[selected_features_backward] for split in (X_train, X_test)
)

# Refit OLS on that subset and predict the held-out rows.
model_back = LinearRegression().fit(X_train_back, y_train)
y_pred_back = model_back.predict(X_test_back)

# Test-set error (RMSE) and explained variance (R²).
r2_back = r2_score(y_test, y_pred_back)
rmse_back = np.sqrt(mean_squared_error(y_test, y_pred_back))

print("Backward RMSE:", rmse_back)
print("Backward R²:", r2_back)

Backward RMSE: 2.5264518124339554
Backward R²: 0.2799042546260413


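After running linear regressions using only the forward-selected features and only the backward-selected features, the test results were nearly identical (RMSE ≈ 2.526 and R² ≈ 0.280 in both cases), even though forward selection kept 15 features and backward selection kept 12.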

# PCR

Scale features

In [19]:
# Learn the per-column mean and standard deviation on the training split only,
# then apply the same standardisation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [23]:
# A float n_components asks PCA to keep just enough components for the
# cumulative explained variance to reach 95%.
# The projection is fitted on the scaled training split and then reused,
# unchanged, to transform the scaled test split.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [24]:
# Principal component regression: OLS on the PCA scores of the training split,
# evaluated on the PCA scores of the test split.
model_pcr = LinearRegression().fit(X_train_pca, y_train)
y_pred_pcr = model_pcr.predict(X_test_pca)

# Test-set error (RMSE) and explained variance (R²).
r2_pcr = r2_score(y_test, y_pred_pcr)
rmse_pcr = np.sqrt(mean_squared_error(y_test, y_pred_pcr))

print("PCR RMSE:", rmse_pcr)
print("PCR R²:", r2_pcr)

PCR RMSE: 2.5429748870361584
PCR R²: 0.27045455687159325


The PCR results were only slightly worse than those of the feature-selected models (RMSE 2.543 vs. 2.526, R² 0.270 vs. 0.280).

# PLSR

In [26]:
# Partial least squares regression with 5 latent components, fitted on the
# standardised training features and evaluated on the standardised test features.
pls = PLSRegression(n_components=5).fit(X_train_scaled, y_train)
y_pred_pls = pls.predict(X_test_scaled)

# Test-set error (RMSE) and explained variance (R²).
r2_pls = r2_score(y_test, y_pred_pls)
rmse_pls = np.sqrt(mean_squared_error(y_test, y_pred_pls))

print("PLSR RMSE:", rmse_pls)
print("PLSR R²:", r2_pls)

PLSR RMSE: 2.526527276145267
PLSR R²: 0.27986123626428416


In [30]:
# Build the comparison table from the metrics computed in the model cells
# rather than retyping them, so it cannot drift from the real results when the
# notebook is re-run. Values are rounded to three decimals for display.
results = {
    "Model": ["Forward Selection", "Backward Selection", "PCR", "PLSR"],
    "RMSE": [
        round(float(value), 3)
        for value in (rmse_for, rmse_back, rmse_pcr, rmse_pls)
    ],
    "R²": [
        round(float(value), 3)
        for value in (r2_for, r2_back, r2_pcr, r2_pls)
    ],
}

In [31]:
# Tabulate the per-model metrics (one column per key of `results`) and print the table.
results_df = pd.DataFrame.from_dict(results)
print(results_df)

                Model   RMSE    R²
0   Forward Selection  2.526  0.28
1  Backward Selection  2.526  0.28
2                 PCR  2.543  0.27
3                PLSR  2.527  0.28


The results were very similar in all cases. Some feature engineering might help improve the R² score and give better results.