# EXERCISE 1

Use all feature selection methods to find the best features

### DATASET INFORMATION
### FEATURES

Number of Instances: 20640

Number of Attributes: 8 numeric, predictive attributes and the target

Attribute Information:

MedInc - median income in block group

HouseAge - median house age in block group

AveRooms - average number of rooms per household

AveBedrms - average number of bedrooms per household

Population - block group population

AveOccup - average number of household members

Latitude - block group latitude

Longitude - block group longitude

### TARGET

The target variable is the median house value for California districts, expressed in hundreds of thousands of dollars ($100,000).

In [263]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

In [264]:
housing = fetch_california_housing(as_frame=True)
df = pd.concat([housing.data, housing.target], axis=1)

In [265]:
df_housing_features =  pd.DataFrame(housing.data, columns=housing.feature_names)
df_housing_target = pd.DataFrame(housing.target, columns=['MedHouseVal'])

In [266]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_housing_features = pd.DataFrame(scaler.fit_transform(df_housing_features), columns=df_housing_features.columns)

In [267]:
df.corr().head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465


In [268]:
df.corr()['MedHouseVal'].abs().sort_values(ascending=False)

MedHouseVal    1.000000
MedInc         0.688075
AveRooms       0.151948
Latitude       0.144160
HouseAge       0.105623
AveBedrms      0.046701
Longitude      0.045967
Population     0.024650
AveOccup       0.023737
Name: MedHouseVal, dtype: float64

### 1. Use any filter method to select the best features


In [269]:
from sklearn.feature_selection import f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

threshold = 4
high_score_features = []
feature_scores = f_regression(df_housing_features, df_housing_target.values.ravel())[0]

In [270]:
for score, f_name in sorted(zip(feature_scores, df_housing_features.columns), reverse=True)[:threshold]:
      high_score_features.append(f_name)

df_housing_filter = df_housing_features[high_score_features]
df_housing_filter.columns

Index(['MedInc', 'AveRooms', 'Latitude', 'HouseAge'], dtype='object')

In [271]:
X_train_filter, X_test_filter, y_train_filter, y_test_filter = train_test_split(df_housing_filter, df_housing_target, test_size=0.2, random_state=42)

model_filter = RandomForestRegressor(n_estimators=500, random_state=0, max_depth = 3)
model_filter.fit(X_train_filter, y_train_filter)

  return fit_method(estimator, *args, **kwargs)


### 2. Use any wrapper method to select the best features

In [272]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

model_wrapper = RandomForestRegressor(n_estimators=500, random_state=0, max_depth = 3)
selector = RFE(model_wrapper, n_features_to_select=4, step=1)

selector = selector.fit(df_housing_features, df_housing_target.values.ravel())
selector_ind = selector.get_support()
df_housing_wrapper = df_housing_features.iloc[:, selector_ind]
df_housing_wrapper.columns
     

Index(['MedInc', 'AveRooms', 'AveOccup', 'Latitude'], dtype='object')

In [273]:
X_train_wrapper, X_test_wrapper, y_train_wrapper, y_test_wrapper = train_test_split(df_housing_wrapper, df_housing_target, test_size=0.2, random_state=42)

model_wrapper = RandomForestRegressor(n_estimators=500, random_state=0, max_depth = 3)
model_wrapper.fit(X_train_wrapper, y_train_wrapper)

  return fit_method(estimator, *args, **kwargs)


### 3. Use any embedded methood to select the best features

In [274]:
from sklearn.model_selection import train_test_split

X_train_embedded, X_test_embedded, y_train_embedded, y_test_embedded = train_test_split(df_housing_features, df_housing_target, test_size=0.2, random_state=42)

In [281]:
from sklearn.feature_selection import SelectFromModel

model_embedded = RandomForestRegressor(n_estimators=500, random_state=0, max_depth = 3)
model_embedded.fit(X_train_embedded, y_train_embedded.values.ravel())

model_embedded.predict

sel_sfm = SelectFromModel(model_embedded, prefit=True)
sel_sfm_index = sel_sfm.get_support()
df_housing_embedded = df_housing_features.iloc[:, sel_sfm_index]
df_housing_embedded.columns

     

Index(['MedInc', 'AveOccup'], dtype='object')

### MODEL COMPARISON

In [282]:
from sklearn.metrics import mean_squared_error, r2_score


In [283]:
fr_preds = model_filter.predict(X_test_filter)
rfe_preds = model_wrapper.predict(X_test_wrapper)
sfm_preds = model_embedded.predict(X_test_embedded)

In [284]:
fr_rmse = mean_squared_error(y_test, fr_preds, squared=False)
rfe_rmse = mean_squared_error(y_test, rfe_preds, squared=False)
sfm_rmse = mean_squared_error(y_test, sfm_preds, squared=False)



In [285]:
print(f'Filter: {fr_rmse * 100} %')
print(f'Wrapper: {rfe_rmse * 100} %')
print(f'Embedded: {sfm_rmse* 100} %')

Filter: 80.97992684930642 %
Wrapper: 77.53133701765597 %
Embedded: 77.50446080121174 %
