In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from mlxtend.feature_selection import SequentialFeatureSelector
import matplotlib.pyplot as plt
import category_encoders as ce

df_dataset = pd.read_csv('Family Income and Expenditure.csv')

# Initial features to use
features = [
    "Housing and water Expenditure",
    "Total Household Income",
    "Region",
    "Agricultural Household indicator",
    "Imputed House Rental Value",
    "Total Income from Entrepreneurial Acitivites",
    "Total Number of Family members",
    "Members with age less than 5 year old",
    "Members with age 5 - 17 years old",
    "Total number of family members employed",
    "Type of Building/House",
    "Type of Roof",
    "Type of Walls",
    "House Floor Area",
    "House Age",
    "Number of bedrooms",
    "Electricity",
    "Main Source of Water Supply",
    "Number of Television",
    "Number of CD/VCD/DVD",
    "Number of Component/Stereo set",
    "Number of Refrigerator/Freezer",
    "Number of Washing Machine",
    "Number of Airconditioner",
    "Number of Personal Computer"
]

df_temp = df_dataset.filter(features, axis=1)

df_temp['Number of Children'] = df_temp['Members with age less than 5 year old'] + df_temp['Members with age 5 - 17 years old']

df_temp = df_temp.drop(['Members with age less than 5 year old', 'Members with age 5 - 17 years old'], axis=1)

# Define the categorical columns
cat_cols = ['Region', 'Type of Building/House', 'Type of Roof', 'Type of Walls', 'Main Source of Water Supply']

# Create an instance of the TargetEncoder
encoder = ce.TargetEncoder(cols=cat_cols)

# Fit and transform the target encoder on the dataframe
df_encoded = encoder.fit_transform(df_temp, df_temp['Housing and water Expenditure'])

# Replace the original categorical columns with the target-encoded values
df_temp[cat_cols] = df_encoded[cat_cols]

X = df_temp.drop(['Housing and water Expenditure'], axis=1)
y = df_temp['Housing and water Expenditure']

# Correlation Matrix
threshold = 0.9

corr_matrix = X.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
upper.head()

# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

if len(to_drop) > 0:
    print(f'Columns to drop: {to_drop}')
    X = X.drop(to_drop, axis=1)
else:
    print('No columns to drop. The features are not highly correlated.')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# Forward feature selection
regressor = LinearRegression()
sfs = SequentialFeatureSelector(regressor, k_features='best', forward=True, scoring='neg_mean_squared_error', cv=5)
sfs.fit(X_train, y_train)

selected_features = list(X_train.columns[list(sfs.k_feature_idx_)])
print(f'Selected Features: {selected_features}')

# Drop features that are not selected
to_drop = [feature for feature in X.columns if feature not in selected_features]
X = X.drop(to_drop, axis=1)
print(f'Dropped features: {to_drop}')

# POLYNOMIAL
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interactions = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_interactions, y, test_size=0.30, random_state=1)

regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
print(f'R2: {r2_score(y_test, y_pred)}')

No columns to drop. The features are not highly correlated.
Selected Features: ['Total Household Income', 'Region', 'Imputed House Rental Value', 'Total Number of Family members', 'Type of Building/House', 'House Floor Area', 'House Age', 'Number of bedrooms', 'Main Source of Water Supply', 'Number of CD/VCD/DVD', 'Number of Component/Stereo set', 'Number of Refrigerator/Freezer', 'Number of Washing Machine', 'Number of Airconditioner', 'Number of Personal Computer']
Dropped features: ['Agricultural Household indicator', 'Total Income from Entrepreneurial Acitivites', 'Total number of family members employed', 'Type of Roof', 'Type of Walls', 'Electricity', 'Number of Television', 'Number of Children']
RMSE: 16508.000514084146
R2: 0.9035775048495079
