In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("https://raw.githubusercontent.com/d2cml-ai/CausalAI-Course/main/data/wage2015_subsample_inference.csv")

7.

In [3]:
y = data["lwage"].to_numpy().reshape(-1, 1)

8

8.1

In [4]:
basic_predictors = ["sex", "hsg", "scl", "clg", "ad", "so", "we", "ne", "exp1", "occ2", "ind2"]
X_basic = pd.get_dummies(data[basic_predictors], columns = ["occ2", "ind2"], drop_first = True, dtype = int)

8.2

In [5]:
flexible_predictors = basic_predictors + ["exp2", "exp3", "exp4"]
X_flexible = pd.get_dummies(data[flexible_predictors], columns = ["occ2", "ind2"], drop_first = True)
indicator_variables = [variable for variable in X_flexible.columns if any(base_name in variable for base_name in ["hsg", "scl", "clg", "ad", "so", "we", "ne", "occ2", "ind2"])]

for experience_variable in ["exp1", "exp2", "exp3", "exp4"]:
        for indicator_variable in indicator_variables:
                temp_frame = pd.DataFrame({
                        f"{experience_variable}_{indicator_variable}": X_flexible[experience_variable] * X_flexible[indicator_variable]
                })
                X_flexible = pd.concat((X_flexible, temp_frame), axis = 1)

8.3

In [6]:
X_extra_flexible = X_basic.copy()

In [7]:
X_extra_flexible = pd.concat((X_extra_flexible, data[["exp2", "exp3", "exp4"]]), axis = 1)

In [8]:
extra_flexible_base_features = X_extra_flexible.columns[1:]

In [9]:
base_experience_variables = ["exp1", "exp2", "exp3"]
exp_4 = ["exp4"]
education_variables = ["hsg", "scl", "clg", "ad"]
location_variables = ["so", "we", "ne"]
occupation_variables = [variable for variable in X_extra_flexible.columns if "occ2" in variable]
industry_variables = [variable for variable in X_extra_flexible.columns if "ind2" in variable]
variable_groups = [base_experience_variables, exp_4, education_variables, location_variables, occupation_variables, industry_variables]

In [10]:
for variable_group_index, variable_group in enumerate(variable_groups, 1):
        if variable_group == len(variable_groups):
                break

        other_variable_groups = variable_groups[variable_group_index:]

        for variable in variable_group:
                for other_variables in other_variable_groups:
                        temp = X_extra_flexible[other_variables].multiply(X_extra_flexible[variable], axis = "index")
                        temp = temp.add_prefix(f"{variable}")
                        X_extra_flexible = pd.concat(
                                (X_extra_flexible, temp),
                                axis = 1
                        )

In [11]:
temp = pd.DataFrame({"exp8": X_extra_flexible["exp4"] ** 2})
X_extra_flexible = pd.concat((X_extra_flexible, temp), axis = 1)

9.

In [12]:
from sklearn.model_selection import train_test_split

X_basic_train, X_basic_test, y_basic_train, y_basic_test = train_test_split(X_basic, y, train_size = .8, shuffle = True)

In [13]:

X_flexible_train, X_flexible_test, y_flexible_train, y_flexible_test = train_test_split(X_flexible, y, train_size = .8, shuffle = True)

In [14]:

X_extra_flexible_train, X_extra_flexible_test, y_extra_flexible_train, y_extra_flexible_test = train_test_split(X_extra_flexible, y, train_size = .8, shuffle = True)

10.

In [15]:
from sklearn.linear_model import LinearRegression
basic_model = LinearRegression()
basic_model.fit(X_basic_train, y_basic_train)

In [16]:
flexible_model = LinearRegression()
flexible_model.fit(X_flexible_train, y_flexible_train)

In [17]:
extra_flexible_model = LinearRegression()
extra_flexible_model.fit(X_extra_flexible_train, y_extra_flexible_train)

11.

In [18]:
basic_training_mse = ((y_basic_train - basic_model.predict(X_basic_train)) ** 2).mean()
basic_training_R2 = 1 - basic_training_mse / ((y_basic_train - y_basic_train.mean()) ** 2).mean()
basic_training_adjR2 = 1 - basic_training_mse * y_basic_train.shape[0] / (y_basic_train.shape[0] - basic_model.n_features_in_) / ((y_basic_train - y_basic_train.mean()) ** 2).mean()

print(f"Mean square error of basic model: {basic_training_mse: .4f}")
print(f"R2 of basic model: {basic_training_R2: .4f}")
print(f"Adjusted R2 of basic model: {basic_training_adjR2: .4f}")

Mean square error of basic model:  0.2297
R2 of basic model:  0.2942
Adjusted R2 of basic model:  0.2873


In [19]:
basic_testing_mse = ((y_basic_test - basic_model.predict(X_basic_test)) ** 2).mean()
basic_testing_R2 = 1 - basic_testing_mse / ((y_basic_test - y_basic_test.mean()) ** 2).mean()

print(f"Mean square error of basic model: {basic_testing_mse: .4f}")
print(f"R2 of basic model: {basic_testing_R2: .4f}")

Mean square error of basic model:  0.2067
R2 of basic model:  0.3622


In [20]:
flexible_training_mse = ((y_flexible_train - flexible_model.predict(X_flexible_train)) ** 2).mean()
flexible_training_R2 = 1 - flexible_training_mse / ((y_flexible_train - y_flexible_train.mean()) ** 2).mean()
flexible_training_adjR2 = 1 - flexible_training_mse * y_flexible_train.shape[0] / (y_flexible_train.shape[0] - flexible_model.n_features_in_) / ((y_flexible_train - y_flexible_train.mean()) ** 2).mean()

print(f"Mean square error of flexible model: {flexible_training_mse: .4f}")
print(f"R2 of flexible model: {flexible_training_R2: .4f}")
print(f"Adjusted R2 of flexible model: {flexible_training_adjR2: .4f}")

Mean square error of flexible model:  0.2046
R2 of flexible model:  0.3635
Adjusted R2 of flexible model:  0.3317


In [21]:
flexible_testing_mse = ((y_flexible_test - flexible_model.predict(X_flexible_test)) ** 2).mean()
flexible_testing_R2 = 1 - flexible_testing_mse / ((y_flexible_test - y_flexible_test.mean()) ** 2).mean()
flexible_testing_adjR2 = 1 - flexible_testing_mse * y.shape[0] / (y.shape[0] - flexible_model.n_features_in_) / ((y_flexible_test - y_flexible_test.mean()) ** 2).mean()

print(f"Mean square error of flexible model: {flexible_testing_mse: .4f}")
print(f"R2 of flexible model: {flexible_testing_R2: .4f}")
print(f"Adjusted R2 of flexible model: {flexible_testing_adjR2: .4f}")

Mean square error of flexible model:  0.2851
R2 of flexible model:  0.1625
Adjusted R2 of flexible model:  0.1207


In [22]:
extra_flexible_training_mse = ((y_extra_flexible_train - extra_flexible_model.predict(X_extra_flexible_train)) ** 2).mean()
extra_flexible_training_R2 = 1 - extra_flexible_training_mse / ((y_extra_flexible_train - y_extra_flexible_train.mean()) ** 2).mean()
extra_flexible_training_adjR2 = 1 - extra_flexible_training_mse * y_extra_flexible_train.shape[0] / (y_extra_flexible_train.shape[0] - extra_flexible_model.n_features_in_) / ((y_extra_flexible_train - y_extra_flexible_train.mean()) ** 2).mean()

print(f"Mean square error of extra flexible model: {extra_flexible_training_mse: .4f}")
print(f"R2 of extra flexible model: {extra_flexible_training_R2: .4f}")
print(f"Adjusted R2 of extra flexible model: {extra_flexible_training_adjR2: .4f}")

Mean square error of extra flexible model:  0.1745
R2 of extra flexible model:  0.4687
Adjusted R2 of extra flexible model:  0.3458


In [23]:
extra_flexible_testing_mse = ((y_extra_flexible_test - extra_flexible_model.predict(X_extra_flexible_test)) ** 2).mean()
extra_flexible_testing_R2 = 1 - extra_flexible_testing_mse / ((y_extra_flexible_test - y_extra_flexible_test.mean()) ** 2).mean()

print(f"Mean square error of extra flexible model: {extra_flexible_testing_mse: .4f}")
print(f"R2 of extra flexible model: {extra_flexible_testing_R2: .4f}")

Mean square error of extra flexible model:  0.3153
R2 of extra flexible model: -0.0106
