In [82]:
import pandas as pd
from copy import deepcopy
from scipy import stats
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer

In [35]:
class LinearRegressionWrapper():
    def __init__(self, model_params={'fit_intercept': False}):

        self.model_name = "linear_regression"
        self.search_type = 'grid'
        self.param_grid = {'fit_intercept': [True, False]}
        if model_params is None:
            self.ModelClass = LinearRegression()
        else:
            self.ModelClass = LinearRegression(**model_params)

In [36]:
train_df = pd.read_csv('train_data.csv')
train_df.set_index("date", inplace=True)

train_df.tail()

Unnamed: 0_level_0,SPY,RPI,S&P: indust,HOUSTW,EXJPUSx,WPSID62
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2003-08-31,0.020416,0.001636,0.001452,6.063785,-0.000282,-0.008375
2003-09-30,-0.010803,0.001476,0.033748,6.113682,-0.033091,0.027151
2003-10-31,0.052144,0.005737,0.011946,6.317165,-0.002906,-0.002173
2003-11-30,0.010862,0.005737,0.011946,6.317165,-0.002906,-0.002173
2003-12-31,0.048997,0.00035,0.031326,6.253829,-0.013278,0.032111


In [44]:
target = "SPY"
k = 3
model_wrapper = LinearRegressionWrapper(model_params={'fit_intercept': False})

model = model_wrapper.ModelClass
size_of_fold = len(train_df) // k
features = train_df.columns.difference([target])
score = make_scorer(mean_squared_error)

# split the data into k folds
folds = []
for i in range(k):
    start = i * size_of_fold
    end = (i + 1) * size_of_fold
    if i == k - 1:
        end = len(train_df)
    fold = train_df.iloc[start:end]
    folds.append(fold)

# make k combinations 2 by 2 of the folds where order does not matter
combinations = []
for i in range(k):
    for j in range(i + 1, k):
        combinations.append((folds[i], folds[j]))

# # for each combination, train the model on the first fold and test on the second fold
# pbar = tqdm(total=len(combinations) * len(features))
# for feature in features:
#     print(f"Feature: {feature}")
#     for i, (train_fold, test_fold) in enumerate(combinations):
#         pbar.set_description(f"Feature: {feature} - Fold {i + 1}")
        
#         train_model_fit = model.fit(train_fold.drop(columns="target"), train_fold["target"])


#         # compute the error
#         # print the error

In [60]:
feature = next(iter(features))
i, (train_fold, test_fold) = next(iter(enumerate(combinations)))

# Prepare the features and target for training
X_train = train_fold.drop(columns=target)
X_train_without_feature = train_fold.drop(columns=[target, feature])
y_train = train_fold[target]

# Prepare the features for testing
X_test = test_fold.drop(columns=target)
# X_test_without_feature = test_fold.drop(columns=[target, feature])
y_test = test_fold[target]

model_with_feature = model.fit(X_train, y_train)

In [72]:
feature = next(iter(features))
all_generalization_errors_with_feature = []
all_generalization_errors_without_feature = []
for i, (train_fold, test_fold) in enumerate(combinations):

    model1 = deepcopy(model)
    model2 = deepcopy(model)

    # Prepare the features and target for training
    X_train = train_fold.drop(columns=target)
    X_train_without_feature = train_fold.drop(columns=[target, feature])
    y_train = train_fold[target]

    # Prepare the features for testing
    X_test = test_fold.drop(columns=target)
    X_test_without_feature = test_fold.drop(columns=[target, feature])
    y_test = test_fold[target]

    # Fit the model
    model_with_feature = model1.fit(X_train, y_train)
    model_without_feature = model2.fit(X_train_without_feature, y_train)

    # Predictions for the model
    train_predictions_with_feature = model_with_feature.predict(X_train)
    test_predictions_with_feature = model_with_feature.predict(X_test)
    train_predictions_without_feature = model_without_feature.predict(X_train_without_feature)
    test_predictions_without_feature = model_without_feature.predict(X_test_without_feature)

    # Compute the error
    train_error_with_feature = mean_squared_error(y_train, train_predictions_with_feature)
    test_error_with_feature = mean_squared_error(y_test, test_predictions_with_feature)
    train_error_without_feature = mean_squared_error(y_train, train_predictions_without_feature)
    test_error_without_feature = mean_squared_error(y_test, test_predictions_without_feature)

    # Compute generalization error
    generalization_error_with_feature = test_error_with_feature - train_error_with_feature
    generalization_error_without_feature = test_error_without_feature - train_error_without_feature

    all_generalization_errors_with_feature.append(generalization_error_with_feature)
    all_generalization_errors_without_feature.append(generalization_error_without_feature)

In [83]:
x1 = np.array(all_generalization_errors_with_feature)

x1

array([0.00192232, 0.00365294, 0.00117994])

In [84]:
x2 = np.array(all_generalization_errors_without_feature)

x2

array([-0.00093966,  0.00041431,  0.00107503])

In [91]:
# Compute the difference
diff = x2 - x1

# Perform the one-sided t-test on the difference
t_statistic, p_value = stats.ttest_1samp(diff, 0)

# Since it's one-sided, we halve the p-value and change the sign of the t-statistic
p_value /= 2
t_statistic = -t_statistic if t_statistic < 0 else t_statistic

# If your p_value is less than your alpha (commonly 0.05), 
# and your t_statistic is positive, you can reject the null hypothesis

In [92]:
p_value

0.08564267523749114

In [93]:
t_statistic

2.0940494428365786