In [1]:
import pool_alice_code
import copy
import pandas as pd
import numpy as np
from scipy import stats

# Load Data

In [2]:
# Column labels for the feature matrix, in the order they are stored in X.npy.
feature_names = 'Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight'.split(',')

# Feature matrix as a labelled DataFrame.
X = pd.DataFrame(np.load('../data/abalone_age/X.npy'), columns=feature_names)

# Target: wrap the array in a DataFrame and keep column 0 as a Series.
y = pd.DataFrame(np.load('../data/abalone_age/y.npy'))[0]

X

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
0,0.0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500
1,0.0,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700
2,1.0,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100
3,0.0,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550
4,2.0,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550
...,...,...,...,...,...,...,...,...
4172,1.0,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490
4173,0.0,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605
4174,0.0,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080
4175,1.0,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960


# exploring higher order features and nonlinear transformations for nonlinear regression

Check whether the following derived features are significantly correlated with the target, using the Spearman rank correlation:
- squared features: $(X_0)^2$
- cubic features: $(X_0)^3$
- first-order interactions. for example: $X_0 X_1, X_1 X_2$

In [3]:
# Number of raw input columns. Read it from the frame's shape rather than from
# the length of the first row, which would fail on a frame with no rows.
num_original_features = X.shape[1]
print(num_original_features)

# Work on an independent copy so the engineered columns never leak into X.
X_extra = copy.deepcopy(X)

8


In [4]:
# Sanity check: before any engineered columns are added, X_extra shows the same original columns as X.
X_extra.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
0,0.0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,0.0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,1.0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,0.0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,2.0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [5]:
# Names of the raw features; engineered columns are appended after them, so
# the first `num_original_features` labels never move while we add columns.
original_names = [X_extra.columns[k] for k in range(num_original_features)]

for i, name_i in enumerate(original_names):
    column_i = X_extra[name_i]

    # curvature terms
    X_extra[name_i + '^2'] = column_i ** 2
    X_extra[name_i + '^3'] = column_i ** 3

    # first-order interactions: pair feature i only with features j > i, so
    # each unordered pair appears once (X1*X2 but never X2*X1 or X1*X1)
    for name_j in original_names[i + 1:]:
        X_extra[name_i + '*' + name_j] = column_i * X_extra[name_j]


In [6]:
# Preview the frame after the squared, cubed and interaction columns have been appended.
X_extra.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex^2,Sex^3,...,Whole_weight*Shell_weight,Shucked_weight^2,Shucked_weight^3,Shucked_weight*Viscera_weight,Shucked_weight*Shell_weight,Viscera_weight^2,Viscera_weight^3,Viscera_weight*Shell_weight,Shell_weight^2,Shell_weight^3
0,0.0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0.0,0.0,...,0.0771,0.0504,0.011315,0.022675,0.033675,0.010201,0.00103,0.01515,0.0225,0.003375
1,0.0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0.0,0.0,...,0.015785,0.0099,0.000985,0.004826,0.006965,0.002352,0.000114,0.003395,0.0049,0.000343
2,1.0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,1.0,1.0,...,0.14217,0.065792,0.016876,0.036295,0.053865,0.020022,0.002833,0.029715,0.0441,0.009261
3,0.0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0.0,0.0,...,0.07998,0.04644,0.010008,0.024567,0.033403,0.012996,0.001482,0.01767,0.024025,0.003724
4,2.0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,4.0,8.0,...,0.011275,0.00801,0.000717,0.003535,0.004922,0.00156,6.2e-05,0.002172,0.003025,0.000166


In [7]:
# Expected width: 8 originals + 8 squares + 8 cubes + 28 pairwise interactions (8 choose 2) = 52 columns.
X_extra.shape

(4177, 52)

In [8]:
# Append the target as the LAST column, so the last row/column of the Spearman
# matrices computed below holds target-vs-feature results.
# NOTE: this mutates X_extra in place; 'target' is dropped again further down,
# before X_extra is compared with the basis-function output.
X_extra['target'] = y

In [9]:
# One more column than before: the appended 'target'.
X_extra.shape

(4177, 53)

In [10]:
# Pairwise Spearman rank correlation between every pair of columns of X_extra.
# The result holds two symmetric square matrices (coefficients and p-values),
# where entry [i][j] compares column i with column j. Because 'target' was
# appended as the last column, only the last row (equivalently the last column)
# matters here: it is the target compared against each feature.
res = stats.spearmanr(X_extra)

In [11]:
# Convert the spearmanr result into a single array (coefficient matrix and p-value matrix stacked along axis 0).
# NOTE: this rebinds `res` from the SciPy result object to an ndarray.
res = np.array(res)

In [12]:
# Expect (2, n_columns, n_columns): index 0 is used below as the coefficients, index 1 as the p-values.
res.shape

(2, 53, 53)

In [13]:
# res[0]: Spearman correlation matrix (the test statistic). It is square, with
# one row and one column per variable in X_extra.
# res[1]: p-values, same shape as the statistic, for a hypothesis test whose
# null hypothesis is that two samples have no ordinal correlation.
spearmean_coefs = res[0, ...]
p_vals = res[1, ...]

print(f"spearmean_coefs.shape={spearmean_coefs.shape}, p_vals={p_vals.shape}")

spearmean_coefs.shape=(53, 53), p_vals=(53, 53)


In [14]:
def check_symmetric(a, rtol=1e-05, atol=1e-08):
    """Return True when ``a`` equals its own transpose within tolerance.

    Parameters
    ----------
    a : array_like
        Matrix to test.
    rtol, atol : float
        Relative and absolute tolerances forwarded to ``np.allclose``.

    Returns
    -------
    bool
        False when the shape of ``a`` differs from the shape of ``a.T``
        (e.g. a non-square matrix), otherwise the result of the element-wise
        comparison of ``a`` with ``a.T``.
    """
    # https://stackoverflow.com/questions/42908334/checking-if-a-matrix-is-symmetric-in-numpy
    a = np.asarray(a)
    # Without this guard a (2, 1) input would be broadcast against its (1, 2)
    # transpose and could be reported as symmetric; other non-square shapes
    # would raise a broadcasting ValueError instead of answering False.
    if a.shape != a.T.shape:
        return False
    return np.allclose(a, a.T, rtol=rtol, atol=atol)

def check_diagonals_zero(a, rtol=1e-05, atol=1e-08):
    """Return True when every entry on the main diagonal of ``a`` is close to 0.

    ``rtol`` and ``atol`` are forwarded to ``np.allclose``. The reference value
    is 0 and the relative tolerance scales with the reference, so in practice
    only ``atol`` decides the outcome.
    """
    diagonal = np.diag(a)
    return np.allclose(diagonal, 0, rtol=rtol, atol=atol)

# Both matrices should be symmetric: comparing column i with column j gives the
# same result as comparing column j with column i.
print(check_symmetric(p_vals))
print(check_symmetric(spearmean_coefs))

# all features are exactly correlated with themselves, so we can confidently
# reject the null hypothesis (see above), i.e. the diagonal p-values should be 0
diagonal_p_vals_are_zero = check_diagonals_zero(p_vals)
if diagonal_p_vals_are_zero:
    print("All features spearman coefficients against themselves checks out wrt to p-val")
else:
    print("All features spearman coefficients against themselves DO NOT CHECK OUT wrt to p-val. something is wrong")

# Expected to print False: a column's correlation coefficient with itself is 1,
# not 0, so the diagonal of the coefficient matrix must NOT be all zeros.
print(check_diagonals_zero(spearmean_coefs))

True
True
All features spearman coefficients against themselves checks out wrt to p-val
False


In [15]:
# 'target' was appended as the final column of X_extra, so its row in the
# p-value matrix is the last one.
last_row_idx = p_vals.shape[0] - 1
p_vals_y_vs_all = p_vals[last_row_idx]
print(last_row_idx)
p_vals_y_vs_all.shape

52


(53,)

In [16]:
# Group every column by the smallest significance level its p-value falls
# under; anything at or above the largest level goes into the 'big' bucket.
# Note that 'target' is still a column of X_extra here, so it is bucketed too.
significance_levels = (0.001, 0.01, 0.05)
p_val_dict = {}

for feature_idx, p_val_y in enumerate(p_vals_y_vs_all):
    feature = X_extra.columns[feature_idx]
    bucket = next((level for level in significance_levels if p_val_y < level), 'big')
    p_val_dict.setdefault(bucket, []).append((feature, p_val_y))

In [17]:
# (column, p-value) pairs keyed by the smallest threshold the p-value is below; 'big' collects p >= 0.05.
p_val_dict

{0.001: [('Sex', 3.756295331246718e-167),
  ('Length', 0.0),
  ('Diameter', 0.0),
  ('Height', 0.0),
  ('Whole_weight', 0.0),
  ('Shucked_weight', 0.0),
  ('Viscera_weight', 0.0),
  ('Shell_weight', 0.0),
  ('Sex^2', 3.756295331246718e-167),
  ('Sex^3', 3.756295331246718e-167),
  ('Sex*Length', 2.9134442131416297e-44),
  ('Sex*Diameter', 4.141691172168033e-35),
  ('Sex*Height', 2.797776491102081e-23),
  ('Sex*Shell_weight', 4.6265674447400325e-05),
  ('Length^2', 0.0),
  ('Length^3', 0.0),
  ('Length*Diameter', 0.0),
  ('Length*Height', 0.0),
  ('Length*Whole_weight', 0.0),
  ('Length*Shucked_weight', 0.0),
  ('Length*Viscera_weight', 0.0),
  ('Length*Shell_weight', 0.0),
  ('Diameter^2', 0.0),
  ('Diameter^3', 0.0),
  ('Diameter*Height', 0.0),
  ('Diameter*Whole_weight', 0.0),
  ('Diameter*Shucked_weight', 0.0),
  ('Diameter*Viscera_weight', 0.0),
  ('Diameter*Shell_weight', 0.0),
  ('Height^2', 0.0),
  ('Height^3', 0.0),
  ('Height*Whole_weight', 0.0),
  ('Height*Shucked_weight', 0.0

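Based on the p-values above, it seems that all of these features are worth trying in order to capture curvature and first-order interactions in the data, so let's give it a shot. It seems highly unlikely that every one of them is genuinely informative (Spearman correlation is rank-based, so the square and cube of a non-negative feature get exactly the same coefficient and p-value as the feature itself, as seen for `Sex`, `Sex^2` and `Sex^3`), but it is worth trying.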

In [18]:
# X_extra_numpy = X_extra.to_numpy()
# np.save('./X_abalone_pretransformation_1.npy',X_extra_numpy)

In [19]:
# Build the "pre-transformation" frame together with a parallel list holding one
# basis function per column. Applying basis_funcs to the columns is meant to
# reproduce X_extra (without 'target'); this is verified at the end of the notebook.
X_pretransformation_nonlinear_1 = copy.deepcopy(X)

# The original columns are passed through unchanged.
basis_funcs = [pool_alice_code.identity_basis] * num_original_features

original_names = [X.columns[k] for k in range(num_original_features)]

for i, name_i in enumerate(original_names):
    # The '^2' and '^3' columns store the RAW feature; the matching basis
    # function is what is expected to square / cube it later.
    X_pretransformation_nonlinear_1[name_i + '^2'] = X[name_i]
    basis_funcs.append(pool_alice_code.squared_basis)

    X_pretransformation_nonlinear_1[name_i + '^3'] = X[name_i]
    basis_funcs.append(pool_alice_code.cubed_basis)

    # A basis function is registered per single column, so the interaction
    # product is computed here and paired with the identity. Only j > i is
    # used, so each unordered pair appears once (X1*X2 but never X2*X1).
    for name_j in original_names[i + 1:]:
        X_pretransformation_nonlinear_1[name_i + '*' + name_j] = X[name_i] * X[name_j]
        basis_funcs.append(pool_alice_code.identity_basis)


In [20]:
# Column order of the directly engineered frame, for comparison with X_pretransformation_nonlinear_1.
X_extra.columns

Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight',
       'Viscera_weight', 'Shell_weight', 'Sex^2', 'Sex^3', 'Sex*Length',
       'Sex*Diameter', 'Sex*Height', 'Sex*Whole_weight', 'Sex*Shucked_weight',
       'Sex*Viscera_weight', 'Sex*Shell_weight', 'Length^2', 'Length^3',
       'Length*Diameter', 'Length*Height', 'Length*Whole_weight',
       'Length*Shucked_weight', 'Length*Viscera_weight', 'Length*Shell_weight',
       'Diameter^2', 'Diameter^3', 'Diameter*Height', 'Diameter*Whole_weight',
       'Diameter*Shucked_weight', 'Diameter*Viscera_weight',
       'Diameter*Shell_weight', 'Height^2', 'Height^3', 'Height*Whole_weight',
       'Height*Shucked_weight', 'Height*Viscera_weight', 'Height*Shell_weight',
       'Whole_weight^2', 'Whole_weight^3', 'Whole_weight*Shucked_weight',
       'Whole_weight*Viscera_weight', 'Whole_weight*Shell_weight',
       'Shucked_weight^2', 'Shucked_weight^3', 'Shucked_weight*Viscera_weight',
       'Shucked_weight*Shell_

In [21]:
# Remove the helper 'target' column so X_extra holds features only again.
# An explicit membership check keeps the cell safe to re-run and lets genuine
# errors (e.g. X_extra not being defined) surface, instead of a bare `except`
# reporting every failure as "already removed".
if 'target' in X_extra.columns:
    X_extra = X_extra.drop('target',axis=1)
    print("successfully removed target column")
else:
    print("already removed 'target' from the dataframe X_extra")

successfully removed target column


In [22]:
# Element-wise comparison of the column labels; every entry should be True.
# Relies on 'target' having been dropped from X_extra, so both frames have the same number of columns.
X_pretransformation_nonlinear_1.columns == X_extra.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [23]:
# There must be exactly one basis function per column of the pre-transformation frame.
print(len(basis_funcs))
print(X_pretransformation_nonlinear_1.shape)

52
(4177, 52)


In [24]:
# Column labels of the pre-transformation frame (compared element-wise with X_extra.columns above).
X_pretransformation_nonlinear_1.columns

Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight',
       'Viscera_weight', 'Shell_weight', 'Sex^2', 'Sex^3', 'Sex*Length',
       'Sex*Diameter', 'Sex*Height', 'Sex*Whole_weight', 'Sex*Shucked_weight',
       'Sex*Viscera_weight', 'Sex*Shell_weight', 'Length^2', 'Length^3',
       'Length*Diameter', 'Length*Height', 'Length*Whole_weight',
       'Length*Shucked_weight', 'Length*Viscera_weight', 'Length*Shell_weight',
       'Diameter^2', 'Diameter^3', 'Diameter*Height', 'Diameter*Whole_weight',
       'Diameter*Shucked_weight', 'Diameter*Viscera_weight',
       'Diameter*Shell_weight', 'Height^2', 'Height^3', 'Height*Whole_weight',
       'Height*Shucked_weight', 'Height*Viscera_weight', 'Height*Shell_weight',
       'Whole_weight^2', 'Whole_weight^3', 'Whole_weight*Shucked_weight',
       'Whole_weight*Viscera_weight', 'Whole_weight*Shell_weight',
       'Shucked_weight^2', 'Shucked_weight^3', 'Shucked_weight*Viscera_weight',
       'Shucked_weight*Shell_

In [25]:
# Raw array that is handed to apply_basis_funcs: the '^2'/'^3' columns still hold the
# untransformed feature values here, while the interaction columns already hold products.
X_pretransformation_nonlinear_1.to_numpy()

array([[0.       , 0.455    , 0.365    , ..., 0.01515  , 0.15     ,
        0.15     ],
       [0.       , 0.35     , 0.265    , ..., 0.003395 , 0.07     ,
        0.07     ],
       [1.       , 0.53     , 0.42     , ..., 0.029715 , 0.21     ,
        0.21     ],
       ...,
       [0.       , 0.6      , 0.475    , ..., 0.08855  , 0.308    ,
        0.308    ],
       [1.       , 0.625    , 0.485    , ..., 0.077256 , 0.296    ,
        0.296    ],
       [0.       , 0.71     , 0.555    , ..., 0.1863675, 0.495    ,
        0.495    ]])

In [26]:
# Apply the basis functions to the pre-transformation array.
# Presumably basis_funcs[k] is applied to column k — TODO confirm against pool_alice_code.apply_basis_funcs.
X_transformed_1 = pool_alice_code.apply_basis_funcs(X_pretransformation_nonlinear_1.to_numpy(),basis_funcs)

In [27]:
# Both shapes should be identical before the two are compared row by row.
print(X_extra.shape)
print(X_transformed_1.shape)

(4177, 52)
(4177, 52)


In [28]:
# Label the transformed array with X_extra's column names so it can be compared visually with X_extra.head().
pd.DataFrame(X_transformed_1,columns=X_extra.columns).head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex^2,Sex^3,...,Whole_weight*Shell_weight,Shucked_weight^2,Shucked_weight^3,Shucked_weight*Viscera_weight,Shucked_weight*Shell_weight,Viscera_weight^2,Viscera_weight^3,Viscera_weight*Shell_weight,Shell_weight^2,Shell_weight^3
0,0.0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0.0,0.0,...,0.0771,0.0504,0.011315,0.022675,0.033675,0.010201,0.00103,0.01515,0.0225,0.003375
1,0.0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0.0,0.0,...,0.015785,0.0099,0.000985,0.004826,0.006965,0.002352,0.000114,0.003395,0.0049,0.000343
2,1.0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,1.0,1.0,...,0.14217,0.065792,0.016876,0.036295,0.053865,0.020022,0.002833,0.029715,0.0441,0.009261
3,0.0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0.0,0.0,...,0.07998,0.04644,0.010008,0.024567,0.033403,0.012996,0.001482,0.01767,0.024025,0.003724
4,2.0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,4.0,8.0,...,0.011275,0.00801,0.000717,0.003535,0.004922,0.00156,6.2e-05,0.002172,0.003025,0.000166


In [29]:
# Reference values: the same features computed directly with pandas arithmetic.
X_extra.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex^2,Sex^3,...,Whole_weight*Shell_weight,Shucked_weight^2,Shucked_weight^3,Shucked_weight*Viscera_weight,Shucked_weight*Shell_weight,Viscera_weight^2,Viscera_weight^3,Viscera_weight*Shell_weight,Shell_weight^2,Shell_weight^3
0,0.0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0.0,0.0,...,0.0771,0.0504,0.011315,0.022675,0.033675,0.010201,0.00103,0.01515,0.0225,0.003375
1,0.0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0.0,0.0,...,0.015785,0.0099,0.000985,0.004826,0.006965,0.002352,0.000114,0.003395,0.0049,0.000343
2,1.0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,1.0,1.0,...,0.14217,0.065792,0.016876,0.036295,0.053865,0.020022,0.002833,0.029715,0.0441,0.009261
3,0.0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0.0,0.0,...,0.07998,0.04644,0.010008,0.024567,0.033403,0.012996,0.001482,0.01767,0.024025,0.003724
4,2.0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,4.0,8.0,...,0.011275,0.00801,0.000717,0.003535,0.004922,0.00156,6.2e-05,0.002172,0.003025,0.000166


In [30]:
# Verify that applying the basis functions reproduces the directly engineered
# features, row for row.
expected_rows = X_extra.to_numpy()

# Fail loudly on a shape mismatch (e.g. 'target' still present in X_extra, or
# extra rows in X_transformed_1) instead of comparing only part of the data.
if expected_rows.shape != X_transformed_1.shape:
    raise Exception(
        f"shape mismatch: X_extra is {expected_rows.shape} but X_transformed_1 is {X_transformed_1.shape}"
    )

# One vectorised comparison instead of a Python-level loop over every row;
# np.isclose uses the same default tolerances as np.allclose.
rows_match = np.isclose(expected_rows, X_transformed_1).all(axis=1)
if not rows_match.all():
    row_idx = int(np.argmin(rows_match))  # index of the first mismatching row
    raise Exception(f"row {row_idx} is different than expected")

print('all the rows matched as expected!')

all the rows matched as expected!
