In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

# Data

In [2]:
# load the data from a filepath
data_filepath = 'data/train.csv'

# create the dataframe
raw_df = pd.read_csv(data_filepath)
display(raw_df.head())

# Split into features and labels (still dataframes)
X_df = raw_df.iloc[:, 2:]
y_df = raw_df.iloc[:, 1]

display(X_df.head())
display(y_df.head())

# Create numpy arrays from the pandas dataframes
#X = X_df.to_numpy()
X = MinMaxScaler().fit_transform(X_df)
y = y_df.to_numpy()

print(X.shape, y.shape)

Unnamed: 0,Id,y,x1,x2,x3,x4,x5
0,0,3.57962,0.02,0.05,-0.09,-0.43,-0.08
1,1,0.734869,-0.13,0.11,-0.08,-0.29,-0.03
2,2,4.287433,0.08,0.06,-0.07,-0.41,-0.03
3,3,8.347827,0.02,-0.12,0.01,-0.43,-0.02
4,4,-0.052718,-0.14,-0.12,-0.08,-0.02,-0.08


Unnamed: 0,x1,x2,x3,x4,x5
0,0.02,0.05,-0.09,-0.43,-0.08
1,-0.13,0.11,-0.08,-0.29,-0.03
2,0.08,0.06,-0.07,-0.41,-0.03
3,0.02,-0.12,0.01,-0.43,-0.02
4,-0.14,-0.12,-0.08,-0.02,-0.08


0    3.579620
1    0.734869
2    4.287433
3    8.347827
4   -0.052718
Name: y, dtype: float64

(700, 5) (700,)


## Feature transformations

* Linear

  $$\phi_1(\mathbf{x}) = x_1,~ \phi_2(\mathbf{x}) = x_2,~ \phi_3(\mathbf{x}) = x_3,~ \phi_4(\mathbf{x}) = x_4,~ \phi_5(\mathbf{x}) = x_5,~$$
  
* Quadratic

  $$\phi_6(\mathbf{x}) = x^2_1,~ \phi_7(\mathbf{x}) = x^2_2,~ \phi_8(\mathbf{x}) = x^2_3,~ \phi_9(\mathbf{x}) = x^2_4,~ \phi_{10}(\mathbf{x}) = x^2_5,~$$

* Exponential

    $$\phi_{11}(\mathbf{x}) = e^{x_1},~ \phi_{12}(\mathbf{x}) = e^{x_2},~ \phi_{13}(\mathbf{x}) = e^{x_3},~ \phi_{14}(\mathbf{x}) = e^{x_4},~ \phi_{15}(\mathbf{x}) = e^{x_5}$$
    
* Cosine

    $$\phi_{16}(\mathbf{x}) = \textrm{cos}(x_1),~ \phi_{17}(\mathbf{x}) = \textrm{cos}(x_2),~ \phi_{18}(\mathbf{x}) = \textrm{cos}(x_3),~ \phi_{19}(\mathbf{x}) = \textrm{cos}(x_4),~ \phi_{20}(\mathbf{x}) = \textrm{cos}(x_5)$$

* Constant

    $$\phi_{21}(\mathbf{x})=1$$

In [3]:
def transform_features(data):
    linear = data.copy()

    quadratic = np.multiply(data, data)

    exponential = np.exp(data)

    cosine = np.cos(data)

    constant = np.ones(700).reshape(700, 1)
    
    assert((linear == data).all())

    return np.hstack((linear, quadratic, exponential, cosine, constant))

In [4]:
X = transform_features(X)

np.set_printoptions(suppress=True)
display(X.shape)
display(X[0, :])

(700, 21)

array([0.56626506, 0.6025641 , 0.48148148, 0.09375   , 0.76190476,
       0.32065612, 0.3630835 , 0.23182442, 0.00878906, 0.58049887,
       1.761675  , 1.82679689, 1.61847036, 1.09828514, 2.14235301,
       0.84391059, 0.8238851 , 0.88630983, 0.99560869, 0.72352247,
       1.        ])

# Linear regression
## Training

In [5]:
reg = LinearRegression().fit(X, y)

In [6]:
display(reg.coef_)
display(reg.score(X, y))

array([-1340.32042514,   -79.35369453,  -426.14899991,   747.2814417 ,
       -1515.35178074, -3292.84009675,   -93.63734621, -1128.35348962,
        1999.90296035, -3494.70398959,  1372.76919139,   103.24976491,
         421.8694849 ,  -752.21326689,  1544.21024437, -4949.96656452,
           2.95127537, -1797.89906342,  3141.79755364, -5123.36742139,
           0.        ])

0.032197598187658105

In [7]:
np.savetxt('submission.csv', reg.coef_, delimiter='\n', fmt="%s")