In [1]:
# imports 
import numpy as np
import sklearn.datasets as skdata
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

Output the correlation matrix and select 2 features with highest correlation with the target variable.

In [20]:
# load the iris dataset

iris = skdata.load_iris()


# get the data and target
X = np.asarray(iris.data)
y = np.asarray(iris.target)


# split the data 80 20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# output a correlation matrix of the features
cm = np.corrcoef(X_train.T, y_train)
cm_with_target = cm[-1][:-1]
top_2_features_indices = np.argsort(cm_with_target)[-2:]
top_2_features = X_train[:, top_2_features_indices]

print(cm)

print("Top 2 features are: ", top_2_features_indices)


[[ 1.         -0.08683167  0.88888956  0.84167128  0.82065235]
 [-0.08683167  1.         -0.3750757  -0.32000041 -0.37861688]
 [ 0.88888956 -0.3750757   1.          0.9649918   0.95470773]
 [ 0.84167128 -0.32000041  0.9649918   1.          0.96117848]
 [ 0.82065235 -0.37861688  0.95470773  0.96117848  1.        ]]
Top 2 features are:  [2 3]


Train an OLS model (linear regression) using the 2 features and report the Mean Absolute Error (MAE) on the testing dataset

In [21]:
# train an OLS regressor using the top 2 features and MAE 
# on the test set

model = LinearRegression()
model.fit(top_2_features, y_train)
y_pred = model.predict(X_test[:, top_2_features_indices])
mae = mean_absolute_error(y_test, y_pred)

print("MAE: ", mae)



MAE:  0.21690380886600008


Now train a quadratic model by transforming the 2 features in polynomial features and training a linear regression model. Report the Mean Absolute Error (MAE) on the testing dataset.

In [24]:

# transform the features into a polynomial feature space
poly = PolynomialFeatures(2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# train an OLS regressor using the polynomial features and MAE
model = LinearRegression()
model.fit(X_train_poly, y_train)
y_pred = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, y_pred)

print("MAE: ", mae)


MAE:  0.19331155050293164
