In [1]:
# imports 
import numpy as np
import sklearn.datasets as skdata
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

Output the correlation matrix and select the 2 features that have the highest correlation with the target variable.

In [2]:
# Load the iris dataset bundled with scikit-learn.
iris = skdata.load_iris()

# Feature matrix and target vector as NumPy arrays.
X = np.asarray(iris.data)
y = np.asarray(iris.target)

# 80/20 train/test split; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

# Correlation matrix of the training features together with the target.
# np.corrcoef treats each row as one variable, so the features are transposed
# and y_train is appended as the last row/column of the result.
cm = np.corrcoef(X_train.T, y_train)

# Last row without its final entry (the target's correlation with itself):
# the signed correlation of each feature with the target.
cm_with_target = cm[-1][:-1]

# Rank by absolute value so that a strongly negative correlation counts as
# strong as well. argsort is ascending, so the last two indices are the two
# features most correlated with the target.
top_2_features_indices = np.argsort(np.abs(cm_with_target))[-2:]
top_2_features = X_train[:, top_2_features_indices]

print(cm)

print("Top 2 features are: ", top_2_features_indices)

# Show the two selected training columns labelled with their feature names.
df = pd.DataFrame(data=X_train, columns=iris.feature_names).iloc[:, top_2_features_indices]
print(df)



[[ 1.         -0.16947251  0.86302463  0.81180492  0.76981913]
 [-0.16947251  1.         -0.49018343 -0.41966703 -0.47756569]
 [ 0.86302463 -0.49018343  1.          0.96233811  0.94969243]
 [ 0.81180492 -0.41966703  0.96233811  1.          0.95944743]
 [ 0.76981913 -0.47756569  0.94969243  0.95944743  1.        ]]
Top 2 features are:  [2 3]
     petal length (cm)  petal width (cm)
0                  4.6               1.5
1                  5.0               2.0
2                  6.1               2.3
3                  1.4               0.2
4                  5.3               2.3
..                 ...               ...
115                6.9               2.3
116                4.2               1.5
117                5.9               2.3
118                4.0               1.0
119                5.6               2.2

[120 rows x 2 columns]


Train an OLS model (linear regression) using the 2 features and report the Mean Absolute Error (MAE) on the testing dataset.

In [3]:
# Fit ordinary least squares on the two selected training features,
# then score the fit with mean absolute error on the held-out test set.
model = LinearRegression().fit(top_2_features, y_train)

# The test set must be reduced to the same two columns used for training.
X_test_top_2 = X_test[:, top_2_features_indices]
y_pred = model.predict(X_test_top_2)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print("MAE: ", mae)



MAE:  0.17212330416759655


Now train a quadratic model by transforming the 2 features into polynomial features and training a linear regression model on them. Report the Mean Absolute Error (MAE) on the testing dataset.

In [4]:

# Expand the two selected features into degree-2 polynomial terms.
# include_bias=False leaves the constant column out of the expansion.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Fit the expansion on the training columns only, then apply the same
# transformation to the matching test columns.
X_train_poly = poly.fit_transform(top_2_features)
X_test_top_2 = X_test[:, top_2_features_indices]
X_test_poly = poly.transform(X_test_top_2)

# Train an OLS regressor on the polynomial features and report test MAE.
# y_pred and mae are reassigned here with the quadratic model's values.
poly_model = LinearRegression().fit(X_train_poly, y_train)
y_pred = poly_model.predict(X_test_poly)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print("MAE: ", mae)


MAE:  0.17651276620569006
