# Use Lasso to do feature selection first and then do polynomial regression

In [3]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the CSV; 'quality' is the target and every other column is a feature.
df = pd.read_csv('winequality-red.csv')

X = df.drop(columns=['quality'])
y = df['quality']

print("X shape: {}".format(X.shape), "y shape: {}".format(y.shape), sep="\n")

## 1. Build the Lasso model (assume alpha=0.1)
# The fit uses every row of X and y; no train/test split has been made yet.
# NOTE(review): rows that a later split holds out for testing therefore also
# influence which coefficients end up at zero here.
model = linear_model.Lasso(alpha=0.1).fit(X, y)

# The coefficients
print('Coefficients: {}\n'.format(model.coef_))

## 2. Show how many coefficients are 0 (and how many are not)
print('number of model 0_coef: {}'.format((model.coef_ == 0).sum()))
print('number of model coef: {}'.format((model.coef_ != 0).sum()))

X shape: (1599, 11)
y shape: (1599,)
Coefficients: [ 0.031408   -0.          0.          0.         -0.          0.00571672
 -0.00377281 -0.         -0.          0.          0.25583985]

number of model 0_coef: 7
number of model coef: 4


In [4]:
# Show the shape and the first rows of X before any columns are removed.
print('before feature extraction\nX shape: {}'.format(X.shape), X.head(), sep='\n')

before feature extraction
X shape: (1599, 11)
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  
0      9.4  
1      9.8  
2

In [5]:
# Boolean mask with one entry per coefficient of `model`: True where the
# coefficient is non-zero.
mask = model.coef_ != 0
print(mask)

## 3. Use the mask to drop the features whose coefficient is 0
# Re-derive the full feature frame from `df` instead of slicing `X` in place.
# The original `X = X.iloc[:, mask]` could only run once: a second run would
# apply the full-length mask to the already-reduced X and fail.
all_features = df.drop(['quality'], axis=1)
if mask.shape[0] != all_features.shape[1]:
    # Happens when `model` no longer refers to the estimator fitted on these
    # columns (e.g. it was reassigned by another cell).
    raise ValueError(
        'mask has {} entries but there are {} feature columns; '
        're-run the Lasso cell before this one'.format(
            mask.shape[0], all_features.shape[1]))
X = all_features.iloc[:, mask]

print('after feature extraction\nX shape: {}'.format(X.shape))
print(X.head())

[ True False False False False  True  True False False False  True]
after feature extraction
X shape: (1599, 4)
   fixed acidity  free sulfur dioxide  total sulfur dioxide  alcohol
0            7.4                 11.0                  34.0      9.4
1            7.8                 25.0                  67.0      9.8
2            7.8                 15.0                  54.0      9.8
3           11.2                 17.0                  60.0      9.8
4            7.4                 11.0                  34.0      9.4


In [6]:
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures

## 4. Expand the selected features with PolynomialFeatures (assume degree=2)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

# Hold out 30% of the rows for testing; random_state is the seed value.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.3, random_state=1
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test rows.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Linear regression on the scaled polynomial features.
model = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# The coefficients
print('Coefficients: {}\n'.format(model.coef_))
# The mean squared error on the held-out rows
print("Mean squared error: {}".format(mean_squared_error(y_test, y_pred)))
# R2 (coefficient of determination): 1 is perfect prediction
print('R2 score: {}'.format(r2_score(y_test, y_pred)))

Coefficients: [ 0.          1.12085207 -0.43204477  0.80289019  0.24925916 -0.64598173
  0.02024509 -0.63958558 -0.2003944  -0.1239072  -0.07734033  0.72422259
  0.16450726 -0.46554536  0.17511712]

Mean squared error: 0.4712614099778648
R2 score: 0.22571593400246703
