In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pokemon_df = pd.read_csv("../../data/pokemon.csv")

ansur_male = pd.read_csv("../../data/ANSUR_II_MALE.csv")
ansur_female = pd.read_csv("../../data/ANSUR_II_FEMALE.csv")

diabetes_df = pd.read_csv('../../data/PimaIndians.csv')

## Building a diabetes classifier
### You'll be using the Pima Indians diabetes dataset to predict whether a person has diabetes using logistic regression. There are 8 features and one target in this dataset. The data has been split into a training and test set and pre-loaded for you as X_train, y_train, X_test, and y_test.

### A StandardScaler() instance has been predefined as scaler and a LogisticRegression() one as lr.

### Instructions
-    Fit the scaler on the training features and transform these features in one go.
-    Fit the logistic regression model on the scaled training data.
-    Scale the test features.
-    Predict diabetes presence on the scaled test set.

In [2]:
predictors_vars = ['pregnant','glucose','diastolic','triceps','insulin','bmi','family','age']
target_var = ['test']

X = diabetes_df[predictors_vars]
y = diabetes_df[target_var]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

scaler = StandardScaler()
lr = LogisticRegression()

In [3]:
# Fit the scaler on the training features and transform these in one go
X_train_std = scaler.fit_transform(X_train)

# Fit the logistic regression model on the scaled training data
lr.fit(X_train_std, y_train)

# Scale the test features
X_test_std = scaler.transform(X_test)

# Predict diabetes presence on the scaled test set
y_pred = lr.predict(X_test_std)

# Prints accuracy metrics and feature coefficients
print(f"{accuracy_score(y_test, y_pred):.1%} accuracy on test set.")
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

77.1% accuracy on test set.
{'pregnant': 0.28, 'glucose': 1.06, 'diastolic': 0.05, 'triceps': 0.02, 'insulin': 0.07, 'bmi': 0.63, 'family': 0.36, 'age': 0.34}


  y = column_or_1d(y, warn=True)


## Manual Recursive Feature Elimination
### Now that we've created a diabetes classifier, let's see if we can reduce the number of features without hurting the model accuracy too much.

### On the second line of code the features are selected from the original DataFrame. Adjust this selection.

### A StandardScaler() instance has been predefined as scaler and a LogisticRegression() one as lr.

### All necessary functions and packages have been pre-loaded too.

### Instructions 1/3
-    First, run the given code, then remove the feature with the lowest model coefficient from X.

In [4]:
# Remove the feature with the lowest model coefficient
X = diabetes_df[['pregnant', 'glucose', 'triceps', 'insulin', 'bmi', 'family', 'age']]

# Performs a 25-75% train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Scales features and fits the logistic regression model
lr.fit(scaler.fit_transform(X_train), y_train)

# Calculates the accuracy on the test set and prints coefficients
acc = accuracy_score(y_test, lr.predict(scaler.transform(X_test)))
print(f"{acc:.1%} accuracy on test set.") 
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

80.6% accuracy on test set.
{'pregnant': 0.05, 'glucose': 1.24, 'triceps': 0.24, 'insulin': 0.2, 'bmi': 0.39, 'family': 0.34, 'age': 0.35}


  y = column_or_1d(y, warn=True)


### Instructions 2/3
-    Run the code and remove 2 more features with the lowest model coefficients.

In [5]:
# Remove the 2 features with the lowest model coefficients
X = diabetes_df[['glucose', 'triceps', 'bmi', 'family', 'age']]

# Performs a 25-75% train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Scales features and fits the logistic regression model
lr.fit(scaler.fit_transform(X_train), y_train)

# Calculates the accuracy on the test set and prints coefficients
acc = accuracy_score(y_test, lr.predict(scaler.transform(X_test)))
print(f"{acc:.1%} accuracy on test set.") 
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

79.6% accuracy on test set.
{'glucose': 1.13, 'triceps': 0.25, 'bmi': 0.34, 'family': 0.34, 'age': 0.37}


  y = column_or_1d(y, warn=True)


### Instructions 3/3
-    Run the code and only keep the feature with the highest coefficient.

In [6]:
# Only keep the feature with the highest coefficient
X = diabetes_df[['glucose']]

# Performs a 25-75% train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Scales features and fits the logistic regression model to the data
lr.fit(scaler.fit_transform(X_train), y_train)

# Calculates the accuracy on the test set and prints coefficients
acc = accuracy_score(y_test, lr.predict(scaler.transform(X_test)))
print(f"{acc:.1%} accuracy on test set.")  
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

75.5% accuracy on test set.
{'glucose': 1.28}


  y = column_or_1d(y, warn=True)


## Automatic Recursive Feature Elimination
### Now let's automate this recursive process. Wrap a Recursive Feature Eliminator (RFE) around our logistic regression estimator and pass it the desired number of features.

### All the necessary functions and packages have been pre-loaded and the features have been scaled for you.

### Instructions
-    Create the RFE with a LogisticRegression() estimator and 3 features to select.
-    Print the features and their ranking.
-    Print the features that are not eliminated.

In [7]:
from sklearn.feature_selection import RFE

predictors_vars = ['pregnant','glucose','diastolic','triceps','insulin','bmi','family','age']
target_var = ['test']

X = diabetes_df[predictors_vars]
y = diabetes_df[target_var]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [8]:
# Create the RFE with a LogisticRegression estimator and 3 features to select
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=3, verbose=1)

# Fits the eliminator to the data
rfe.fit(X_train, y_train)

# Print the features and their ranking (high = dropped early on)
print(dict(zip(X.columns, rfe.ranking_)))

# Print the features that are not eliminated
print(X.columns[rfe.support_])

# Calculates the test set accuracy
acc = accuracy_score(y_test, rfe.predict(X_test))
print(f"{acc:.1%} accuracy on test set.") 

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
{'pregnant': 2, 'glucose': 3, 'diastolic': 5, 'triceps': 4, 'insulin': 6, 'bmi': 1, 'family': 1, 'age': 1}
Index(['bmi', 'family', 'age'], dtype='object')
72.9% accuracy on test set.


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


## Building a random forest model
### You'll again work on the Pima Indians dataset to predict whether an individual has diabetes. This time using a random forest classifier. You'll fit the model on the training data after performing the train-test split and consult the feature importance values.

### The feature and target datasets have been pre-loaded for you as X and y. Same goes for the necessary packages and functions.

### Instructions
-    Set a 25% test size to perform a 75%-25% train-test split.
-    Fit the random forest classifier to the training data.
-    Calculate the accuracy on the test set.
-    Print the feature importances per feature.

In [9]:
# Perform a 75% training and 25% test data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Fit the random forest model to the training data
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# Calculate the accuracy
acc = accuracy_score(y_test, rf.predict(X_test))

# Print the importances per feature
print(dict(zip(X.columns, rf.feature_importances_.round(2))))

# Print accuracy
print(f"{acc:.1%} accuracy on test set.") 

  return fit_method(estimator, *args, **kwargs)


{'pregnant': 0.07, 'glucose': 0.25, 'diastolic': 0.09, 'triceps': 0.09, 'insulin': 0.14, 'bmi': 0.12, 'family': 0.12, 'age': 0.13}
79.6% accuracy on test set.


## Random forest for feature selection
### Now lets use the fitted random model to select the most important features from our input dataset X.

### The trained model from the previous exercise has been pre-loaded for you as rf.

### Instructions 1/2
-    Create a mask for features with an importance higher than 0.15.

In [10]:
# Create a mask for features importances above the threshold
mask = rf.feature_importances_ > 0.15

# Prints out the mask
print(mask)

[False  True False False False False False False]


### Instructions 2/2
-    Sub-select the most important features by applying the mask to X.

In [11]:
# Create a mask for features importances above the threshold
mask = rf.feature_importances_ > 0.15

# Apply the mask to the feature dataset X
reduced_X = X.loc[:, mask]

# prints out the selected column names
print(reduced_X.columns)

Index(['glucose'], dtype='object')


## Recursive Feature Elimination with random forests
### You'll wrap a Recursive Feature Eliminator around a random forest model to remove features step by step. This method is more conservative compared to selecting features after applying a single importance threshold. Since dropping one feature can influence the relative importances of the others.

### You'll need these pre-loaded datasets: X, X_train, y_train.

### Functions and classes that have been pre-loaded for you are: RandomForestClassifier(), RFE(), train_test_split().

### Instructions 1/4
-    Create a recursive feature eliminator that will select the 2 most important features using a random forest model.

In [12]:
# Wrap the feature eliminator around the random forest model
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=2, verbose=1)

### Instructions 2/4
-    Fit the recursive feature eliminator to the training data.

In [13]:
# Wrap the feature eliminator around the random forest model
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=2, verbose=1)

# Fit the model to the training data
rfe.fit(X_train, y_train)

Fitting estimator with 8 features.
Fitting estimator with 7 features.


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Fitting estimator with 6 features.
Fitting estimator with 5 features.


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Fitting estimator with 4 features.
Fitting estimator with 3 features.


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


### Instructions 3/4
-    Create a mask using the fitted eliminator's support_ attribute, then apply it to the feature dataset X.

In [14]:
# Wrap the feature eliminator around the random forest model
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=2, verbose=1)

# Fit the model to the training data
rfe.fit(X_train, y_train)

# Create a mask using the support_ attribute of rfe
mask = rfe.support_

# Apply the mask to the feature dataset X and print the result
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

  return fit_method(estimator, *args, **kwargs)


Fitting estimator with 8 features.
Fitting estimator with 7 features.


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Fitting estimator with 6 features.
Fitting estimator with 5 features.


  return fit_method(estimator, *args, **kwargs)


Fitting estimator with 4 features.
Fitting estimator with 3 features.


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Index(['glucose', 'insulin'], dtype='object')


  return fit_method(estimator, *args, **kwargs)


### Instructions 4/4
-    Change the settings of RFE() to eliminate 2 features at each step.

In [15]:
# Set the feature eliminator to remove 2 features on each step
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=2, step=2, verbose=1)

# Fit the model to the training data
rfe.fit(X_train, y_train)

# Create a mask
mask = rfe.support_

# Apply the mask to the feature dataset X and print the result
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

  return fit_method(estimator, *args, **kwargs)


Fitting estimator with 8 features.
Fitting estimator with 6 features.


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Fitting estimator with 4 features.


  return fit_method(estimator, *args, **kwargs)


Index(['glucose', 'bmi'], dtype='object')


## Creating a LASSO regressor
### You'll be working on the numeric ANSUR body measurements dataset to predict a persons Body Mass Index (BMI) using the pre-imported Lasso() regressor. BMI is a metric derived from body height and weight but those two features have been removed from the dataset to give the model a challenge.

### You'll standardize the data first using the StandardScaler() that has been instantiated for you as scaler to make sure all coefficients face a comparable regularizing force trying to bring them down.

### All necessary functions and classes plus the input datasets X and y have been pre-loaded.

### Instructions
-    Set the test size to 30% to get a 70-30% train test split.
-    Fit the scaler on the training features and transform these in one go.
-    Create the Lasso model.
-    Fit it to the scaled training data.

In [16]:
from sklearn.linear_model import Lasso

cols = ['abdominalextensiondepthsitting', 'acromialheight', 'acromionradialelength', 'anklecircumference', 'axillaheight', 'balloffootcircumference', 'balloffootlength', 'biacromialbreadth',
       'bicepscircumferenceflexed', 'bicristalbreadth', 'bideltoidbreadth', 'bimalleolarbreadth', 'bitragionchinarc', 'bitragionsubmandibulararc', 'bizygomaticbreadth', 'buttockcircumference',
       'buttockdepth', 'buttockheight', 'buttockkneelength', 'buttockpopliteallength', 'calfcircumference', 'cervicaleheight', 'chestbreadth', 'chestcircumference', 'chestdepth', 'chestheight',
       'crotchheight', 'crotchlengthomphalion', 'crotchlengthposterioromphalion', 'earbreadth', 'earlength', 'earprotrusion', 'elbowrestheight', 'eyeheightsitting', 'footbreadthhorizontal',
       'footlength', 'forearmcenterofgriplength', 'forearmcircumferenceflexed', 'forearmforearmbreadth', 'forearmhandlength', 'functionalleglength', 'handbreadth', 'handcircumference', 'handlength',
       'headbreadth', 'headcircumference', 'headlength', 'heelanklecircumference', 'heelbreadth', 'hipbreadth', 'hipbreadthsitting', 'iliocristaleheight', 'interpupillarybreadth', 'interscyei',
       'interscyeii', 'kneeheightmidpatella', 'kneeheightsitting', 'lateralfemoralepicondyleheight', 'lateralmalleolusheight', 'lowerthighcircumference', 'mentonsellionlength', 'neckcircumference',
       'neckcircumferencebase', 'overheadfingertipreachsitting', 'palmlength', 'poplitealheight', 'radialestylionlength', 'shouldercircumference', 'shoulderelbowlength', 'shoulderlength',
       'sittingheight', 'sleevelengthspinewrist', 'sleeveoutseam', 'span', 'suprasternaleheight', 'tenthribheight', 'thighcircumference', 'thighclearance', 'thumbtipreach', 'tibialheight',
       'tragiontopofhead', 'trochanterionheight', 'verticaltrunkcircumferenceusa', 'waistbacklength', 'waistbreadth', 'waistcircumference', 'waistdepth', 'waistfrontlengthsitting',
       'waistheightomphalion', 'wristcircumference', 'wristheight']

ansur = pd.concat([ansur_male, ansur_female])
X = ansur[cols]
y = ansur['BMI']

scaler = StandardScaler()

In [17]:
# Set the test size to 30% to get a 70-30% train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Fit the scaler on the training features and transform these in one go
X_train_std = scaler.fit_transform(X_train)

# Create the Lasso model
la = Lasso()

# Fit it to the standardized training data
la.fit(X_train_std, y_train)

## Lasso model results
### Now that you've trained the Lasso model, you'll score its predictive capacity (R^2) on the test set and count how many features are ignored because their coefficient is reduced to zero.

### The X_test and y_test datasets have been pre-loaded for you.

### The Lasso() model and StandardScaler() have been instantiated as la and scaler respectively and both were fitted to the training data.

### Instructions
-    Transform the test set with the pre-fitted scaler.
-    Calculate the R^2 value on the scaled test data.
-    Create a list that has True values when coefficients equal 0.
-    Calculate the total number of features with a coefficient of 0.

In [18]:
# Transform the test set with the pre-fitted scaler
X_test_std = scaler.transform(X_test)

# Calculate the coefficient of determination (R squared) on X_test_std
r_squared = la.score(X_test_std, y_test)
print(f"The model can predict {r_squared:.1%} of the variance in the test set.")

# Create a list that has True values when coefficients equal 0
zero_coef = la.coef_ == 0

# Calculate how many features have a zero coefficient
n_ignored = sum(zero_coef)
print(f"The model has ignored {n_ignored} out of {len(la.coef_)} features.")

The model can predict 82.9% of the variance in the test set.
The model has ignored 81 out of 91 features.


## Adjusting the regularization strength
### Your current Lasso model has an R^2 score of 84.7%. When a model applies overly powerful regularization it can suffer from high bias, hurting its predictive power.

### Let's improve the balance between predictive power and model simplicity by tweaking the alpha parameter.

### Instructions
-    Find the highest value for alpha that gives an R^2 value above 98% from the options: 1, 0.5, 0.1, and 0.01.

In [19]:
# Find the highest alpha value with R-squared above 98%
la = Lasso(alpha=0.1, random_state=0)

# Fits the model and calculates performance stats
la.fit(X_train_std, y_train)
r_squared = la.score(X_test_std, y_test)
n_ignored_features = sum(la.coef_ == 0)

# Print peformance stats 
print(f"The model can predict {r_squared:.1%} of the variance in the test set.")
print(f"{n_ignored_features} out of {len(la.coef_)} features were ignored.")

The model can predict 97.8% of the variance in the test set.
68 out of 91 features were ignored.


## Creating a LassoCV regressor
### You'll be predicting biceps circumference on a subsample of the male ANSUR dataset using the LassoCV() regressor that automatically tunes the regularization strength (alpha value) using Cross-Validation.

### The standardized training and test data has been pre-loaded for you as X_train, X_test, y_train, and y_test.

### Instructions
-    Create and fit the LassoCV model on the training set.
-    Calculate R^2 on the test set.
-    Create a mask for coefficients not equal to zero.

In [20]:
from sklearn.linear_model import LassoCV

# Create and fit the LassoCV model on the training set
lcv = LassoCV()
lcv.fit(X_train, y_train)
print(f'Optimal alpha = {lcv.alpha_:.3f}')

# Calculate R squared on the test set
r_squared = lcv.score(X_test, y_test)
print(f'The model explains {r_squared:.1%} of the test set variance')

# Create a mask for coefficients not equal to zero
lcv_mask = lcv.coef_ != 0
print(f'{sum(lcv_mask)} features out of {len(lcv_mask)} selected')

Optimal alpha = 0.406
The model explains 98.6% of the test set variance
41 features out of 91 selected


  model = cd_fast.enet_coordinate_descent(


## Ensemble models for extra votes
### The LassoCV() model selected 22 out of 32 features. Not bad, but not a spectacular dimensionality reduction either. Let's use two more models to select the 10 features they consider most important using the Recursive Feature Eliminator (RFE).

### The standardized training and test data has been pre-loaded for you as X_train, X_test, y_train, and y_test.

### Instructions 1/4
-    Select 10 features with RFE on a GradientBoostingRegressor and drop 3 features on each step.

### Instructions 2/4
Calculate the R^2 on the test set.

In [21]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor

# Select 10 features with RFE on a GradientBoostingRegressor, drop 3 features on each step
rfe_gb = RFE(estimator=GradientBoostingRegressor(), 
             n_features_to_select=10, step=3, verbose=1)
rfe_gb.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_gb.score(X_test, y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set')

Fitting estimator with 91 features.
Fitting estimator with 88 features.
Fitting estimator with 85 features.
Fitting estimator with 82 features.
Fitting estimator with 79 features.
Fitting estimator with 76 features.
Fitting estimator with 73 features.
Fitting estimator with 70 features.
Fitting estimator with 67 features.
Fitting estimator with 64 features.
Fitting estimator with 61 features.
Fitting estimator with 58 features.
Fitting estimator with 55 features.
Fitting estimator with 52 features.
Fitting estimator with 49 features.
Fitting estimator with 46 features.
Fitting estimator with 43 features.
Fitting estimator with 40 features.
Fitting estimator with 37 features.
Fitting estimator with 34 features.
Fitting estimator with 31 features.
Fitting estimator with 28 features.
Fitting estimator with 25 features.
Fitting estimator with 22 features.
Fitting estimator with 19 features.
Fitting estimator with 16 features.
Fitting estimator with 13 features.
The model can explain 96.9% 

### Instructions 3/4
-    Assign the support array of the fitted model to gb_mask.

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor

# Select 10 features with RFE on a GradientBoostingRegressor, drop 3 features on each step
rfe_gb = RFE(estimator=GradientBoostingRegressor(), 
             n_features_to_select=10, step=3, verbose=1)
rfe_gb.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_gb.score(X_test, y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set')

# Assign the support array to gb_mask
gb_mask = rfe_gb.support_

Fitting estimator with 91 features.
Fitting estimator with 88 features.
Fitting estimator with 85 features.
Fitting estimator with 82 features.
Fitting estimator with 79 features.


### Instructions 4/4
-    Modify the first step to select 10 features with RFE on a RandomForestRegressor() and drop 3 features on each step.

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

# Select 10 features with RFE on a RandomForestRegressor, drop 3 features on each step
rfe_rf = RFE(estimator=RandomForestRegressor(), 
             n_features_to_select=10, step=3, verbose=1)
rfe_rf.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_rf.score(X_test, y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set')

# Assign the support array to rf_mask
rf_mask = rfe_rf.support_

## Combining 3 feature selectors
### We'll combine the votes of the 3 models you built in the previous exercises, to decide which features are important into a meta mask. We'll then use this mask to reduce dimensionality and see how a simple linear regressor performs on the reduced dataset.

### The per model votes have been pre-loaded as lcv_mask, rf_mask, and gb_mask and the feature and target datasets as X and y.

### Instructions 1/4
-    Sum the votes of the three models using np.sum().

In [None]:
# Sum the votes of the three models
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)
print(votes)

### Instructions 2/4
-    Create a mask for features selected by all 3 models

In [None]:
# Sum the votes of the three models
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)

# Create a mask for features selected by all 3 models
meta_mask = votes>= 3
print(meta_mask)

### Instructions 3/4
-    Apply the dimensionality reduction on X and print which features were selected.

In [None]:
# Sum the votes of the three models
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)

# Create a mask for features selected by all 3 models
meta_mask = votes == 3

# Apply the dimensionality reduction on X
X_reduced = X.loc[:, meta_mask]
print(X_reduced.columns)

### Instructions 4/4
-    Plug the reduced dataset into the code for simple linear regression that has been written for you.

In [None]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

In [None]:
# Sum the votes of the three models
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)

# Create a mask for features selected by all 3 models
meta_mask = votes == 3

# Apply the dimensionality reduction on X
X_reduced = X.loc[:, meta_mask]

# Plug the reduced dataset into a linear regression pipeline
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=0)
lm.fit(scaler.fit_transform(X_train), y_train)
r_squared = lm.score(scaler.transform(X_test), y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set using {len(lm.coef_)} features.')