In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from pyprojroot import here

In [2]:
acs = pd.read_csv(here("data/acs_ny.csv"))
acs_dummy = pd.get_dummies(acs)


In [3]:
y = acs_dummy.pop("FamilyIncome")

X_train, X_test, y_train, y_test = train_test_split(
    acs_dummy, y, random_state=42
)

In [4]:
np.mean(cross_val_score(LinearRegression(),
                        X_train, y_train, cv=10))

0.3428980531486573

In [5]:
np.mean(cross_val_score(
        Ridge(), X_train, y_train, cv=10))

0.3429179829718705

In [6]:
param_grid = {'alpha': np.logspace(-3, 3, 14)} # Return numbers spaced evenly on a log scale
print(param_grid)

{'alpha': array([1.00000000e-03, 2.89426612e-03, 8.37677640e-03, 2.42446202e-02,
       7.01703829e-02, 2.03091762e-01, 5.87801607e-01, 1.70125428e+00,
       4.92388263e+00, 1.42510267e+01, 4.12462638e+01, 1.19377664e+02,
       3.45510729e+02, 1.00000000e+03])}


In [7]:
grid = GridSearchCV(Ridge(),
                    param_grid, cv=10, return_train_score=True)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': array([1.00000000e-03, 2.89426612e-03, 8.37677640e-03, 2.42446202e-02,
       7.01703829e-02, 2.03091762e-01, 5.87801607e-01, 1.70125428e+00,
       4.92388263e+00, 1.42510267e+01, 4.12462638e+01, 1.19377664e+02,
       3.45510729e+02, 1.00000000e+03])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [8]:
print(grid.best_params_)
print(grid.best_score_)

{'alpha': 41.246263829013564}
0.34305471808738697


In [9]:
from sklearn.linear_model import Lasso

lasso = Lasso().fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(X_test, y_test)))
print("Number of features used:", np.sum(lasso.coef_ != 0))

Training set score: 0.35
Test set score: 0.33
Number of features used: 44


In [10]:
#param_grid = {'l1_ratio': np.linspace(0, 1, 11)} # Return numbers spaced evenly
#param_grid

param_grid = {'l1_ratio': np.logspace(-3, 3, 14), 'alpha': [1]}
param_grid

{'l1_ratio': array([1.00000000e-03, 2.89426612e-03, 8.37677640e-03, 2.42446202e-02,
        7.01703829e-02, 2.03091762e-01, 5.87801607e-01, 1.70125428e+00,
        4.92388263e+00, 1.42510267e+01, 4.12462638e+01, 1.19377664e+02,
        3.45510729e+02, 1.00000000e+03]),
 'alpha': [1]}

In [11]:
from sklearn.linear_model import ElasticNet
grid = GridSearchCV(ElasticNet(),
                    param_grid, cv=5, return_train_score=True)
grid.fit(X_train, y_train)

  positive)
  positive)
  positive)
  positive)
  positive)


GridSearchCV(cv=5, error_score=nan,
             estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True,
                                  l1_ratio=0.5, max_iter=1000, normalize=False,
                                  positive=False, precompute=False,
                                  random_state=None, selection='cyclic',
                                  tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1],
                         'l1_ratio': array([1.00000000e-03, 2.89426612e-03, 8.37677640e-03, 2.42446202e-02,
       7.01703829e-02, 2.03091762e-01, 5.87801607e-01, 1.70125428e+00,
       4.92388263e+00, 1.42510267e+01, 4.12462638e+01, 1.19377664e+02,
       3.45510729e+02, 1.00000000e+03])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [12]:
print(grid.best_params_)
print(grid.best_score_)

{'alpha': 1, 'l1_ratio': 0.5878016072274912}
0.33001661366586454


model_coefs