In [None]:
# Install required Python packages in the notebook/pyodide environment
import piplite
# tqdm: progress bars, seaborn: plotting, pandas: dataframes, numpy: arrays
await piplite.install(['tqdm', 'seaborn', 'pandas', 'numpy'])

In [None]:
# Core scientific Python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Render plots inline in the notebook
%matplotlib inline

# Suppress non-critical warnings to keep the output clean
import warnings

def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Install the no-op so that code calling ``warnings.warn(...)`` through the
# module attribute produces no output from here on.
warnings.warn = warn

# Print numpy arrays with 3 digits of precision and no scientific notation
np.set_printoptions(suppress=True, precision=3)

In [None]:
# Seed numpy's global random generator so the randomly sampled data is reproducible
RANDOM_SEED = 72018
np.random.seed(RANDOM_SEED)



def to_2d(array):
    """Flatten every axis after the first, returning a 2-D view/copy of ``array``.

    The length of the first axis is kept; all remaining axes are merged into
    a single second axis (so a 1-D input of length n becomes shape (n, 1)).
    """
    n_rows = array.shape[0]
    return array.reshape(n_rows, -1)


    
def plot_exponential_data():
    """Draw 1000 normal samples, exponentiate them, show a histogram, and return them.

    Uses numpy's global random generator, so the result depends on the
    current seed/state.
    """
    normal_draws = np.random.normal(size=1000)
    data = np.exp(normal_draws)
    plt.hist(data)
    plt.show()
    return data
    
def plot_square_normal_data():
    """Draw 1000 normal samples centred at 5, square them, show a histogram, and return them.

    Uses numpy's global random generator, so the result depends on the
    current seed/state.
    """
    normal_draws = np.random.normal(loc=5, size=1000)
    data = np.square(normal_draws)
    plt.hist(data)
    plt.show()
    return data

In [None]:
# Helper to download files when running in the browser (pyodide)
from pyodide.http import pyfetch

async def download(url, filename):
    """Fetch ``url`` with ``pyfetch`` and write the response body to ``filename``.

    Parameters
    ----------
    url : str
        Address of the resource to fetch.
    filename : str
        Local path the response body is written to (binary mode, overwritten).

    Raises
    ------
    RuntimeError
        If the response status is anything other than 200. Previously such a
        failure was ignored silently and only surfaced later, when the
        missing file was opened.
    """
    response = await pyfetch(url)
    if response.status != 200:
        raise RuntimeError(
            f"Download of {url!r} failed with HTTP status {response.status}; "
            f"{filename!r} was not written."
        )

    # Read the whole body before opening the target so that a failed read
    # cannot leave a truncated/empty file behind.
    payload = await response.bytes()
    with open(filename, "wb") as f:
        f.write(payload)

# Remote path to the prepared Boston Housing dataset
path = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML240EN-SkillsNetwork/labs/data/boston_housing_clean.pickle"

# If running locally, you may already have the file and can skip this line
await download(path, "boston_housing_clean.pickle")

# Import pandas library (used shortly to load the pickle)
import pandas as pd

In [None]:
# Load the cleaned Boston Housing bundle from the downloaded pickle.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files that come from a trusted source.
boston = pd.read_pickle('boston_housing_clean.pickle')

# Pull out the main DataFrame and the accompanying description metadata
boston_data, boston_description = boston['dataframe'], boston['description']

# Show the first 5 rows
print("The first 5 rows of the dataframe")
boston_data.head()

In [None]:
# Name of the target column
y_col = "MEDV"

# Target vector, and feature matrix made of every remaining column
y = boston_data[y_col]
X = boston_data.drop(columns=y_col)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: learn the scaling from X, then apply it to X
s = StandardScaler()
X_ss = s.fit(X).transform(X)

In [None]:
# Hint: a small example array with 2 rows and 3 columns
a = np.array([[1, 2, 3], [4, 5, 6]])
print(a)

In [None]:
# axis=0 averages down the rows, giving one mean per column: [2.5, 3.5, 4.5]
a.mean(axis=0) # mean along the *columns*

In [None]:
# axis=1 averages across the columns, giving one mean per row: [2., 5.]
a.mean(axis=1) # mean along the *rows*

In [None]:
### BEGIN SOLUTION
# Standardize by hand: subtract each column's mean, divide by each column's
# standard deviation (numpy's default, ddof=0), then compare with X_ss.
X2 = np.array(X)
col_means = X2.mean(axis=0)
col_stds = X2.std(axis=0)
man_transform = (X2 - col_means) / col_stds
np.allclose(man_transform, X_ss)
### END SOLUTION

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fresh, unfitted linear-regression estimator
lr = LinearRegression()

# Rebuild the target vector and the feature matrix from the full dataframe
y_col = "MEDV"
y = boston_data[y_col]
X = boston_data.drop(columns=[y_col])

In [None]:
# Fit on the unscaled features and print the coefficients
# (original author's note: min = -18)
print(lr.fit(X, y).coef_)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# New scaler fitted on the full feature matrix, then applied to it
s = StandardScaler().fit(X)
X_ss = s.transform(X)

In [None]:
# Same regression on the standardized features: coefficients are now "on the same scale"
lr2 = LinearRegression().fit(X_ss, y)
print(lr2.coef_)

In [None]:
### BEGIN SOLUTION
# Pair each feature name (column 0) with its standardized coefficient (column 1)
# and list them from most negative to most positive.
coef_table = pd.DataFrame(zip(X.columns, lr2.coef_))
coef_table.sort_values(by=1)
### END SOLUTION

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Expand X into degree-2 polynomial features, without the constant bias column
pf = PolynomialFeatures(include_bias=False, degree=2)
X_pf = pf.fit_transform(X)

In [None]:
# Re-fit the shared scaler `s` on the polynomial features and standardize them
X_pf_ss = s.fit(X_pf).transform(X_pf)

In [None]:
# Lasso with its default settings, fit on the standardized polynomial features
las = Lasso().fit(X_pf_ss, y)
las.coef_

In [None]:
### BEGIN SOLUTION
# Lasso with alpha = 0.1 on the standardized polynomial features
las01 = Lasso(alpha=0.1).fit(X_pf_ss, y)

# Report the sum of absolute coefficient values and how many are non-zero
coefs_01 = las01.coef_
print('sum of coefficients:', abs(coefs_01).sum())
print('number of coefficients not equal to 0:', (coefs_01 != 0).sum())

In [None]:
# Lasso with alpha = 1 on the standardized polynomial features
las1 = Lasso(alpha=1).fit(X_pf_ss, y)

# Report the sum of absolute coefficient values and how many are non-zero
coefs_1 = las1.coef_
print('sum of coefficients:', abs(coefs_1).sum())
print('number of coefficients not equal to 0:', (coefs_1 != 0).sum())
### END SOLUTION

In [None]:
### BEGIN SOLUTION
from sklearn.metrics import r2_score

# R^2 of the default Lasso on the same data it was fitted on
y_fit_las = las.predict(X_pf_ss)
r2_score(y, y_fit_las)
### END SOLUTION

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the (unscaled) polynomial features as a test set;
# the fixed random_state makes the split repeatable.
split = train_test_split(X_pf, y, test_size=0.3, random_state=72018)
X_train, X_test, y_train, y_test = split

In [None]:
# Learn the scaling on the training split and reuse it for the test split
X_train_s = s.fit_transform(X_train)
X_test_s = s.transform(X_test)

# Refit the default Lasso on the training split and score it on the test split
las.fit(X_train_s, y_train)
y_pred = las.predict(X_test_s)
r2_score(y_test, y_pred)

In [None]:
# Learn the scaling on the training split and reuse it for the test split
X_train_s = s.fit_transform(X_train)
X_test_s = s.transform(X_test)

# Refit the alpha = 0.1 Lasso on the training split and score it on the test split
las01.fit(X_train_s, y_train)
y_pred = las01.predict(X_test_s)
r2_score(y_test, y_pred)

In [None]:
### BEGIN SOLUTION

# --- Part 1: Lasso with a very small penalty ---

# Standardize: learn the scaling on the training split, reuse it on the test split
X_train_s = s.fit_transform(X_train)
X_test_s = s.transform(X_test)

# alpha = 0.001 is weak regularization; max_iter is raised to help the solver converge
las001 = Lasso(alpha=0.001, max_iter=100000)
las001.fit(X_train_s, y_train)

# Test-set predictions and their R^2
y_pred = las001.predict(X_test_s)
print("r2 score for alpha = 0.001:", r2_score(y_test, y_pred))


# --- Part 2: plain linear regression on the same scaled features ---
lr = LinearRegression().fit(X_train_s, y_train)
y_pred_lr = lr.predict(X_test_s)
print("r2 score for Linear Regression:", r2_score(y_test, y_pred_lr))


# --- Part 3: compare total coefficient magnitude and number of non-zero terms ---
lasso_coefs, lr_coefs = las001.coef_, lr.coef_

print('Magnitude of Lasso coefficients:', abs(lasso_coefs).sum())
print('Number of coeffients not equal to 0 for Lasso:', (lasso_coefs != 0).sum())

print('Magnitude of Linear Regression coefficients:', abs(lr_coefs).sum())
print('Number of coeffients not equal to 0 for Linear Regression:', (lr_coefs != 0).sum())
### END SOLUTION

In [None]:
from sklearn.linear_model import Ridge

In [None]:
### BEGIN SOLUTION
# Standardize: learn the scaling on the training split, reuse it on the test split
X_train_s = s.fit_transform(X_train)
X_test_s = s.transform(X_test)

# Ridge regression with a small penalty (alpha = 0.001)
r = Ridge(alpha=0.001).fit(X_train_s, y_train)
y_pred_r = r.predict(X_test_s)

# Display the fitted Ridge coefficients
r.coef_
### END SOLUTION

In [None]:
# Display the Lasso estimator created earlier with alpha = 0.001 and max_iter = 100000
las001 # same alpha as Ridge above

In [None]:
# Coefficients of the alpha = 0.001 Lasso, for comparison with the Ridge coefficients
las001.coef_

In [None]:
# Ridge first, Lasso second in each pair of printed lines
ridge_coefs, lasso_coefs = r.coef_, las001.coef_

# Total absolute coefficient magnitude
for coefs in (ridge_coefs, lasso_coefs):
    print(np.sum(np.abs(coefs)))

# Number of non-zero coefficients
for coefs in (ridge_coefs, lasso_coefs):
    print(np.sum(coefs != 0))

In [None]:
# Score both alpha = 0.001 models on the complete dataset (training and test
# rows together).
#
# The models were trained on features standardized by `s` fitted on X_train,
# so the full polynomial feature matrix has to go through that same fitted
# scaler. X_pf_ss was standardized with full-dataset statistics and therefore
# does not match what the models were trained on.
# Assumes the cells are run top to bottom, so `s` is still the scaler fitted
# on the polynomial X_train.
X_pf_s = s.transform(X_pf)

y_pred = r.predict(X_pf_s)
print(r2_score(y, y_pred))

y_pred = las001.predict(X_pf_s)
print(r2_score(y, y_pred))

In [None]:
# Split the features that were standardized *before* splitting (X_ss)
X_train, X_test, y_train, y_test = train_test_split(
    X_ss,
    y,
    test_size=0.3,
    random_state=72018,
)

In [None]:
# Linear regression on the pre-scaled split, scored on its test rows
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
r2_score(y_test, y_pred)

In [None]:
# Split the raw (unscaled) features; scaling happens after this split
split_kwargs = dict(test_size=0.3, random_state=72018)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

In [None]:
# Scale *after* splitting: the scaler learns its mean/std from the training
# rows only, and those same parameters are then applied to the test rows.
s = StandardScaler()
lr_s = LinearRegression()
X_train_s = s.fit_transform(X_train)
lr_s.fit(X_train_s, y_train)
X_test_s = s.transform(X_test)
y_pred_s = lr_s.predict(X_test_s)

# Score this model's own predictions (y_pred_s). The original line scored
# `y_pred`, which holds predictions from a different model fitted in an
# earlier cell, so this pipeline was never actually evaluated.
r2_score(y_test, y_pred_s)