In [None]:
# Optional: install scikit-learn if not already available in your environment
#!pip install -U scikit-learn

In [None]:
# Dependency installation hints (only needed if your local env is missing packages)
# All libraries required for this lab are listed below. Those pre-installed on Skills Network Labs are commented out.
#!mamba install -qy pandas==1.3.4 numpy==1.21.4 seaborn==0.9.0 matplotlib==3.5.0 scikit-learn==0.20.1
# Note: If your environment doesn't support "!mamba install", use 
# "!pip install pandas==1.3.4 numpy==1.21.4 seaborn==0.9.0 matplotlib==3.5.0 scikit-learn==0.20.1"

In [None]:
# Silence warnings so cell outputs stay readable.
# (Be careful doing this in real projects: warnings often flag genuine problems.)
import warnings


def warn(*_args, **_kwargs):
    """No-op stand-in for ``warnings.warn``: accept any arguments and do nothing."""
    return None


# From here on, every call that goes through warnings.warn(...) hits the no-op above.
warnings.warn = warn


In [None]:
# In-browser (Pyodide) environment: install packages at runtime
# If running locally, you can skip this cell.
import piplite
await piplite.install(['tqdm', 'seaborn', 'pandas', 'numpy', 'scikit-learn'])

In [None]:
# Core scientific stack and visualization
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pylab as plt
%matplotlib inline

# Scikit-learn: model building, evaluation, and utilities
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline


In [None]:
# Download dataset in Pyodide; comment this out if running locally with a local file
from pyodide.http import pyfetch
 
async def download(url, filename):
    """Fetch ``url`` with ``pyfetch`` and write the response body to ``filename``.

    Args:
        url: Address of the resource to download.
        filename: Local path the downloaded bytes are written to (binary mode).

    Raises:
        RuntimeError: If the response status is not 200. Nothing is written in
            that case, so a failed download is reported here instead of
            surfacing later as a missing or stale file.
    """
    response = await pyfetch(url)
    if response.status != 200:
        raise RuntimeError(f"Download of {url} failed with HTTP status {response.status}")
    # Read the body before opening the file so a failed read cannot leave an
    # empty/truncated file behind.
    payload = await response.bytes()
    with open(filename, "wb") as f:
        f.write(payload)
 
path = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML240EN-SkillsNetwork/labs/encoded_car_data.csv"
 
# You will need to download the dataset; if running locally with the file present, skip the next line
await download(path, "encoded_car_data.csv")
 
 
# Load the dataset into a pandas DataFrame
# Import pandas library
import pandas as pd
 
# Read the downloaded CSV and inspect the head
# Read the online file by the URL provides above, and assign it to variable "df"
data = pd.read_csv("encoded_car_data.csv")
 
# Show the first 5 rows for a quick sanity check
# show the first 5 rows using dataframe.head() method
print("The first 5 rows of the dataframe") 
data.head(5)

In [None]:
# Print a concise summary of `data` (dtypes, non-null counts, memory usage)
# to spot missing values or unexpected column types before modelling
data.info()

In [None]:
# Split the frame into the regression target (`price`) and the predictor columns
y = data['price'].copy()
X = data.drop(columns=['price'])

In [None]:
# Hold-out split for initial evaluation: 30% of the rows go to the test set,
# and random_state=42 makes the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [None]:
# Baseline model: Linear Regression with default settings, used on all features
lr = LinearRegression()

In [None]:
# Fit the baseline model on the training split only (the test split stays unseen)
lr.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test rows; `predicted` is reused by the metric cells below
predicted =lr.predict(X_test)


In [None]:
# R^2 on the training split (fit quality).
# Compare with the test-split score: a much higher value here suggests overfitting.
lr.score(X_train,y_train)

In [None]:
# R^2 on the held-out test split (generalization performance)
lr.score(X_test,y_test)

In [None]:
# Explicit R^2 from the stored test predictions; expected to match lr.score(X_test, y_test)
print(r2_score(y_true=y_test, y_pred=predicted))

In [None]:
# RMSE on the test split: square root of the mean squared error,
# which puts the error back in the same units as the target
mse = mean_squared_error(y_true=y_test, y_pred=predicted)
rmse = np.sqrt(mse)
rmse

In [None]:
# Grab the first three rows (features and matching labels) for a side-by-side
# look at predictions vs. ground truth
some_data = X.head(3)
some_labels = y.head(3)

In [None]:
# Model predictions for the three sample rows in `some_data`
print("Predictions:", lr.predict(some_data))

In [None]:
# Ground-truth prices for the same three rows, printed as a plain list for comparison
print("Labels:", list(some_labels))

In [None]:
# Re-run prediction on the full test split and display the resulting array for inspection
predicted =lr.predict(X_test)
predicted

In [None]:
# Pipeline example: standardize the features, then fit Linear Regression.
# fit() returns the pipeline itself, so ending the cell on it displays the fitted pipeline.
pipe = Pipeline(steps=[('ss', StandardScaler()), ('lr', LinearRegression())])
pipe.fit(X_train, y_train)

In [None]:
# (Intentionally left empty in original)


In [None]:
# R^2 of the scaler + regression pipeline on the training split
pipe.score(X_train,y_train)

In [None]:
# R^2 of the scaler + regression pipeline on the held-out test split
pipe.score(X_test,y_test)

In [None]:
# Alternative pipeline: apply Normalizer (L2 vector normalization) and then
# fit Linear Regression.
pipe_1 = Pipeline([('nn', Normalizer()), ('lr', LinearRegression())])
pipe_1.fit(X_train, y_train)

# Report R^2 on both splits. A bare expression in the middle of a cell is never
# displayed by Jupyter, so the scores are printed explicitly.
print("Train R^2:", pipe_1.score(X_train, y_train))
print("Test R^2:", pipe_1.score(X_test, y_test))

# Predict on the test split and compute RMSE
pred = pipe_1.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=pred)
rmse = np.sqrt(mse)
rmse


In [None]:
# Column names of X as a plain list, used for the one-feature-at-a-time analysis
features = X.columns.tolist()
features

In [None]:
# Univariate feature assessment: for every feature, fit the scaler + regression
# pipeline on that single column and record its R^2 on the training split
pipe = Pipeline(steps=[('ss', StandardScaler()), ('lr', LinearRegression())])
R_2 = []

for feature in features:
    single_column = X_train[[feature]]
    fitted = pipe.fit(single_column, y_train)  # fit() hands back the same pipeline
    R_2.append(fitted.score(single_column, y_train))
    
    

In [None]:
# Bar chart of the training R^2 of each single-feature model, one bar per entry
# of `features`; x tick labels are rotated 90 degrees so the names do not overlap
plt.bar(features,R_2)
plt.xticks(rotation=90)
plt.ylabel("$R^2$")
plt.show()

In [None]:
# Best single feature by training R^2: np.argmax gives the position of the
# largest score in R_2, which lines up with the order of `features`
best=features[np.argmax(R_2)]
best

In [None]:
# Refit the pipeline on all rows (train and test together) using only the
# column named by `best` (for illustration)
pipe.fit(X[[best]],y)

In [None]:
# Univariate test performance: fit on the training split and score on the test
# split using one feature at a time
R_2 = []

for feature in features:
    # Use a throwaway estimator so the model stored in `lr` is not silently
    # refit on a single column.
    single_feature_lr = LinearRegression()
    single_feature_lr.fit(X_train[[feature]], y_train)
    R_2.append(single_feature_lr.score(X_test[[feature]], y_test))

# Visualize test R^2 across single-feature models
plt.bar(features, R_2)
plt.xticks(rotation=90)
plt.ylabel("$R^2$")
plt.show()

# Feature whose single-feature model scores highest on the test split.
# NOTE: choosing a feature by its test-split score uses the test data for selection.
best = features[np.argmax(R_2)]
print(best)

In [None]:
# Number of samples (rows of X)
N = X.shape[0]
N

In [None]:
# Fresh, unfitted Linear Regression instance to hand to cross_val_score
lr = LinearRegression()

In [None]:
# 3-fold cross-validation over the full dataset, scored with R^2 (one score per fold)
scores = cross_val_score(lr, X, y, scoring ="r2", cv=3)

In [None]:
# Display the per-fold R^2 scores returned by cross_val_score
scores 

In [None]:
# Helper to summarize CV scores
def display_scores(scores, print_=False):
    """Print a set of scores followed by their mean and standard deviation.

    Args:
        scores: Scores to summarize. Objects that already provide ``.mean()``
            and ``.std()`` (e.g. NumPy arrays) are used as-is; any other
            array-like (list, tuple, ...) is converted with ``np.asarray``.
        print_: Unused; retained so existing calls that pass it keep working.

    Returns:
        None. The summary is written to standard output.
    """
    has_stats = hasattr(scores, "mean") and hasattr(scores, "std")
    values = scores if has_stats else np.asarray(scores)

    print("Scores:", scores)
    print("Mean:", values.mean())
    print("Standard deviation:", values.std())


In [None]:
# Print the fold-wise R^2 scores together with their mean and standard deviation
display_scores(scores)

In [None]:
# 5-fold cross-validation reported as RMSE.
# The scorer is "neg_mean_squared_error", so the scores are negated back to a
# positive MSE before taking the square root.
scores = cross_val_score(lr, X ,y, scoring ="neg_mean_squared_error", cv=5)
lr_scores = np.sqrt(-scores)
display_scores(lr_scores)

In [None]:
# Manual KFold setup: a 2-fold splitter (no shuffle argument is passed);
# n_splits is reused by the loop cell to size the score array
n_splits=2
kf = KFold(n_splits = n_splits)

In [None]:
# Iterate through the KFold splits, fit the pipeline on each training fold, and
# store R^2 on the matching held-out fold.
# NOTE: the loop rebinds X_train/X_test/y_train/y_test, replacing the earlier hold-out split.
y = data['price'].copy()
X = data.drop(columns=['price'])
R_2 = np.zeros((n_splits, 1))
pipe = Pipeline([('ss', StandardScaler()), ('lr', LinearRegression())])
for k, (train_index, test_index) in enumerate(kf.split(X, y)):
    print("TRAIN:", train_index)
    print("TEST:", test_index)
    # kf.split yields positional indices, so rows are selected with .iloc for
    # both X and y; plain y[...] would look the integers up as index labels.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    pipe.fit(X_train, y_train)
    R_2[k] = pipe.score(X_test, y_test)

In [None]:
# Average of the per-fold test R^2 values stored in R_2
R_2.mean()

In [None]:
# Repeat the KFold evaluation with 3 splits and report the mean held-out R^2.
# NOTE: the loop rebinds X_train/X_test/y_train/y_test, replacing the earlier hold-out split.
n_splits = 3
kf = KFold(n_splits=n_splits)
y = data['price'].copy()
X = data.drop(columns=['price'])
R_2 = np.zeros((n_splits, 1))
pipe = Pipeline([('ss', StandardScaler()), ('lr', LinearRegression())])
for k, (train_index, test_index) in enumerate(kf.split(X, y)):
    print("TRAIN:", train_index)
    print("TEST:", test_index)
    # kf.split yields positional indices, so rows are selected with .iloc for
    # both X and y; plain y[...] would look the integers up as index labels.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    pipe.fit(X_train, y_train)
    R_2[k] = pipe.score(X_test, y_test)

R_2.mean()

In [None]:
# KFold with shuffling; otherwise the same evaluation.
# random_state is fixed (42, the same seed as the hold-out split) so the shuffled
# folds, and therefore the reported mean R^2, are reproducible across runs.
# NOTE: the loop rebinds X_train/X_test/y_train/y_test, replacing the earlier hold-out split.
n_splits = 3
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
y = data['price'].copy()
X = data.drop(columns=['price'])
R_2 = np.zeros((n_splits, 1))
pipe = Pipeline([('ss', StandardScaler()), ('lr', LinearRegression())])
for k, (train_index, test_index) in enumerate(kf.split(X, y)):
    print("TRAIN:", train_index)
    print("TEST:", test_index)
    # kf.split yields positional indices, so rows are selected with .iloc for
    # both X and y; plain y[...] would look the integers up as index labels.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    pipe.fit(X_train, y_train)
    R_2[k] = pipe.score(X_test, y_test)

R_2.mean()
