Import pandas, NumPy, and the scikit-learn utilities used below

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

Environment check: print the pandas version installed in the local Anaconda setup

In [41]:
# Bare expression: the installed pandas version string is shown as the cell output
pd.__version__

'2.2.2'

Load the laptops dataset from its GitHub URL

In [43]:
# Raw CSV location of the laptops dataset
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'

# Download the CSV and parse it into a DataFrame
laptop_data = pd.read_csv(filepath_or_buffer=url)

# Question 1: Which column has missing values?

In [45]:
# Normalize the column names: lowercase, with spaces replaced by underscores
laptop_data.columns = [
    column.lower().replace(' ', '_') for column in laptop_data.columns
]

In [47]:
# Keep only the columns the later cells use: three features plus the price target
selected_columns = ['ram', 'storage', 'screen', 'final_price']
laptop_data_prepared = laptop_data[selected_columns]

In [31]:
# Preview the first five rows of the prepared dataset
laptop_data_prepared.head(5)

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


In [49]:
# Count the missing values in each column of the prepared dataset
missing_values_prepared = laptop_data_prepared.isna().sum()
missing_values_prepared

ram            0
storage        0
screen         4
final_price    0
dtype: int64

Answer 1: the `screen` column is the only one with missing values (4 rows).

In [None]:
# Question 2: What is the median of the 'ram' column?

In [53]:
# Median of the 'ram' column, displayed as the cell output
ram_column = laptop_data_prepared['ram']
ram_median = ram_column.median()
ram_median

16.0

In [None]:
# Question 3: Which way of filling missing 'screen' values (0 or the training mean) gives the better validation RMSE?

In [55]:
# Function to calculate RMSE
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Both arguments are forwarded unchanged to scikit-learn's
    ``mean_squared_error``; the square root of that value is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Split the dataset into train (60%), validation (20%), and test (20%):
# 40% is held out first, then that hold-out is split in half.
train_data, temp_data = train_test_split(laptop_data_prepared, test_size=0.4, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)


def _fit_and_score(train_df, val_df, screen_fill_value):
    """Impute 'screen', fit a linear regression, and score it on validation.

    Parameters
    ----------
    train_df, val_df : DataFrame
        Frames holding 'ram', 'storage', 'screen' and 'final_price'.
        Neither frame is modified; imputation happens on copies.
    screen_fill_value : scalar
        Replacement for missing 'screen' entries in both frames.

    Returns
    -------
    tuple
        ``(fitted LinearRegression, validation RMSE)``.
    """
    feature_columns = ['ram', 'storage', 'screen']
    target_column = 'final_price'

    train_filled = train_df.copy()
    train_filled['screen'] = train_filled['screen'].fillna(screen_fill_value)
    val_filled = val_df.copy()
    val_filled['screen'] = val_filled['screen'].fillna(screen_fill_value)

    model = LinearRegression()
    model.fit(train_filled[feature_columns], train_filled[target_column])
    predictions = model.predict(val_filled[feature_columns])
    return model, rmse(val_filled[target_column], predictions)


# Option 1: Fill missing values with 0
model_zero, rmse_zero = _fit_and_score(train_data, val_data, 0)

# Option 2: Fill missing values with the mean (computed from training data only,
# so no information from the validation rows leaks into the imputation)
screen_mean = train_data['screen'].mean()
model_mean, rmse_mean = _fit_and_score(train_data, val_data, screen_mean)

# The scores are reported at 2 decimals, so the verdict is taken on the same
# rounded values; otherwise the "equally good" branch could never match what
# is printed.
rmse_zero_rounded = round(rmse_zero, 2)
rmse_mean_rounded = round(rmse_mean, 2)

# Output the results
print(f"RMSE with missing values filled with 0: {rmse_zero_rounded}")
print(f"RMSE with missing values filled with mean: {rmse_mean_rounded}")

# Compare which option gives better RMSE
if rmse_zero_rounded < rmse_mean_rounded:
    print("Filling missing values with 0 gives a better RMSE.")
elif rmse_mean_rounded < rmse_zero_rounded:
    print("Filling missing values with mean gives a better RMSE.")
else:
    print("Both options give equally good RMSE.")

RMSE with missing values filled with 0: 609.42
RMSE with missing values filled with mean: 610.02
Filling missing values with 0 gives a better RMSE.


# Question 4: Which regularization strength r gives the best validation RMSE? If several r tie, pick the smallest.

In [57]:
from sklearn.linear_model import Ridge

# Regularization values to try
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

# Fill missing values with 0 (as per the instructions)
train_data_zero = train_data.copy()
train_data_zero['screen'] = train_data_zero['screen'].fillna(0)

val_data_zero = val_data.copy()
val_data_zero['screen'] = val_data_zero['screen'].fillna(0)

# Prepare feature matrix and target vector for zero-filled data
X_train_zero = train_data_zero[['ram', 'storage', 'screen']]
y_train_zero = train_data_zero['final_price']
X_val_zero = val_data_zero[['ram', 'storage', 'screen']]
y_val_zero = val_data_zero['final_price']

# Dictionary mapping each r to its validation RMSE (rounded to 2 decimals)
rmse_values = {}

# Train models with different regularization strengths (r)
for r in r_values:
    # r == 0 is ordinary least squares. scikit-learn advises against
    # Ridge(alpha=0) for numerical reasons and recommends LinearRegression.
    model_ridge = LinearRegression() if r == 0 else Ridge(alpha=r)
    model_ridge.fit(X_train_zero, y_train_zero)

    # Predict on validation set
    y_pred_ridge = model_ridge.predict(X_val_zero)

    # Calculate RMSE and store it
    rmse_ridge = rmse(y_val_zero, y_pred_ridge)
    rmse_values[r] = round(rmse_ridge, 2)

# Find the r with the smallest rounded RMSE. Ties are broken explicitly in
# favour of the smallest r, independent of the order of r_values.
best_r = min(rmse_values, key=lambda candidate: (rmse_values[candidate], candidate))

# Output the RMSE values for each r and the best one
print("RMSE values for different r:")
for r, rmse_value in rmse_values.items():
    print(f"r = {r}: RMSE = {rmse_value}")

print(f"\nBest RMSE is obtained with r = {best_r} and RMSE = {rmse_values[best_r]}")


RMSE values for different r:
r = 0: RMSE = 609.42
r = 0.01: RMSE = 609.42
r = 0.1: RMSE = 609.42
r = 1: RMSE = 609.42
r = 5: RMSE = 609.42
r = 10: RMSE = 609.43
r = 100: RMSE = 609.53

Best RMSE is obtained with r = 0 and RMSE = 609.42


Answer 4: r = 0 (the smallest r among those tied at the best rounded RMSE of 609.42).

In [None]:
# Question 5: Standard deviation of the validation RMSE across seeds 0-9

In [71]:
# Function to calculate RMSE
# NOTE: re-declares the same `rmse` helper that an earlier cell of this notebook defines.
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Both arguments are forwarded unchanged to scikit-learn's
    ``mean_squared_error``; the square root of that value is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Function to prepare feature matrix X by filling missing values with the provided value
def prepare_X(df, fillna_value):
    """Build the feature matrix from ``df``.

    Missing entries are replaced with ``fillna_value`` on a new frame (the
    caller's frame is not modified), and that frame's underlying NumPy
    array is returned.
    """
    return df.fillna(fillna_value).values

# List to store the validation RMSE obtained for each seed
rmse_scores = []

# The 60/20/20 split sizes depend only on the dataset length, not on the seed,
# so they are computed once instead of on every iteration.
n = len(laptop_data_prepared)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

# Iterate over the seeds [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
for seed in range(10):
    # Shuffle the row positions reproducibly for this seed
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    df_shuffled = laptop_data_prepared.iloc[idx]

    # Only train and validation are needed here; the trailing test slice
    # (rows from n_train + n_val onward) is not used by this question.
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()

    # Pull the target out of each frame; pop() also removes the column,
    # leaving only the features behind.
    y_train = df_train.pop('final_price').values
    y_val = df_val.pop('final_price').values

    # Prepare the feature matrices by filling missing values with 0
    X_train = prepare_X(df_train, fillna_value=0)
    X_val = prepare_X(df_val, fillna_value=0)

    # Train the linear regression model without regularization
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred = model.predict(X_val)

    # Calculate the RMSE for the current seed and append it to the list
    score = rmse(y_val, y_pred)
    rmse_scores.append(score)

    print(seed, score)

# Calculate the standard deviation of RMSE scores and round to 3 decimal places
rmse_std = np.std(rmse_scores)
rmse_std_rounded = round(rmse_std, 3)

# Output the RMSE scores and the standard deviation
print(f"RMSE scores for different seeds: {rmse_scores}")
print(f"Standard deviation of RMSE scores: {rmse_std_rounded}")


0 565.4520868770983
1 636.7985423056717
2 588.9558697907967
3 597.8148920012524
4 571.9627915111035
5 573.2383256618941
6 647.3438328407256
7 550.4398184485931
8 587.3335036169915
9 576.1017929433117
RMSE scores for different seeds: [565.4520868770983, 636.7985423056717, 588.9558697907967, 597.8148920012524, 571.9627915111035, 573.2383256618941, 647.3438328407256, 550.4398184485931, 587.3335036169915, 576.1017929433117]
Standard deviation of RMSE scores: 29.176


Question 6: What is the RMSE on the test dataset?

In [73]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Function to calculate RMSE
# NOTE: re-declares the same `rmse` helper that earlier cells of this notebook define.
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Both arguments are forwarded unchanged to scikit-learn's
    ``mean_squared_error``; the square root of that value is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Function to prepare feature matrix X by filling missing values with the provided value
# NOTE: re-declares the same `prepare_X` helper that an earlier cell of this notebook defines.
def prepare_X(df, fillna_value):
    """Build the feature matrix from ``df``.

    Missing entries are replaced with ``fillna_value`` on a new frame (the
    caller's frame is not modified), and that frame's underlying NumPy
    array is returned.
    """
    return df.fillna(fillna_value).values

# Sizes of the 60/20/20 train/validation/test partition
n = len(laptop_data_prepared)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

# Shuffle the row positions reproducibly with seed 9
np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = laptop_data_prepared.iloc[idx]

# Slice the shuffled frame into the three partitions
val_end = n_train + n_val
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:val_end].copy()
df_test = df_shuffled.iloc[val_end:].copy()

# Pull the target out of each partition; pop() also removes the column,
# so the frames are left holding features only.
y_train = df_train.pop('final_price').values
y_val = df_val.pop('final_price').values
y_test = df_test.pop('final_price').values

# Merge train and validation into a single training set
df_full_train = pd.concat([df_train, df_val]).reset_index(drop=True)
y_full_train = np.concatenate([y_train, y_val])

# Feature matrices, with missing values filled with 0
X_full_train = prepare_X(df_full_train, fillna_value=0)
X_test = prepare_X(df_test, fillna_value=0)

# Ridge regression with regularization strength r=0.001, fitted on train + validation
model = Ridge(alpha=0.001)
model.fit(X_full_train, y_full_train)

# Score the fitted model on the held-out test partition
y_pred_test = model.predict(X_test)
test_rmse = rmse(y_test, y_pred_test)

# Output the RMSE
print(f"RMSE on the test dataset: {round(test_rmse, 3)}")


RMSE on the test dataset: 608.61
