In [1]:
import pandas as pd
import numpy as np

### Q1

In [2]:
# Load the laptops dataset; the relative path is resolved against the
# notebook's current working directory.
fname = 'laptops.csv'
df = pd.read_csv(fname)

In [3]:
# Normalize column names: lowercase everything and replace spaces with underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [4]:
# Keep only the three feature columns and the target (`final_price`).
df = df[['ram', 'storage', 'screen', 'final_price']]

In [5]:
# Inspect dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ram          2160 non-null   int64  
 1   storage      2160 non-null   int64  
 2   screen       2156 non-null   float64
 3   final_price  2160 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 67.6 KB


### Q2

In [6]:
# Median of the `ram` column.
df['ram'].median()

16.0

In [27]:
def split_dataset(df, seed):
    """Shuffle ``df`` and split it into train / validation / test frames.

    Validation and test each receive ``int(0.2 * len(df))`` rows; the
    training frame receives the remainder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified.
    seed : int
        Seed controlling the row permutation.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``, each an independent copy that keeps
        the original index labels of its rows.
    """
    # A private RandomState produces the same permutation as the legacy
    # ``np.random.seed(seed); np.random.shuffle(idx)`` sequence, but does not
    # reseed the process-wide NumPy RNG as a side effect.
    rng = np.random.RandomState(seed)

    n = len(df)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    rng.shuffle(idx)

    df_shuffled = df.iloc[idx]

    # .copy() so callers can modify each split without touching the others.
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
    df_test = df_shuffled.iloc[n_train + n_val:].copy()

    return df_train, df_val, df_test

### Q3

In [11]:
def train_linear_regression(X, y):
    """Fit a linear regression with the normal equation.

    A column of ones is prepended to ``X`` so that the first solved
    coefficient plays the role of the intercept.

    Returns
    -------
    tuple
        ``(bias, weights)``: the first coefficient and the remaining ones.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    # Normal equation: coeffs = (A^T A)^-1 A^T y
    gram = design.T.dot(design)
    coeffs = np.linalg.inv(gram).dot(design.T).dot(y)

    bias, weights = coeffs[0], coeffs[1:]
    return bias, weights

In [12]:
# Feature columns fed to the model.
base = ['ram', 'storage', 'screen']


def prepare_X(df):
    """Return the ``base`` columns of ``df`` as a NumPy array, NaNs replaced by 0.

    ``df`` itself is left unchanged.
    """
    return df[base].fillna(0).values

In [13]:
def rmse(y, y_pred):
    """Root mean squared error between targets ``y`` and predictions ``y_pred``."""
    squared_residuals = (y_pred - y) ** 2
    return np.sqrt(squared_residuals.mean())

In [14]:
# Strategy 1: fill missing `screen` values with 0, then split.
# Plain assignment is used instead of `df1['screen'].fillna(0, inplace=True)`:
# an in-place fill on a column selection is a chained assignment, which warns
# on recent pandas and does not modify the parent frame under Copy-on-Write.
df1 = df.copy()
df1['screen'] = df1['screen'].fillna(0)
df1_train, df1_val, df1_test = split_dataset(df1, 42)

# Strategy 2: split first, then fill missing `screen` values in every split
# with the mean computed on the training split only.
df2 = df.copy()
df2_train, df2_val, df2_test = split_dataset(df2, 42)

# Computed once, before any filling, so all three splits use the same value.
screen_mean = df2_train['screen'].mean()
df2_train['screen'] = df2_train['screen'].fillna(screen_mean)
df2_val['screen'] = df2_val['screen'].fillna(screen_mean)
df2_test['screen'] = df2_test['screen'].fillna(screen_mean)

In [17]:
# train model with missing value filled with 0
X_train = prepare_X(df1_train)
y_train = df1_train['final_price']
w_0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df1_val)
y_val = df1_val['final_price']
y_pred = w_0 + X_val.dot(w)
score = rmse(y_val, y_pred)
print(round(score, 2))

597.36


In [18]:
# train model with missing value filled with mean of training dataset
X_train = prepare_X(df2_train)
y_train = df2_train['final_price']
w_0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df2_val)
y_val = df2_val['final_price']
y_pred = w_0 + X_val.dot(w)

score = rmse(y_val, y_pred)
print(round(score, 2))

600.27


### Q4

In [21]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a linear regression with the normal equation plus a diagonal penalty.

    A column of ones is prepended to ``X``; ``r`` is then added to every
    diagonal entry of the Gram matrix (the intercept's entry included)
    before it is inverted.

    Returns
    -------
    tuple
        ``(bias, weights)``: the first coefficient and the remaining ones.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T.dot(design)
    gram_reg = gram + r * np.eye(gram.shape[0])

    coeffs = np.linalg.inv(gram_reg).dot(design.T).dot(y)
    return coeffs[0], coeffs[1:]

In [23]:
# Validation RMSE for a range of regularization strengths (zero-filled data).
X_train, y_train = prepare_X(df1_train), df1_train['final_price']
X_val, y_val = prepare_X(df1_val), df1_val['final_price']

for r in (0, 0.01, 0.1, 1, 5, 10, 100):
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)
    print(r, ": ", rmse(y_val, y_pred))

0 :  597.3635593619622
0.01 :  597.3616819856012
0.1 :  597.3451592963617
1 :  597.2121215589519
5 :  597.0111186297033
10 :  597.058768066111
100 :  597.903264060304


### Q5

In [26]:
# How much does the validation RMSE move when only the shuffle seed changes?
seeds = list(range(10))
rmses = []

for seed in seeds:
    df_train, df_val, df_test = split_dataset(df1, seed)

    X_train, y_train = prepare_X(df_train), df_train['final_price']
    X_val, y_val = prepare_X(df_val), df_val['final_price']

    w_0, w = train_linear_regression(X_train, y_train)
    y_pred = w_0 + X_val.dot(w)
    rmses.append(rmse(y_val, y_pred))

print('Standard deviation: ', round(np.std(rmses), 3))

Standard deviation:  29.176


### Q6

In [30]:
# Final evaluation: split with seed 9, train on train + validation combined,
# and report the RMSE on the held-out test split.
df_train, df_val, df_test = split_dataset(df1, seed=9)
df_train = pd.concat([df_train, df_val])

X_train, y_train = prepare_X(df_train), df_train['final_price']
X_test, y_test = prepare_X(df_test), df_test['final_price']

w_0, w = train_linear_regression_reg(X_train, y_train, r=0.01)
y_pred = w_0 + X_test.dot(w)

print('RMSE:', np.round(rmse(y_test, y_pred), 2))

RMSE: 608.61
