In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations_with_replacement

In [48]:
#in_path = r'C:\Users\User\Desktop\project 1'
names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']

In [49]:
data = pd.read_csv('auto-mpg.data', sep='\s+',header=None, names = names, na_values = ['?'])

In [4]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


# 1. missing values 
To see if there exists missing values

In [5]:
data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight          float64
acceleration    float64
model_year        int64
origin            int64
car_name         object
dtype: object

In [6]:
data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [7]:
data.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


### Horsepower has 6 missing values, we need to impute the missing values; Here, we use the mean values to impute

In [50]:
data.loc[data['horsepower'].isnull(), 'horsepower'] = np.mean(data['horsepower'])

In [7]:
data.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,38.199187,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,76.0,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,95.0,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,125.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [8]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


# 2. Model

In [9]:
b = np.array(data['mpg'])
A = np.array(data.iloc[:,1:8])
tA = np.transpose(A)

In [12]:
tA

array([[  8. ,   8. ,   8. , ...,   4. ,   4. ,   4. ],
       [307. , 350. , 318. , ..., 135. , 120. , 119. ],
       [130. , 165. , 150. , ...,  84. ,  79. ,  82. ],
       ...,
       [ 12. ,  11.5,  11. , ...,  11.6,  18.6,  19.4],
       [ 70. ,  70. ,  70. , ...,  82. ,  82. ,  82. ],
       [  1. ,   1. ,   1. , ...,   1. ,   1. ,   1. ]])

In [13]:
pinv = np.linalg.pinv(A)

In [14]:
pinv

array([[ 8.49152380e-03,  2.68543154e-03,  7.44645566e-03, ...,
        -4.29758479e-03, -3.43160686e-03, -3.46677123e-03],
       [-4.01852487e-05,  5.77325351e-05, -2.74705710e-05, ...,
         3.60087519e-05, -2.03209427e-05, -4.16465620e-05],
       [-9.18751307e-05,  1.81077628e-04,  1.09189323e-04, ...,
        -1.61769789e-04,  4.61072097e-05,  9.81139380e-05],
       ...,
       [-6.40268778e-04,  6.65509147e-04, -1.31884325e-04, ...,
        -3.36200576e-03,  2.97650124e-04,  7.66654533e-04],
       [-1.18754045e-05, -1.14917629e-04, -2.57535674e-05, ...,
         1.22107691e-03,  2.44651352e-04,  9.76447462e-05],
       [ 2.16531319e-04,  1.27814297e-04, -8.55036418e-04, ...,
        -5.71320296e-03, -6.20264691e-03, -6.42700110e-03]])

In [17]:
w = pinv.dot(b)

In [18]:
r = A.dot(w)-b

In [19]:
r.dot(r)/398

11.348010793029573

In [20]:
r.dot(r)**0.5/398

0.16885671420537327

## Multiple linear regression implementation

In [51]:
def MSE(w,x,y):
    r = x.dot(w)-y
    return r.dot(r)/len(x)

def linear_model(X, Y):
    pinv = np.linalg.pinv(X) # matrix persodu inverse
    w = pinv.dot(Y) # np.dot(pinv,Y) matrix multiply
    return w

## Polynomial regression implementation 

In [52]:
### get the polynomial features 
def polynomial_features(X, degree):
    n_samples, n_features = np.shape(X)
    
    def index_combinations():
        combs = [combinations_with_replacement(range(n_features), i) for i in range(0, degree + 1)]
        flat_combs = [item for sublist in combs for item in sublist]
        return flat_combs

    combinations = index_combinations()
    n_output_features = len(combinations)
    X_new = np.empty((n_samples, n_output_features))

    for i, index_combs in enumerate(combinations):  
        X_new[:, i] = np.prod(X[:, index_combs], axis=1)

    return X_new    

def polynomial_model(X, Y, degree):
    X_new = polynomial_features(X, degree)
    pinv = np.linalg.pinv(X)
    w = pinv.dot(Y)
    return w

# Training, validation and test dataset.

## Randomly split data into 60%, 20%, 20% as train data, validate data and test data.

In [53]:
def split_data(data, percent):
    idx = np.random.rand(len(data)) < percent
    train, test = data[idx], data[~idx]
    return train, test

In [54]:
train_val_data, test_data = split_data(data, 0.8)   # split data into train_val_data and test_data


In [55]:
train_data, val_data = split_data(data, 0.75)   # split data into train_val_data and test_data

## Data preparation

### Non-Standardlize data

In [56]:
train_y,train_x= np.array(train_data.iloc[:,0]), np.array(train_data.iloc[:,1:8])
val_y, val_x = np.array(val_data.iloc[:,0]), np.array(val_data.iloc[:,1:8])


### Standardlize data

In [57]:
### standardize data and run the model again
def standardlize(X):
    return (X-X.mean())/X.std()

# need only stardardlize independent variables
std_train_x= np.array(standardlize(train_data.iloc[:,1:8]))
std_val_x =  np.array(standardlize(val_data.iloc[:,1:8]))

# Multiple regression model results

### For non-standardlize data

In [58]:
#### add ones to X
ones = np.ones([train_x.shape[0],1])
train_x = np.concatenate((ones,train_x),axis=1)

ones = np.ones([val_x.shape[0],1])
val_x = np.concatenate((ones,val_x),axis=1)

In [59]:
## calculate the mean square error of traning data
w_train = linear_model(train_x, train_y)
mse_train = MSE(w_train, train_x, train_y)

In [60]:
mse_train

11.15067692402878

In [61]:
w_train

array([-2.15467603e+01, -4.34203270e-01,  1.39222858e-02, -3.48180725e-03,
       -6.67547768e-03,  1.27942191e-01,  7.97937684e-01,  1.40774763e+00])

In [62]:
# see thevalidate error
mse_val = MSE(w_train, val_x, val_y)
mse_val

10.78943102858012

### For standardlize data

In [63]:
ones = np.ones([std_train_x.shape[0],1])
std_train_x = np.concatenate((ones,std_train_x),axis=1)

ones = np.ones([std_val_x.shape[0],1])
std_val_x = np.concatenate((ones,std_val_x),axis=1)

In [34]:
## calculate the mean square error of standardize traning data
std_w_train = linear_model(std_train_x, train_y)
std_mse_train = MSE(std_w_train, std_train_x, train_y)
std_mse_train

10.535965762691205

In [35]:
# see the standardlzie validate error
std_mse_val = MSE(std_w_train, std_val_x, val_y)
std_mse_val

15.099190773953486

# Polynomial regression model results

### For non-standardlize data

In [36]:
### calculate the error for different degree
results = []
for i in range(2,5):
    x_train= polynomial_features(train_x, i)
    x_val = polynomial_features(val_x, i)
    w_train = linear_model(x_train, train_y)
    e1 = MSE(w_train, x_train, train_y)
    e2 = MSE(w_train, x_val, val_y)
    results.append((i, e1,e2))

In [37]:
df = pd.DataFrame(results, columns=['degree','Train Error', 'Validate Error']) 
df

Unnamed: 0,degree,Train Error,Validate Error
0,2,5.52407,11.294404
1,3,2.884629,57.625452
2,4,0.892822,1963.994644


### For standardlize data

In [38]:
results = []
for i in range(2,5):
    x_train= polynomial_features(std_train_x, i)
    x_val = polynomial_features(std_val_x, i)
    w_train = linear_model(x_train, train_y)
    e1 = MSE(w_train, x_train, train_y)
    e2 = MSE(w_train, x_val,val_y)
    results.append((i, e1,e2))

In [39]:
df = pd.DataFrame(results, columns=['degree','Train Error', 'Validate Error']) 
df

Unnamed: 0,degree,Train Error,Validate Error
0,2,5.52407,14.615974
1,3,2.877112,1016.854604
2,4,0.246743,65493.109777
