<a href="https://colab.research.google.com/github/edubin/DATA441-01/blob/main/Project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zk5t1q5obaEGzV96-VjZufjtuhEnaYdf?usp=sharing)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split as tts, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import interp1d, griddata, LinearNDInterpolator, NearestNDInterpolator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from math import ceil
import numpy as np
from scipy import linalg

## Question 1: Lowess Function

Adapt and modify the code for Gramfort’s version of Lowess to accommodate train and test sets with multidimensional features.

In [2]:
def lowess(x, y, xnew, f=2/3, iter=3, intercept=True):
    """Robust locally weighted linear regression (Lowess) with multidimensional features.

    A weighted linear model is fitted around every training point using a
    tricube kernel on Euclidean distance; the fitted training values are then
    interpolated to the query points ``xnew``.

    Parameters
    ----------
    x : array-like, shape (n,) or (n, d)
        Training features. A 1-D input is treated as a single feature.
    y : array-like, shape (n,) or (n, 1)
        Training targets (a single response).
    xnew : array-like
        Query points, passed to the interpolator built on ``x``.
    f : float, default 2/3
        Fraction of the training points that defines each local bandwidth.
    iter : int, default 3
        Number of fitting passes. The first pass is an ordinary locally
        weighted fit; each later pass down-weights points with large
        residuals from the previous pass (bisquare weights).
    intercept : bool, default True
        If True, a column of ones is prepended to the local design matrix.

    Returns
    -------
    numpy.ndarray
        Predictions at ``xnew``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not contain the same number of observations.

    Notes
    -----
    * With a single feature the fitted values are linearly interpolated and
      extrapolated; with several features ``LinearNDInterpolator`` is used.
    * Query points for which the interpolator returns NaN are filled by
      nearest-neighbour lookup on the *observed* ``y`` (not the smoothed
      values).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    xnew = np.asarray(xnew, dtype=float)

    if len(y.shape) == 1:
        y = y.reshape(-1, 1)
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)

    n = len(x)
    if len(y) != n:
        raise ValueError("x and y must contain the same number of observations")

    # Work with a flat target so residuals are (n,) instead of an accidental
    # (n, n) broadcast of (n, 1) - (n,).
    y_flat = y.ravel()

    # Index of the distance used as bandwidth; clamped so that f == 1 does not
    # index past the last neighbour.
    r = min(int(ceil(f * n)), n - 1)

    if intercept:
        x1 = np.column_stack([np.ones((n, 1)), x])
    else:
        x1 = x

    # Pairwise Euclidean distances, computed once and reused for the bandwidths.
    dist = np.array([np.sqrt(np.sum((x - x[i]) ** 2, axis=1)) for i in range(n)])
    h = np.sort(dist, axis=1)[:, r]
    # Guard against a zero bandwidth (more than r coincident points).
    h = np.where(h > 0, h, np.finfo(float).eps)

    # Tricube kernel; column i holds the weights of every point w.r.t. point i.
    w = np.clip(dist / h[np.newaxis, :], 0.0, 1.0)
    w = (1 - w ** 3) ** 3

    yest = np.zeros(n)
    delta = np.ones(n)  # robustness weights, all ones on the first pass
    ridge = 0.0001 * np.eye(x1.shape[1])  # small ridge keeps A invertible

    for iteration in range(iter):
        for i in range(n):
            # Kernel weights combined with the robustness weights.
            weights = delta * w[:, i]
            # Same as x1.T @ diag(weights) without building an n-by-n matrix.
            xw = x1.T * weights
            A = xw.dot(x1) + ridge
            b = xw.dot(y_flat)
            beta = linalg.solve(A, b)
            yest[i] = x1[i].dot(beta)

        residuals = y_flat - yest
        s = np.median(np.abs(residuals))
        if s == 0:
            # Fit is already exact for at least half the points; re-weighting
            # would divide by zero.
            break
        # Bisquare robustness weights for the next pass.
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2

    if x.shape[1] == 1:
        interpolator = interp1d(x.flatten(), yest, fill_value='extrapolate')
    else:
        interpolator = LinearNDInterpolator(x, yest)
    output = interpolator(xnew)

    # Fill NaN predictions with the nearest observed target.
    missing = np.isnan(output)
    if missing.any():
        g = NearestNDInterpolator(x, y_flat)
        output[missing] = g(xnew[missing])
    return output

## Question #2: K-Fold Cross-Validation & Real Data

Test your new function from 1) on some real data sets with k-fold cross-validation.

### Data Set #1 - Cars Data

In [3]:
# Load the cars data set from CSV.
# NOTE(review): the relative path presumably points into a mounted Google Drive folder — confirm before running.
data = pd.read_csv('drive/MyDrive/Advanced Applied Machine Learning/data/cars.csv')

In [4]:
# Features: every column from 'CYL' through 'WGT' (.loc label slices include both endpoints).
x = data.loc[:,'CYL':'WGT'].values
# Target: the 'MPG' column.
y = data['MPG'].values

In [5]:
# Single scaler instance; the cross-validation loops re-fit it on each training fold.
scale = StandardScaler()

In [6]:
# 10-fold cross-validation of lowess on the current x / y.
kf = KFold(n_splits=10, shuffle=True, random_state=1234)
mse_lwr = []

for idxtrain, idxtest in kf.split(x):
    # Standardize using statistics from the training fold only.
    xtrain = scale.fit_transform(x[idxtrain])
    xtest = scale.transform(x[idxtest])
    ytrain, ytest = y[idxtrain], y[idxtest]

    # Fit on the training fold, predict the held-out fold, record its MSE.
    yhat_lw = lowess(xtrain, ytrain, xtest, f=1/3, iter=1, intercept=True)
    mse_lwr.append(mse(ytest, yhat_lw))

print('The Cross-validated Mean Squared Error for Locally Weighted Regression is : '+str(np.mean(mse_lwr)))

The Cross-validated Mean Squared Error for Locally Weighted Regression is : 16.838450874085616


### Data Set #2 - Concrete Data

In [7]:
# Load the concrete data set from CSV; this rebinds `data`, replacing the previously loaded frame.
# NOTE(review): the relative path presumably points into a mounted Google Drive folder — confirm before running.
data = pd.read_csv('drive/MyDrive/Advanced Applied Machine Learning/data/concrete.csv')

In [8]:
# Features: every column from 'cement' through 'coarseagg' (.loc label slices include both endpoints).
x = data.loc[:,'cement':'coarseagg'].values
# Target: the 'strength' column.
y = data['strength'].values

In [9]:
# 2-fold cross-validation of lowess on the current x / y.
kf = KFold(n_splits=2, shuffle=True, random_state=1234)
mse_lwr = []

for idxtrain, idxtest in kf.split(x):
    # Standardize using statistics from the training fold only.
    xtrain = scale.fit_transform(x[idxtrain])
    xtest = scale.transform(x[idxtest])
    ytrain, ytest = y[idxtrain], y[idxtest]

    # Fit on the training fold, predict the held-out fold, record its MSE.
    yhat_lw = lowess(xtrain, ytrain, xtest, f=2/3, iter=1, intercept=True)
    mse_lwr.append(mse(ytest, yhat_lw))

print('The Cross-validated Mean Squared Error for Locally Weighted Regression is : '+str(np.mean(mse_lwr)))

The Cross-validated Mean Squared Error for Locally Weighted Regression is : 143.91236310732862
