# Optimization exercise

## Goal: Train the 2nd order polynomial predictor using both gradient descent and stochastic gradient descent. Optimize the stepsizes and compare against scikit-learn implementation

1. Download data from https://drive.google.com/file/d/0Bz9_0VdXvv9bUUNlUTVrMF9VcVU/view?usp=sharing.
2. Create a function psi(x), which transforms features AST (assists), REB (rebounds) and STL (steals) into 2nd order polynomial features (add each feature squared and each pair of features multiplied with every other)
3. Create a transformed data matrix X, where each x is mapped to psi(x).
4. Create a function p2(x,w), which outputs the value of the polynomial at x for given parameters w.
5. Create a function Loss(X,y,w), which computes the squared loss of predicting y from X by p2(x,w) using parameters w. Take variable PTS as y. We will predict scored points based on assists, rebounds and steals.
6. Code up the gradient descent. It should input a point w and a stepsize.
7. Choose an arbitrary point and stepsize. Run gradient descent for 100 iterations and compute the Loss after each iteration. How does the loss behave? Does it converge to something?
8. Can you find the stepsize, for which the loss is smallest after 100 iterations?

In [348]:
# IMPORT PACKAGES
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import random
from matplotlib.pylab import rcParams
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
import copy
%matplotlib inline

# 1)

In [389]:
nb = pd.read_csv('nba_games_2013_2015.csv', delimiter=';')
x = nb[['AST','REB','STL']]
y = nb['PTS']

In [421]:
x.head()

Unnamed: 0,AST,REB,STL
0,41,43,14
1,23,43,8
2,20,39,7
3,19,47,6
4,21,43,4


# 2)

In [350]:
def psi(x):
    X = copy.deepcopy(x)
    length = len(X.columns)
    for i in range(len(X.columns)):
        X[f'{X.columns[i]}_squared'] = X[X.columns[i]]**2
        for j in range(i+1,length):
            X[f'{X.columns[i]}_{X.columns[j]}'] = X[X.columns[i]]*X[X.columns[j]]
    return(X)

# 3)

In [351]:
X = psi(x)
X.head()

Unnamed: 0,AST,REB,STL,AST_squared,AST_REB,AST_STL,REB_squared,REB_STL,STL_squared
0,41,43,14,1681,1763,574,1849,602,196
1,23,43,8,529,989,184,1849,344,64
2,20,39,7,400,780,140,1521,273,49
3,19,47,6,361,893,114,2209,282,36
4,21,43,4,441,903,84,1849,172,16


# 4)

In [398]:
def p2(x,params):
    y_pred = np.zeros(x.shape[0])
    for i in range(len(x.columns)):
        y_pred = y_pred + x[x.columns[i]]*params[i]
    return(y_pred)

In [457]:
theta

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [458]:
y_result = p2(X,theta)
y_result

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
7375    0.0
7376    0.0
7377    0.0
7378    0.0
7379    0.0
Length: 7380, dtype: float64

# 5)

In [431]:
def p2(x,params):
    y_pred = np.zeros(x.shape[0])
    for i in range(len(x.columns)):
        y_pred = y_pred + x[x.columns[i]]*params[i]
    return(y_pred)

In [435]:
X.head()

Unnamed: 0,AST,REB,STL,AST_squared,AST_REB,AST_STL,REB_squared,REB_STL,STL_squared
0,41,43,14,1681,1763,574,1849,602,196
1,23,43,8,529,989,184,1849,344,64
2,20,39,7,400,780,140,1521,273,49
3,19,47,6,361,893,114,2209,282,36
4,21,43,4,441,903,84,1849,172,16


In [434]:
y_result = p2(X,theta)
y_result

0       6763.0
1       4033.0
2       3229.0
3       3967.0
4       3533.0
         ...  
7375    3199.0
7376    4152.0
7377    5176.0
7378    4053.0
7379    3298.0
Length: 7380, dtype: float64

In [432]:
def Loss(X,y,w):
    y_res = p2(X,w)
    err = (y - y_res)**2
    SSL = sum(err)
    return(SSL)

In [433]:
Loss(X,y,theta)

120827940125.0

# 6)

In [467]:
alpha = 0.00000001 #learning rate
iterations = 100 #No. of iterations
m = y.size #No. of data points
np.random.seed(0) #Set the seed
theta = np.zeros(9) #Pick some random values to start with - we a

In [469]:
def grad_descent(X_mat, y, alpha, w, max_iters):
    
    df = copy.deepcopy(X_mat)
    m = len(y)
    
    loss_func_history = []
    loss_func_history.append(Loss(X,y,w))
    
    for k in range(max_iters):
        predictions = p2(x, w)
        w_size = len(w)
        
        for i in range(w_size):
            temp = df.iloc[:, i]
            errors_x1 = (predictions - y) * temp
            w[i] = w[i] - alpha * (1.0 / m) * errors_x1.sum()
            
        loss_func_history.append(Loss(X,y,w))
        
    return loss_func_history

In [459]:
def gradient(df, y, theta, iterations, alpha):
    
    x = copy.deepcopy(df)
    n = len(y)
    
    my_loss = [Loss(X,y,theta)]
    my_theta = [theta]
    cur_theta = theta
    
    for i in range(iterations):
        
        predictions = p2(x,theta)
        error = y - predictions
        my_loss.append(Loss(x,y,theta))
        
        theta = theta - ( (alpha/n) * np.dot(x.T, error))
        
        my_theta.append(theta)
        
    return(my_loss, my_theta)
        

# 7) does not converge....

# 8) the initial random theta yields lowest loss....

In [455]:
poly = PolynomialFeatures(2)
X_test = pd.DataFrame(poly.fit_transform(x))

In [456]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,41.0,43.0,14.0,1681.0,1763.0,574.0,1849.0,602.0,196.0
1,1.0,23.0,43.0,8.0,529.0,989.0,184.0,1849.0,344.0,64.0
2,1.0,20.0,39.0,7.0,400.0,780.0,140.0,1521.0,273.0,49.0
3,1.0,19.0,47.0,6.0,361.0,893.0,114.0,2209.0,282.0,36.0
4,1.0,21.0,43.0,4.0,441.0,903.0,84.0,1849.0,172.0,16.0
...,...,...,...,...,...,...,...,...,...,...
7375,1.0,17.0,39.0,10.0,289.0,663.0,170.0,1521.0,390.0,100.0
7376,1.0,26.0,40.0,10.0,676.0,1040.0,260.0,1600.0,400.0,100.0
7377,1.0,23.0,52.0,8.0,529.0,1196.0,184.0,2704.0,416.0,64.0
7378,1.0,23.0,41.0,11.0,529.0,943.0,253.0,1681.0,451.0,121.0
