In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn import metrics

In [2]:
def step_gradient(learning_rate, theta, x_sample, y_sample):

    N = float(x_sample.values[:,0].size)
    
    h = theta.dot(x_sample.transpose())
    error = h - y_sample
    
    gradients = ((x_sample.T * (error)).T)/N
    
    somatorio_louco = np.sum(gradients, axis=0)

    direction = learning_rate * somatorio_louco
    
    #theta = theta - minimization
    
    return -direction, np.linalg.norm(direction)

In [3]:
def descent(learning_rate, initial_theta, iterations, x_sample, y_sample, batch_size='full'):
    if iterations == 'stochastic':
        iterations = len(x_sample)
        batch_size = 1
    theta = initial_theta
    if batch_size == 'full':
        for i in range(iterations):
            aiseeutepego = step_gradient(learning_rate, theta, x_sample, y_sample)
            theta += aiseeutepego[0]
            if i % (iterations//50) == 0:
                print('Iteration {}: {}'.format(i, aiseeutepego[1]))
    elif type(batch_size) is int:
        i = 0
        while i < iterations:
            j = 0
            while j < len(x_sample) or j < iterations:
                aiseeutepego = step_gradient(learning_rate, theta, x_sample[j:j+batch_size], y_sample[j:j+batch_size])
                theta += aiseeutepego[0]
                if i % (iterations//50) == 0:
                    print('Iteration {}: {}'.format(i, aiseeutepego[1]))
                j += batch_size
                i += 1
    return theta

### one_hot = False, not using one-hot encoding

In [4]:
one_hot = False
data = pd.read_csv('diamonds.csv', index_col=0)

In [5]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [6]:
if one_hot:
    one_hot_cut = pd.get_dummies(data['cut'])
    one_hot_color = pd.get_dummies(data['color'])
    one_hot_clarity = pd.get_dummies(data['clarity'])
    data = data.drop('cut', axis=1)
    data = data.drop('color', axis=1)
    data = data.drop('clarity', axis=1)
    data = data.join(one_hot_cut)
    data = data.join(one_hot_color)
    data = data.join(one_hot_clarity)

In [7]:
train, test = train_test_split(data, test_size=0.15)

In [8]:
train_x = train.drop('price', axis=1)
test_x = test.drop('price', axis=1)

In [9]:
if not one_hot:
    # Removing features
    train_x = train_x.drop('cut', axis=1)
    train_x = train_x.drop('color', axis=1)
    train_x = train_x.drop('clarity', axis=1)
    test_x = test_x.drop('cut', axis=1)
    test_x = test_x.drop('color', axis=1)
    test_x = test_x.drop('clarity', axis=1)

In [10]:
train_x.head()

Unnamed: 0,carat,depth,table,x,y,z
33616,0.31,60.7,60.0,4.34,4.36,2.64
16004,1.05,62.7,56.0,6.52,6.59,4.11
11495,1.28,63.6,56.0,6.92,6.78,4.37
46529,0.51,61.9,56.0,5.14,5.1,3.17
3376,0.24,61.8,57.0,3.97,3.99,2.46


In [11]:
train_y = train.loc[:,'price']
test_y = test.loc[:,'price']

In [12]:
train_x=(train_x-train_x.min())/(train_x.max()-train_x.min())
test_x=(test_x-test_x.min())/(test_x.max()-test_x.min())

In [13]:
train_x.head()

Unnamed: 0,carat,depth,table,x,y,z
33616,0.022869,0.491667,0.326923,0.404097,0.413662,0.083019
16004,0.176715,0.547222,0.25,0.607076,0.625237,0.129245
11495,0.224532,0.572222,0.25,0.64432,0.643264,0.137421
46529,0.064449,0.525,0.25,0.478585,0.483871,0.099686
3376,0.008316,0.522222,0.269231,0.369646,0.378558,0.077358


#### descent(learning_rate, initial_theta, iterations, x_sample, y_sample, batch_size='full')

**Parameters:**
* **learning_rate: float**  
    The descent step size.
    

* **initial_theta: array-like object**  
    The coefficients (also known as $\theta$).
    

* **iterations: int or the string 'stochastic'**  
    Amount of iterations to be executed.
    

* **x_sample: array-like object**  
    Features of the training data.
    
    
* **y_sample: array-like object**  
    Target of the training data.
    
    
* **batch_size: int or the string 'full'**  
    Batch size of eatch step. If _iterations_ is set to 'stochastic', defaults to 1. 
    
**Returns:**
* **theta**:
    The

### Examples:
descent(0.1, initial_theta, 10000, x_sample, y_sample, 1000)

Will run 10 thousand iterations with batch size of 1000.

Default for batch_size is 'full'.

Iterations can be set to 'stochastic', which will force batch_size to be 1.

In [14]:
initial_theta = np.ones(train_x.values[0].size)
new_theta = descent(0.1, initial_theta, 20000, train_x, train_y, 'full')
print(new_theta)

Iteration 0: 435.4445145463196
Iteration 400: 20.675164294738526
Iteration 800: 11.592824074053489
Iteration 1200: 6.625849922085999
Iteration 1600: 3.9780320730723973
Iteration 2000: 2.6444189847131456
Iteration 2400: 2.026964085801031
Iteration 2800: 1.753141704811509
Iteration 3200: 1.6196425941588302
Iteration 3600: 1.5382919349175728
Iteration 4000: 1.4766588505116707
Iteration 4400: 1.4236031776514686
Iteration 4800: 1.375284028642285
Iteration 5200: 1.3302736659172525
Iteration 5600: 1.2879434398720973
Iteration 6000: 1.2479399296349065
Iteration 6400: 1.210014892697283
Iteration 6800: 1.173969036697573
Iteration 7200: 1.1396323714029244
Iteration 7600: 1.1068563980885675
Iteration 8000: 1.0755102237333651
Iteration 8400: 1.045478066996938
Iteration 8800: 1.0166573391828064
Iteration 9200: 0.9889570300897527
Iteration 9600: 0.9622963026472993
Iteration 10000: 0.9366032557338216
Iteration 10400: 0.911813832556914
Iteration 10800: 0.8878708581636414
Iteration 11200: 0.864723192183

In [15]:
y_train_pred = new_theta.dot(train_x.transpose())

In [16]:
y_test_pred = test_x.dot(new_theta)

In [17]:
print(train_y.values[:10], y_train_pred.values[:10])

[  462  6403  5018  1781   408 17265   811 11646  8810  2790] [ -115.38861861  6049.23443489  7644.80925742  1812.64509847
  -601.36727249 13287.19184091   670.07361271 12980.95842304
  9044.73714476  4441.85552563]


In [18]:
metrics.r2_score(train_y, y_train_pred)

0.8487489022929419

In [19]:
metrics.r2_score(test_y, y_test_pred)

0.8341743996162995

In [20]:
#np.savetxt('/home/furusato/tests/jupyter/mo444a/batch_model.txt', new_theta)