# Linear Regression

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly 
import plotly.graph_objs as go
plotly.tools.set_credentials_file(username='gaston.farina', api_key='qp2zUcv3i5fIvwcecMhb')
import pandas as pd
%matplotlib inline

In [2]:
Data = pd.read_csv("data_1d.csv",header=None)
Data.columns = ['X','Y']
Data.head()

Unnamed: 0,X,Y
0,95.724162,197.179636
1,35.757619,67.59067
2,28.816847,60.854133
3,99.958481,196.907397
4,66.809748,125.311129


In [3]:
X = Data['X']
Y = Data['Y']

The linear regression model is trying to fit the closest straight line to the points

The formula is derived as follows

![Title](LinearRegression1D.png)



In [4]:
beta_0 = (np.mean(Y)*np.dot(X,X) - np.mean(X)*np.dot(X,Y))/(np.dot(X,X)-np.mean(X)*np.sum(X))
beta_1 = (np.dot(X,Y)-np.mean(Y)*np.sum(X))/(np.dot(X,X)-np.mean(X)*np.sum(X))
Line = beta_0+beta_1*X


print("Beta_1 is : " + str(beta_1))
print("Beta_0 is : " + str(beta_0))

Beta_1 is : 1.97261216748
Beta_0 is : 2.86442407566


The next Graph show us that the variable X is correlated with the variable Y

In [5]:
X = X
Y = Y
trace = go.Scatter(
    x = X,
    y = Y,
    mode = 'markers',
    name = 'Points'
    
)
trace1 = go.Scatter(
    x = X,
    y = Line,
    mode = 'lines',
    name = 'Linear Model'
)
layout = dict(title = 'Linear Model',
              xaxis=dict(
                   title='X'
             ),
                yaxis=dict(
                   title='Y'
             )
  )
              

data = [trace,trace1]
fig = dict(data=data, layout=layout)
py.iplot(fig,filename='Scatter')


### The Error Function

In [6]:
def error(beta0,beta1,X,Y):
    er = np.dot((Y-beta1*X-beta0),(Y-beta1*X-beta0))
    return er

ErrOptimal = error(2.86442407566,1.97261216748,X,Y)
ErrNotOptimal1 = error(0,0,X,Y)
print(ErrOptimal)
print(ErrNotOptimal1)


2479.60991461
1436603.49314


### R squared

R squared answer the question How good is your model?

![Title](Rsquared.png)

In [7]:
def Rsquared(y,yhat):
    SSR = np.dot(y-yhat,y-yhat)
    SST = np.dot(y-np.mean(y),y-np.mean(y))
    Rsq = 1-SSR/SST
    return Rsq
Rs = Rsquared(Y,Line)
print("The Rsquared is : " + str(Rs))

The Rsquared is : 0.991183820298


## Mutiple Linear Regression 

A multiple linear regression is a regression where the Y dependent variable depends on more one independient variable

In [8]:
MultipleData = pd.read_csv("data_2d.csv",header=None)
MultipleData.columns = ['X','Y','Z']
MultipleData.head()

Unnamed: 0,X,Y,Z
0,17.930201,94.520592,320.25953
1,97.144697,69.593282,404.634472
2,81.775901,5.737648,181.485108
3,55.854342,70.325902,321.773638
4,49.36655,75.11404,322.465486


In [9]:
MultipleData['Bias'] = 1
X = MultipleData[['X','Y','Bias']].as_matrix()
Y = MultipleData['Z'].as_matrix()
Y = Y.reshape((Y.shape[0], 1))
print("The Matrix X has the shape: "+str(X.shape))
print("The Matrix Y has the shape: "+str(Y.shape))

The Matrix X has the shape: (100, 3)
The Matrix Y has the shape: (100, 1)


In [10]:
MultipleBetas = np.linalg.solve(np.dot(np.transpose(X),X),np.dot(np.transpose(X),Y))
for i in reversed(range(MultipleBetas.shape[0])):
    print("Beta_"+str(i)+" = "+ str(MultipleBetas[i][0]))

Beta_2 = 1.46191241456
Beta_1 = 2.96985048347
Beta_0 = 2.01666793359


In [11]:
Ypred = np.dot(X,MultipleBetas)

In [12]:
Xd = X[:,0]
Yd = X[:,1]
Zd = Y

s = np.linspace(0,100, 240)
t = np.linspace(0,100, 240)
tGrid, sGrid = np.meshgrid(s, t)
trace2 = go.Surface(
    x = sGrid,
    y = tGrid,
    z = sGrid*MultipleBetas[0][0]+tGrid*MultipleBetas[1][0]+MultipleBetas[2][0]
    
)


trace1 = go.Scatter3d(
    x=Xd,
    y=Yd,
    z=Zd,
    mode='markers',
    marker=dict(
        size=5,
        line=dict(
            color='rgba(217, 217, 217, 0.14)',
            width=0.2
        ),
        opacity=1.0
    )
)
data = [trace1,trace2]
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='simple-3d-scatter')

In [13]:
Y = Y.flatten()
Ypred = Ypred.flatten()
Rs = Rsquared(Y,Ypred)
print("The R squared: " + str(Rs))


The R squared: 0.998004061248


## Polinomial Regression

In [14]:
PoliData = pd.read_csv("data_poly.csv",header=None)
PoliData.columns = ['X','Y']
#PoliData = PoliData.sort(['X'])
PoliData['X*X'] = PoliData['X']*PoliData['X']
PoliData['Bias'] = 1
PoliData = PoliData[['Bias','X','X*X','Y']]
X = PoliData[['Bias','X','X*X']].as_matrix()
Y = PoliData[['Y']].as_matrix()

In [15]:
X1 = X[:,1]
Y1 = Y
trace = go.Scatter(
    x = X1,
    y = Y1,
    mode = 'markers',
    name = 'Points'
    
)
layout = dict(title = 'Linear Model',
              xaxis=dict(
                   title='X'
             ),
                yaxis=dict(
                   title='Y'
             )
  )
              

data = [trace]
fig = dict(data=data, layout=layout)
py.iplot(fig,filename='Scatter')


In [16]:
PolyBetas = np.linalg.solve(np.dot(np.transpose(X),X),np.dot(np.transpose(X),Y))
for i in range(PolyBetas.shape[0]):
    print("Beta_"+str(i)+" = "+ str(PolyBetas[i][0]))

Beta_0 = 4.27775141115
Beta_1 = 1.00542875357
Beta_2 = 0.0996172284129


In [17]:
YpredPol = np.dot(X,PolyBetas)

In [18]:
Xp = X[:,1]
Yp = X[:,2]
Zp = Y
s = np.linspace(0,100,240)
t = np.linspace(0,100,240)
tGrid,sGrid = np.meshgrid(t,s)

trace2 = go.Surface(
    x = tGrid,
    y = sGrid*sGrid,
    z = tGrid*PolyBetas[1][0]+sGrid*sGrid*PolyBetas[2][0]+PolyBetas[0][0]
    
)
trace1 = go.Scatter3d(
    x=Xp,
    y=Yp,
    z=Zp,
    mode='markers',
    marker=dict(
        size=5,
        line=dict(
            color='rgba(217, 217, 217, 0.14)',
            width=0.2
        ),
        opacity=1.0
    )
)
data = [trace2,trace1]
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='simple-3d-scatter')

In [19]:
Y = Y.flatten()
YpredPol = YpredPol.flatten()
Rs = Rsquared(Y,YpredPol)
print("The R squared: " + str(Rs))

The R squared: 0.999141229637


# Some issues in Machine Learning with linear regression

## Overfitting

Overfitting is when the model is fitted very well to the data but the model don't generalize very well

In [278]:
N = 100
X = np.linspace(0,6*np.pi,N)
Y = np.cos(X)


In [279]:
X1 = X
Y1 = Y
trace = go.Scatter(
    x = X1,
    y = Y1,
    mode = 'markers',
    name = 'Points'
    
)
layout = dict(title = 'Linear Model',
              xaxis=dict(
                   title='X'
             ),
                yaxis=dict(
                   title='Y'
             )
  )
              

data = [trace]
fig = dict(data=data, layout=layout)
py.iplot(fig,filename='Scatter')

PlotlyRequestError: Hi there, you've reached the threshold of 100 combined image exports and chart saves per day. If you need to raise your daily limit, consider upgrading to a Student or Personal Plan (see: https://plot.ly/products/cloud).

In [280]:
def MakePoly(X,deg):
    n = len(X)
    data = [np.ones(n)]
    for d in range(deg):
        data.append(X**(d+1))
    return np.vstack(data).T


def GetSomeTrainData(X,Y,sample):
    N = len(X)
    train_idx = np.random.choice(N,sample)
    Xtrain = X[train_idx]
    Ytrain = Y[train_idx]
    return Xtrain,Ytrain

def fit(X,Y):
    return np.linalg.solve(X.T.dot(X),X.T.dot(Y))

def get_mse(Y,Yhat):
    d = Y-Yhat
    return d.dot(d)/len(d)

degree = 10


Xtrain,Ytrain = GetSomeTrainData(X,Y,12)
for i in range(15):
   Xtrain_poly = MakePoly(Xtrain,i)
   w = fit(Xtrain_poly,Ytrain)
   X_poly = MakePoly(X,i)
   Y_hat = X_poly.dot(w)
   Error = get_mse(Y,Y_hat)
   print("The Error with degree: "+str(i)+ " is "+ str(Error))
  
X1 = Xtrain
Y1 = Ytrain
trace = go.Scatter(
    x = X1,
    y = Y1,
    mode = 'markers',
    name = 'Points',
    marker=dict(
        size='20',
        color = np.random.randn(500)
    )
    
)
trace1 = go.Scatter(
    x = X,
    y = Y_hat,
    mode = 'lines',
    name = 'Linear Model'
)
trace2 = go.Scatter(
    x = X,
    y = Y,
    mode = 'markers',
    name = 'Points'
    
)
layout = dict(title = 'Linear Model',
              xaxis=dict(
                   title='X'
             ),
                yaxis=dict(
                   title='Y'
             )
  )
              

data = [trace,trace1,trace2]
fig = dict(data=data, layout=layout)
py.iplot(fig,filename='Scatter')


The Error with degree: 0 is 0.506699383724
The Error with degree: 1 is 0.506894434416
The Error with degree: 2 is 0.62634909781
The Error with degree: 3 is 0.649388136299
The Error with degree: 4 is 0.616969519065
The Error with degree: 5 is 0.938803309482
The Error with degree: 6 is 0.713539513422
The Error with degree: 7 is 0.749591215938
The Error with degree: 8 is 0.252316875249
The Error with degree: 9 is 0.846741873335
The Error with degree: 10 is 0.321830350463
The Error with degree: 11 is 0.0866757052221
The Error with degree: 12 is 0.0678351181738
The Error with degree: 13 is 0.00692680931837
The Error with degree: 14 is 0.136388922001


PlotlyRequestError: Hi there, you've reached the threshold of 100 combined image exports and chart saves per day. If you need to raise your daily limit, consider upgrading to a Student or Personal Plan (see: https://plot.ly/products/cloud).

In [281]:
def PlotTrainTestCurves(X,Y,sample=20,max_degree=20):
    N = len(X)
    train_idx = np.random.choice(N,sample)
    Xtrain = X[train_idx]
    Ytrain = Y[train_idx]
    test_idx = [idx for idx in range(N) if idx not in train_idx]
    Xtest = X[test_idx]
    Ytest = Y[test_idx]
    mse_trains = []
    mse_tests = []
    for d in range(max_degree):
        Xtrain_poly = MakePoly(Xtrain,(d+1))
        w = fit(Xtrain_poly,Ytrain)
        Yhat_train = Xtrain_poly.dot(w)
        mse_train = get_mse(Ytrain,Yhat_train)
        
        Xtest_poly = MakePoly(Xtest,(d+1))
        Yhat_test = Xtest_poly.dot(w)
        mse_test = get_mse(Ytest,Yhat_test)
        
        mse_trains.append(mse_train)
        mse_tests.append(mse_test)
    ran = np.arange(max_degree)
    return mse_trains,mse_tests,ran

ErrorTrain,ErrorTest,maxdeg = PlotTrainTestCurves(X,Y,sample=20,max_degree=14)
print()

Train = np.array(ErrorTrain)
Test = np.array(ErrorTest)

print(Train)
print(Test)
print(maxdeg)
trace1 = go.Scatter(
    x = maxdeg,
    y = Train,
    mode = 'lines',
    name = 'Train Error'
)
trace2 = go.Scatter(
    x = maxdeg,
    y = Test,
    mode = 'lines',
    name = 'Test Error'
)
layout = dict(title = 'Linear Model',
              xaxis=dict(
                   title='X'
             ),
                yaxis=dict(
                   title='Y'
             )
  )
              

data = [trace1,trace2]
fig = dict(data=data, layout=layout)
py.iplot(fig,filename='Train and Test Error')
    
    
    


[  4.42676571e-01   4.40851355e-01   4.40850520e-01   4.25369020e-01
   3.65284286e-01   1.94381424e-01   9.60685516e-02   8.08673670e-03
   4.63525794e-03   9.40666523e-05   4.98062913e-05   1.13049525e-04
   3.53544630e-05   3.70299181e-05]
[  0.53829338   0.52817915   0.52805983   0.42466106   1.00726178
   2.35840321  10.97533478   4.17719092  16.19374357   1.51640728
  13.79145643  29.76710691   6.59986762   5.24717849]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13]


PlotlyRequestError: Hi there, you've reached the threshold of 100 combined image exports and chart saves per day. If you need to raise your daily limit, consider upgrading to a Student or Personal Plan (see: https://plot.ly/products/cloud).

## Categorical Inputs

In [258]:
### talking about one hot encoder