In [1]:
import pandas as pd

# Read the marketing campaign dataset from disk
data_df = pd.read_csv(filepath_or_buffer='marketing-campaign.csv')

# Report the frame's dimensions as (rows, columns)
print('data_df shape:', data_df.shape) # (50, 4)

# Preview the first five rows (rendered as the cell output)
data_df.head(n=5)

data_df shape: (50, 4)


Unnamed: 0,tv,web,radio,sales
0,0.916,1.689,0.208,1.204
1,9.359,1.706,1.071,4.8
2,5.261,2.538,2.438,3.97
3,8.682,2.092,1.283,5.212
4,11.736,1.66,1.8,5.993


In [2]:
# Build the input matrix X from every column except the target
X = data_df.drop(labels='sales', axis='columns').values
print('X:', X.shape) # Prints: (50, 3)

# Pull the target column out as a 1-D vector y
y = data_df['sales'].values
print('y:', y.shape) # Prints: (50,)

X: (50, 3)
y: (50,)


In [3]:
from scipy.linalg import lstsq

# Least-squares fit on the raw inputs (no intercept column yet);
# only the coefficients and the residual sum of squares are kept
w, rss = lstsq(X, y)[:2]
print('w:', w) # Prints: [ 0.3958359   0.47521518  0.31040001]
print('RSS:', rss) # Prints: 1.6884039033

w: [ 0.3958359   0.47521518  0.31040001]
RSS: 1.6884039033


In [4]:
import numpy as np

# Prepend an intercept column of ones, shape (n,), to X, shape (n, p)
X1 = np.column_stack((np.ones(len(X)), X))

# Show the first five rows of the augmented matrix
X1[:5]

array([[  1.   ,   0.916,   1.689,   0.208],
       [  1.   ,   9.359,   1.706,   1.071],
       [  1.   ,   5.261,   2.538,   2.438],
       [  1.   ,   8.682,   2.092,   1.283],
       [  1.   ,  11.736,   1.66 ,   1.8  ]])

In [5]:
# Refit on the matrix that includes the intercept column;
# w[0] is now the intercept term
w, rss = lstsq(X1, y)[:2]

print('w:', w) # Prints: [ 0.02487092  0.39465146  0.47037002  0.30669954]
print('RSS:', rss) # 1.68545086808

w: [ 0.02487092  0.39465146  0.47037002  0.30669954]
RSS: 1.68545086808


In [6]:
# Predictions are the matrix-vector product of the design matrix and the weights
y_pred = X1 @ w
print('y_pred:', y_pred.shape) # Prints: (50,)

y_pred: (50,)


In [7]:
def RSS(y, y_pred):
    """Return the residual sum of squares between targets and predictions.

    Both arguments are passed to numpy, so array-likes and scalars
    (broadcast against the other argument) are accepted.
    """
    residuals = np.subtract(y, y_pred)
    return np.sum(residuals * residuals)

# Recompute the residual sum of squares from the explicit predictions
rss = RSS(y=y, y_pred=y_pred)
print('RSS:', rss)

RSS: 1.68545086808


## R Squared

In [8]:
import pandas as pd

# Reload the dataset and split it into inputs X and target y
data_df = pd.read_csv('marketing-campaign.csv')
X = data_df.drop(labels='sales', axis='columns').values
y = data_df['sales'].values

In [9]:
import numpy as np

# Residual sum of squares, used for both the baseline and the fitted model
def RSS(y, y_pred):
    """Sum the squared differences between `y` and `y_pred`.

    `y_pred` may be a scalar (e.g. a constant baseline); numpy
    broadcasting applies it to every element of `y`.
    """
    squared_errors = np.square(np.subtract(y, y_pred))
    return squared_errors.sum()

# Baseline model: always predict the mean of the target
rss_baseline = RSS(y, np.mean(y))
print('RSS baseline:', rss_baseline)

RSS baseline: 100.86060792


In [10]:
from scipy.linalg import lstsq

# Multiple linear regression with an intercept: prepend a column of ones,
# then keep the coefficients and the model's residual sum of squares
X1 = np.column_stack((np.ones(len(X)), X))
w, model_rss = lstsq(X1, y)[:2]
print('RSS:', model_rss)

RSS: 1.68545086808


In [11]:
# R^2 = 1 - RSS_model / RSS_baseline
rss_ratio = model_rss / rss_baseline
R2 = 1 - rss_ratio
print('R^2 coefficient:', R2)

R^2 coefficient: 0.983289304885


In [12]:
# Same R^2 formula, applied to a hard-coded RSS value.
# NOTE(review): the origin of 15.74 is not shown in this notebook — confirm
# which model it belongs to.
other_model_rss = 15.74
R2 = 1 - (other_model_rss / rss_baseline)
print('R^2 coefficient:', R2)

R^2 coefficient: 0.84394303857


## Linear Regression

In [13]:
import pandas as pd

# Read the dataset again; X holds the inputs, y the sales target
data_df = pd.read_csv('marketing-campaign.csv')
y = data_df['sales'].values
X = data_df.drop(labels='sales', axis=1).values

In [14]:
import numpy as np

# Create X1 matrix: a column of ones, shape (n,), followed by the
# input matrix X, shape (n, p)
X1 = np.column_stack((np.ones(len(X)), X))

In [15]:
# Closed-form OLS via the normal equations: (X1^T X1) w = X1^T y
XX = np.matmul(X1.T, X1)
Xy = np.matmul(X1.T, y)

# Solve the linear system directly instead of forming inv(XX) and multiplying:
# this avoids computing the explicit inverse, which is cheaper and
# numerically more stable
w = np.linalg.solve(XX, Xy)
w

array([ 0.02487092,  0.39465146,  0.47037002,  0.30669954])

In [17]:

# Verify with Scipy lstsq
# Imported here so this section does not rely on an import made in an
# earlier section of the notebook
from scipy.linalg import lstsq

# Only the coefficient vector (first element of the result) is needed
w = lstsq(X1, y)[0]
w

array([ 0.02487092,  0.39465146,  0.47037002,  0.30669954])

## Scikit-learn

In [18]:
import pandas as pd

# Load the marketing data and separate inputs from the target column
data_df = pd.read_csv('marketing-campaign.csv')
X = data_df.drop(labels='sales', axis='columns').values
y = data_df['sales'].values

In [19]:
from sklearn.linear_model import LinearRegression

# Create the estimator and fit it in one step
# (fit returns the estimator itself)
lr = LinearRegression().fit(X, y)

# Fitted slopes, one per input column
print('Coefficients:', lr.coef_)
# Prints: [ 0.39465146  0.47037002  0.30669954]

# Fitted intercept term
print('Intercept:', lr.intercept_)
# Prints: 0.0248709178882

Coefficients: [ 0.39465146  0.47037002  0.30669954]
Intercept: 0.0248709178882


In [20]:
# Predict the target for the same inputs the model was fitted on
y_pred = lr.predict(X)
# Show the first three predictions
y_pred[:3]

array([ 1.24462012,  4.84934038,  4.04266482])

In [21]:
# Reproduce predict() by hand: inputs times coefficients, plus the intercept
y_pred = X @ lr.coef_ + lr.intercept_
y_pred[:3]

array([ 1.24462012,  4.84934038,  4.04266482])

In [22]:
# Score the fitted model on the data it was trained on; the recorded value
# agrees with the R^2 coefficient computed by hand earlier in this notebook
R2 = lr.score(X, y)
R2

0.98328930488482358

In [23]:
from sklearn.linear_model import SGDRegressor

# Create the SGDRegressor object: linear regression fitted by stochastic
# gradient descent rather than a closed-form solve.
# NOTE(review): newer scikit-learn releases rename loss='squared_loss' to
# 'squared_error' and replace penalty='none' with penalty=None — confirm
# against the installed version before re-running.
lr_sgd = SGDRegressor(
    loss='squared_loss', # Cost function
    penalty='none', # Add a penalty term?
    max_iter=1000, # Number of iterations
    random_state=0 # The implementation shuffles the data
)

# Fit the linear regression model
lr_sgd.fit(X, y)

# Print coefficients (recorded values are close to the OLS solution)
print('Coefficients:', lr_sgd.coef_)
# Prints: [ 0.3938607   0.46968115  0.30563938]

# The intercept is printed as a one-element array (see recorded output)
print('Intercept:', lr_sgd.intercept_)

Coefficients: [ 0.3938607   0.46968115  0.30563938]
Intercept: [ 0.02885412]


In [24]:
# Score the SGD-fitted model on the training data for comparison with
# the LinearRegression score
R2_sgd = lr_sgd.score(X, y)
R2_sgd

0.98327806598352852

In [25]:
from sklearn.linear_model import HuberRegressor

# Build the Huber estimator with epsilon=1.35 and fit it to X, y in one
# step (fit returns the estimator itself)
huber = HuberRegressor(epsilon=1.35).fit(X, y)

# Fitted slopes
print('Coefficients:', huber.coef_)
# Prints: [ 0.39172544  0.4788203   0.29315421]

# Fitted intercept
print('Intercept:', huber.intercept_)
# Prints: 0.0458629881919

# Score on the training data
print('R^2 coefficient:', huber.score(X, y))

Coefficients: [ 0.39172544  0.4788203   0.29315421]
Intercept: 0.0458629881919
R^2 coefficient: 0.983070157114


## Collinearity

In [27]:
import pandas as pd

# Read the bike sharing dataset
data_df = pd.read_csv('bike-sharing-simple.csv')

# Extract both columns as Numpy arrays
temp = data_df['temp'].values
users = data_df['users'].values

# Preview the first five rows
data_df.head(n=5)

Unnamed: 0,temp,users
0,0.1964,120
1,0.2,108
2,0.227,82
3,0.2043,88
4,0.1508,41


In [28]:
# Linear rescaling of `temp`, so temp_C is an exact linear function of it.
# NOTE(review): presumably 47 and -8 map a normalized temperature to degrees
# Celsius — confirm against the dataset documentation.
temp_C = 47*temp - 8

In [29]:
import numpy as np

# Input matrix with two perfectly collinear columns
X = np.column_stack((temp, temp_C))

# Design matrix: intercept column of ones followed by X
X1 = np.column_stack((np.ones(len(X)), X))

# Rank is below the number of columns because temp_C is a linear
# function of temp and the ones column
rank = np.linalg.matrix_rank(X1)
print('Rank', rank)

Rank 2


In [30]:
# Display the design matrix: ones column, temp, temp_C.
# NOTE(review): this renders the whole array; a slice such as X1[:5] would
# keep the output short.
X1

array([[  1.    ,   0.1964,   1.2308],
       [  1.    ,   0.2   ,   1.4   ],
       [  1.    ,   0.227 ,   2.669 ],
       [  1.    ,   0.2043,   1.6021],
       [  1.    ,   0.1508,  -0.9124],
       [  1.    ,   0.1727,   0.1169],
       [  1.    ,   0.165 ,  -0.245 ],
       [  1.    ,   0.1609,  -0.4377],
       [  1.    ,   0.1775,   0.3425],
       [  1.    ,   0.0974,  -3.4222],
       [  1.    ,   0.195 ,   1.165 ],
       [  1.    ,   0.187 ,   0.789 ],
       [  1.    ,   0.2717,   4.7699],
       [  1.    ,   0.2208,   2.3776],
       [  1.    ,   0.1443,  -1.2179],
       [  1.    ,   0.1891,   0.8877],
       [  1.    ,   0.415 ,  11.505 ],
       [  1.    ,   0.2661,   4.5067],
       [  1.    ,   0.3183,   6.9601],
       [  1.    ,   0.4358,  12.4826],
       [  1.    ,   0.5217,  16.5199],
       [  1.    ,   0.1822,   0.5634],
       [  1.    ,   0.2217,   2.4199],
       [  1.    ,   0.2667,   4.5349],
       [  1.    ,   0.335 ,   7.745 ],
       [  1.    ,   0.198

In [34]:
from scipy.linalg import lstsq

# Compute OLS using lstsq on the rank-deficient design matrix; besides the
# solution it returns the residues, the effective rank and the singular
# values of X1
w, rss, rank, sv = lstsq(X1, users)
# Show the singular values: in the recorded output the smallest one is
# about 2e-15, i.e. numerically zero
sv

array([  3.26256503e+02,   8.35565257e+00,   1.99632569e-15])

In [35]:
from sklearn.metrics import r2_score

# Reference fit: simple linear regression of users on temp (degree-1 polynomial)
coefs = np.polyfit(temp, users, deg=1)
y_pred_normal = np.polyval(coefs, temp)
r2_normal = r2_score(users, y_pred_normal)
print('R^2 normal:', r2_normal)
# Prints: 0.595423308019

# Same score computed from the lstsq solution on the collinear design matrix
y_pred_collinear = X1 @ w
r2_collinear = r2_score(users, y_pred_collinear)
print('R^2 collinear:', r2_collinear)

R^2 normal: 0.595423308019
R^2 collinear: 0.595423308019


In [36]:
# Second exact linear function of temp_C (the standard Celsius-to-Fahrenheit
# formula, assuming temp_C is in degrees Celsius)
temp_F = 1.8*temp_C + 32

In [43]:
# Draw the noise from a locally seeded generator so the fit below is
# reproducible across kernel restarts
rng = np.random.RandomState(0)
noise = rng.normal(loc=0, scale=0.01, size=temp_C.shape)

# Rebuild temp_F from temp_C instead of `temp_F += noise`: the in-place form
# adds a further layer of noise every time this cell is re-executed
temp_F = (1.8*temp_C + 32) + noise

# Create input matrix X from the two nearly collinear columns
X = np.c_[temp_C, temp_F]

# Compute OLS using lstsq
X1 = np.c_[np.ones(X.shape[0]), X] # Create X1 matrix
w, rss, rank, _ = lstsq(X1, users) # OLS

print('rank:', rank) # Prints: 3
print('RMSE:', np.sqrt(rss/len(users))) # Depends on the noise values drawn
print('w:', w)

rank: 3
RMSE: 231.589457266
w: [-40008.15152619  -2228.50820646   1255.50479871]


In [44]:
# Condition number of the nearly collinear design matrix; the recorded
# value is large (about 9e4)
cn = np.linalg.cond(X1)
cn

92845.029430690964

## Regularization

In [48]:
from sklearn.linear_model import Ridge

# Add small variations, drawn from a locally seeded generator so the Ridge
# results are reproducible; the size comes from temp_C so the cell does not
# depend on a previously mutated temp_F
rng = np.random.RandomState(0)
noise = rng.normal(loc=0, scale=0.01, size=temp_C.shape)
temp_F = (1.8*temp_C + 32) + noise

# Create input matrix X
X = np.c_[temp_C, temp_F]

# Fit a Ridge regression (L2-regularized least squares, strength alpha=100)
ridge = Ridge(alpha=100)
ridge.fit(X, users)

print('Coefficients:', ridge.coef_)
print('Intercept:', ridge.intercept_)
print('R^2:', ridge.score(X, users))

Coefficients: [  7.54549     13.46940862]
Intercept: -270.538735591
R^2: 0.595415425401


In [32]:
# NOTE(review): execution count 32 is out of sequence. The recorded output
# (an empty array) does not match the scalar rss of the last lstsq call above;
# it appears to belong to the rank-deficient fit in the collinearity section —
# confirm, and consider moving this cell next to that fit.
rss

array([], dtype=float64)

In [33]:
# NOTE(review): execution count 33 is out of sequence. The recorded output (2)
# differs from the rank of 3 printed by the last lstsq call above; it appears
# to belong to the rank-deficient fit in the collinearity section — confirm.
rank

2