## Simple and Multiple Linear Regression 
### price of antique clocks

In [None]:
# Import packages
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# The fundamental data structure of pandas is the DataFrame

In [None]:
# Load the auction records from 'clocks.csv' (resolved relative to the
# current working directory) into a data frame.
data = pd.read_csv('clocks.csv')
# Preview the first rows; as the last expression it becomes the cell output.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded data frame
data.shape

In [None]:
# Column labels; later cells rely on 'Price', 'Age' and 'Bidders'
data.columns

In [None]:
# Correlation coefficient of price with age
# (Series.corr uses the Pearson method unless another one is requested)
data['Price'].corr(data['Age'])

In [None]:
# Correlation coefficient of price with the number of bidders
# (Series.corr uses the Pearson method unless another one is requested)
data['Price'].corr(data['Bidders'])

One correlation is strong, the other is weak

In [None]:
# Scatterplot: age on the horizontal axis, price on the vertical axis
plt.scatter(x=data['Age'], y=data['Price'])
plt.show()

### Build the linear model for price as a function of age

In [None]:
# Set up the regression inputs.

# Response (dependent variable, output): Price
y = data['Price']

# Design matrix: the Age predictor plus a column of ones,
# so that the fitted model includes an intercept term
x = sm.add_constant(data['Age'])

In [None]:
# Preview the first rows of the response (Price)
y.head()

In [None]:
# Preview the first rows of the design matrix (constant column plus Age)
x.head()

In [None]:
# Specify the ordinary least squares model.
# NOTE: statsmodels takes the response (endog) first and the
# design matrix (exog) second.
model = sm.OLS(endog=y, exog=x)

In [None]:
# Estimate the coefficients and print the regression summary table
results = model.fit()
print(results.summary())

The p-value for Age is below 0.05. This variable is statistically significant and can be used as a predictor.

In [None]:
# Fitted model coefficients (intercept and Age slope)
results.params

In [None]:
# Overlay the fitted regression line (red) on the observed points (blue)
ax = plt.gca()
ax.scatter(data['Age'], data['Price'], color='blue')
ax.plot(data['Age'], results.fittedvalues, color='red')
plt.show()

In [None]:
# In-sample prediction: the fitted values are stored on the results object
yp = results.fittedvalues
print('predicted y:',yp,sep='\n')

In [None]:
# In-sample prediction, alternative route: feed the original
# design matrix back through the fitted model
yp = results.predict(exog=x)
print('predicted y:', yp, sep='\n')

In [None]:
# Price of a 120 year old clock, computed by hand from the coefficients.
# Coefficients are looked up by label ('const' is the column name that
# sm.add_constant gives the intercept). Integer keys such as params[0] on a
# string-labelled Series are deprecated in pandas and will be treated as
# labels, so they must not be used for positional access.
clock_age = 120
clock_price = results.params['const'] + results.params['Age'] * clock_age
clock_price

In [None]:
# Out-of-sample prediction for clocks that are 120, 150 and 200 years old
data_new = [ [120], [150], [200] ]
x_new = pd.DataFrame(data_new, columns = ['Age'])
# has_constant='add' always inserts the intercept column. With the default
# ('skip'), add_constant omits it whenever a column of the new data has zero
# variance (e.g. a single clock, or several clocks of the same age), and the
# prediction then fails because the column count no longer matches the model.
x_new = sm.add_constant(x_new, has_constant='add')

y_new = results.predict(x_new)
print('predicted y:',y_new,sep='\n')

### Build the linear model for price as a function of bidders

In [None]:
# Scatterplot of the two candidate predictors against each other:
# age on the horizontal axis, number of bidders on the vertical axis.
# NOTE(review): this section models Price from Bidders, yet the plot does not
# show Price - confirm that Age vs Bidders (rather than Bidders vs Price)
# is the intended view.
plt.scatter(data['Age'], data['Bidders'])
plt.show()

In [None]:
# Set up the regression inputs.

# Response (dependent variable, output): Price
y = data['Price']

# Design matrix: the Bidders predictor plus a column of ones,
# so that the fitted model includes an intercept term
x = sm.add_constant(data['Bidders'])

In [None]:
# Preview the first rows of the response (Price)
y.head()

In [None]:
# Preview the first rows of the design matrix (constant column plus Bidders)
x.head()

In [None]:
# Specify the ordinary least squares model.
# NOTE: statsmodels takes the response (endog) first and the
# design matrix (exog) second.
model = sm.OLS(endog=y, exog=x)

In [None]:
# Estimate the coefficients and print the regression summary table
results = model.fit()
print(results.summary())

In [None]:
# Fitted model coefficients (intercept and Bidders slope)
results.params

In [None]:
# Overlay the fitted regression line (red) on the observed points (blue)
ax = plt.gca()
ax.scatter(data['Bidders'], data['Price'], color='blue')
ax.plot(data['Bidders'], results.fittedvalues, color='red')
plt.show()

In [None]:
# In-sample prediction: the fitted values are stored on the results object
yp = results.fittedvalues
print('predicted y:',yp,sep='\n')

In [None]:
# In-sample prediction, alternative route: feed the original
# design matrix back through the fitted model
yp = results.predict(exog=x)
print('predicted y:', yp, sep='\n')

In [None]:
# Price of a clock with 10 bidders, computed by hand from the coefficients.
# Coefficients are looked up by label ('const' is the column name that
# sm.add_constant gives the intercept). Integer keys such as params[0] on a
# string-labelled Series are deprecated in pandas and will be treated as
# labels, so they must not be used for positional access.
clock_bidders = 10
clock_price = results.params['const'] + results.params['Bidders'] * clock_bidders
clock_price

In [None]:
# Out-of-sample prediction for clocks with 10, 15 and 20 bidders
data_new = [ [10], [15], [20] ]
x_new = pd.DataFrame(data_new, columns = ['Bidders'])
# has_constant='add' always inserts the intercept column. With the default
# ('skip'), add_constant omits it whenever a column of the new data has zero
# variance (e.g. a single clock, or several clocks with the same number of
# bidders), and the prediction then fails because the column count no longer
# matches the model.
x_new = sm.add_constant(x_new, has_constant='add')

y_new = results.predict(x_new)
print('predicted y:',y_new,sep='\n')

### Build the linear model for price as a function of age and bidders

In [None]:
# Set up the regression inputs.

# Response (dependent variable, output): Price
y = data['Price']

# Design matrix: both predictors (Age and Bidders) plus a column of ones,
# so that the fitted model includes an intercept term
x = sm.add_constant(data[['Age', 'Bidders']])

In [None]:
# Preview the first rows of the response (Price)
y.head()

In [None]:
# Preview the first rows of the design matrix (constant, Age and Bidders)
x.head()

In [None]:
# Specify the ordinary least squares model.
# NOTE: statsmodels takes the response (endog) first and the
# design matrix (exog) second.
model = sm.OLS(endog=y, exog=x)

In [None]:
# Estimate the coefficients and print the regression summary table
results = model.fit()
print(results.summary())

#### Both independent variables are statistically significant, and the fit is much better.

In [None]:
# Fitted model coefficients (intercept, Age slope and Bidders slope)
results.params

In [None]:
# In-sample prediction: the fitted values are stored on the results object
yp = results.fittedvalues
print('predicted y:',yp,sep='\n')

In [None]:
# In-sample prediction, alternative route: feed the original
# design matrix back through the fitted model
yp = results.predict(exog=x)
print('predicted y:', yp, sep='\n')

In [None]:
# Price of a 120 year-old clock with 10 bidders, computed by hand from the
# coefficients. Coefficients are looked up by label ('const' is the column
# name that sm.add_constant gives the intercept). Integer keys such as
# params[0] on a string-labelled Series are deprecated in pandas and will be
# treated as labels, so they must not be used for positional access.
clock_age = 120
clock_bidders = 10
clock_price = (
    results.params['const']
    + results.params['Age'] * clock_age
    + results.params['Bidders'] * clock_bidders
)
clock_price

In [None]:
# Out-of-sample prediction for 3 clocks, each given as [age, bidders]
data_new = [ [120,10], [150,15], [200,20] ]
x_new = pd.DataFrame(data_new, columns = ['Age', 'Bidders'])
# has_constant='add' always inserts the intercept column. With the default
# ('skip'), add_constant omits it whenever a column of the new data has zero
# variance (e.g. a single clock, or several clocks that share the same age
# or the same number of bidders), and the prediction then fails because the
# column count no longer matches the model.
x_new = sm.add_constant(x_new, has_constant='add')

y_new = results.predict(x_new)
print('predicted y:',y_new,sep='\n')