### Multiple Linear Regression - income example with categorical data

In [None]:
# Import packages
import pandas as pd
import statsmodels.api as sm

# The fundamental data structure of pandas is the DataFrame

In [None]:
# Load the data set (known inputs and outputs) from the CSV file
# and preview its first five rows
data = pd.read_csv(filepath_or_buffer='Income_data_categ.csv')
data.head(n=5)

In [None]:
# Tell pandas that Residence is a categorical variable
# (only this column is converted; all other columns keep their dtype)
data = data.astype({'Residence': 'category'})

In [None]:
# Split the data into model inputs and output

# independent variables (inputs) - x - the continuous and binary columns
# Age, Education and Smoker, preceded by a column of ones ('const')
# that lets the model estimate an intercept
x = sm.add_constant(data[['Age', 'Education', 'Smoker']])

# dependent variable (output) - y - Income
y = data['Income']

In [None]:
# preview the first rows of the dependent variable (Income)
y.head()

In [None]:
# preview the first rows of the input matrix (const, Age, Education, Smoker)
x.head()

In [None]:
# create a set of dummy (indicator) variables for the categorical variable Residence:
# one 0/1 column per level.
# dtype=int is requested explicitly: since pandas 2.0 get_dummies returns boolean
# columns by default, and a design matrix that mixes bool with numeric columns is
# converted to an object array, which sm.OLS refuses to fit.
d_Residence = pd.get_dummies(data['Residence'], dtype=int)
d_Residence.head(20)

In [None]:
# A categorical variable with k levels needs only k-1 dummy variables;
# keeping all k would make them sum to the constant column and introduce
# multicollinearity. East is dropped and so becomes the baseline level.
d_Residence.drop('East', axis=1, inplace=True)
d_Residence.head(n=20)

In [None]:
# append the three remaining dummy columns to the right of x
x = pd.concat([x, d_Residence.loc[:, ['North','South','West']]], axis='columns')
x.head(n=5)

In [None]:
# Create the ordinary least squares model (it is fitted in the next step)
# NOTE: OLS takes the output first (endog = y) and the inputs second (exog = x)
model = sm.OLS(endog=y, exog=x)

In [None]:
# Get the results
# Fit the model and print the regression table
# (coefficients, p-values, R-squared, ...)
results = model.fit()
print(results.summary())

The p-value for Smoker is 0.17 > 0.05, so this variable is not statistically significant at the 5% level: it is a poor predictor of Income once the other variables are in the model.
We therefore remove it and refit the model.

In [None]:
# drop the Smoker column from the input matrix (x is modified in place)
x.drop(columns='Smoker', inplace=True)
x.head(n=5)

In [None]:
# build the model again, now on the inputs without Smoker
model = sm.OLS(endog=y, exog=x)

In [None]:
# Get the results
# Fit the reduced model (without Smoker) and print its regression table
results = model.fit()
print(results.summary())

In [None]:
# in-sample prediction: the fitted values stored on the results object,
# i.e. the model output for the training rows themselves
yp = results.fittedvalues
print('predicted y:')
print(yp)

In [None]:
# in-sample prediction, second approach: pass the training inputs to predict()
yp = results.predict(exog=x)
print('predicted y:')
print(yp)

In [None]:
# prediction (out of sample)
# One row per new person. The columns are listed in the same order as in the
# training matrix x: Age, Education, then the Residence dummies North, South,
# West (all three dummies equal to 0 means the dropped baseline level, East).
data_new = [ [25, 10, 1, 0, 0], [35, 15, 0, 1, 0], [50, 10, 0, 0, 0] ]
x_new = pd.DataFrame(data_new, columns = ['Age','Education', 'North', 'South', 'West' ])
# has_constant='add' always inserts the 'const' column. With the default
# ('skip') add_constant silently leaves it out whenever some column of x_new
# holds one repeated non-zero value (e.g. every new person has the same Age),
# and the new inputs would then no longer match the fitted model.
x_new = sm.add_constant(x_new, has_constant='add')

y_new = results.predict(x_new)
print('predicted y:',y_new,sep='\n')

In [None]:
# What income would the model predict for a newborn (Age = 0 and Education = 0) ?
# One row per Residence: North, South, and East (all dummies 0 = baseline level).
data_new = [ [0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 0] ]
x_new = pd.DataFrame(data_new, columns = ['Age','Education', 'North', 'South', 'West' ])
# has_constant='add' always inserts the 'const' column, regardless of the
# values in x_new. The default ('skip') leaves it out when it finds a column
# with one repeated non-zero value, which would make the new inputs
# inconsistent with the fitted model.
x_new = sm.add_constant(x_new, has_constant='add')

y_new = results.predict(x_new)
print('predicted y:',y_new,sep='\n')

In [None]:
# Surely this can't be right! Or can it?
# Let's check the model: print the regression table of the fitted model again
# to inspect the intercept and the coefficients.
print(results.summary())

In [None]:
# The intercept and all coefficients are positive.
# Hence, for non-negative inputs (ages, years of education, 0/1 dummies),
# the model will always predict positive values.
# We can "force" the model to have intercept = 0, i.e. to pass
# through the origin of the coordinate system.
# This is our model input now (note the leading 'const' column):
x.head()

In [None]:
# A model without the column of ones has no intercept term,
# so remove the first (const) column from x (in place):
x.drop(columns='const', inplace=True)
x.head(n=5)

In [None]:
# build the model without an intercept (x no longer contains 'const')
model_0 = sm.OLS(endog=y, exog=x)

In [None]:
# Get the results
# Fit the no-intercept model and print its regression table.
# NOTE(review): statsmodels documents that, when the constant is omitted,
# R-squared is computed from the uncentered total sum of squares, so it is
# not directly comparable with the R-squared of the intercept model above -
# confirm against the RegressionResults.rsquared documentation.
results_0 = model_0.fit()
print(results_0.summary())

In [None]:
# However, this is not the right thing to do.
# The data we have used to train the model is for people of ages between
# (displays the pair: smallest Age, largest Age in the training inputs)
x['Age'].min(), x['Age'].max()

In [None]:
# ... and education years between
# (displays the pair: smallest Education, largest Education in the training inputs)
x['Education'].min(), x['Education'].max()

In [None]:
# Age = 0 and Education = 0 are far outside the range of the data we used
# to train the model.
# A predictive model can be relied upon only when the input is
# within the range of the training data. It has not "learnt" what
# happens outside this range, so predictions made there (extrapolation)
# should not be trusted.