### Multiple Linear Regression - TRAIN / TEST - income example with categorical data

In [None]:
# Import packages
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# The fundamental data type of pandas is the data frame

In [None]:
# Load the income dataset (known inputs and outputs) from the CSV file
data = pd.read_csv(filepath_or_buffer='Income_data_categ.csv')
# Preview the first five rows
data.head(n=5)

In [None]:
# Convert the Residence column to pandas' (unordered) categorical dtype
data['Residence'] = data['Residence'].astype(pd.CategoricalDtype())

In [None]:
# Define the model variables

# Dependent variable (output) y: income
y = data.loc[:, 'Income']

# Continuous and binary independent variables (inputs) x: age, education, smoker,
# plus a column of ones so that the regression includes an intercept term
x = sm.add_constant(data[['Age', 'Education', 'Smoker']])

In [None]:
# Preview the first five values of the dependent variable
y.head(n=5)

In [None]:
# Preview the first five rows of the independent variables (including the constant)
x.head(n=5)

In [None]:
# Create a set of dummy (indicator) variables for the categorical variable Residence.
# dtype=int forces 0/1 integer columns: pandas >= 2.0 returns boolean dummies by default,
# and mixing bool with numeric columns makes the design matrix object-typed, which
# statsmodels' OLS rejects.
d_Residence = pd.get_dummies(data['Residence'], dtype=int)
d_Residence.head(20)

In [None]:
# We need one fewer dummy variable than the number of levels of the categorical variable;
# otherwise the dummies sum to the constant column and introduce perfect multicollinearity.
# 'East' is dropped here and so becomes the reference level.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell safe to re-run:
# a second run is a no-op rather than a KeyError.
d_Residence = d_Residence.drop(columns='East', errors='ignore')
d_Residence.head(20)

In [None]:
# Add the dummy variables to x.
# Any dummy columns already present in x are removed first, so re-running this cell
# replaces them instead of appending duplicate columns to the design matrix.
x = pd.concat([x.drop(columns=d_Residence.columns, errors='ignore'), d_Residence], axis=1)
x.head()

In [None]:
# Split into a training set (80% of the rows) and a testing (hold-out) set (the rest).
# The fixed random_state makes the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=123,
)

In [None]:
# Shapes of the four pieces, in the order: train_x, train_y, test_x, test_y
tuple(part.shape for part in (train_x, train_y, test_x, test_y))

In [None]:
# Set up the OLS model on the training set only (train_x and train_y).
# NOTE: statsmodels takes the dependent variable (endog, y) first, then the inputs (exog, x).
model = sm.OLS(endog=train_y, exog=train_x)

In [None]:
# Fit the model and print the regression summary as plain text
results = model.fit()
print(results.summary().as_text())

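The p-value for Smoker is 0.094, which is greater than the 0.05 significance level, so Smoker is not statistically significant in this model.
Because it does not contribute reliably to the prediction, we remove it and refit the model.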

In [None]:
# Remove Smoker from the inputs.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps this cell safe to
# re-run: once Smoker is gone, a second run is a no-op rather than a KeyError.
x = x.drop(columns='Smoker', errors='ignore')

In [None]:
# Redo the training / testing (hold-out) split on the current inputs, with the same
# train fraction and seed as the earlier split, then show the shapes of the four pieces
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=123,
)
tuple(part.shape for part in (train_x, train_y, test_x, test_y))

In [None]:
# Rebuild the OLS model on the new training set
model = sm.OLS(endog=train_y, exog=train_x)

In [None]:
# Fit the rebuilt model and print its regression summary as plain text
results = model.fit()
print(results.summary().as_text())

In [None]:
# Predict the dependent variable for the held-out test inputs
test_y_pred = results.predict(exog=test_x)

In [None]:
# Mean Absolute Percentage Error (MAPE): the mean of |actual - predicted| / |actual|
# over the test set, expressed as a fraction (multiply by 100 for a percentage)
MAPE = ((test_y - test_y_pred).abs() / test_y.abs()).mean()
MAPE