<a href="https://colab.research.google.com/github/deepanshuMeteor/DataAnalysis-Statitics/blob/main/18_Exercise_4_MLR_tutor_led_solution_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Multiple Linear Regression - income example with categorical data

In [1]:
# Import packages
import pandas as pd
import statsmodels.api as sm

# The fundamental data type of pandas is the data frame

In [2]:
# Load the data set (known inputs and outputs) from the CSV file
# and preview its first rows
data = pd.read_csv(filepath_or_buffer='income_data_categ.csv')
data.head(n=5)

Unnamed: 0,Income,Age,Education,Smoker,Residence
0,113,69,12,1,South
1,91,52,18,0,West
2,121,65,14,0,West
3,81,58,12,0,West
4,68,31,16,1,West


In [3]:
# Store Residence with the pandas categorical dtype
residence_as_category = data['Residence'].astype('category')
data['Residence'] = residence_as_category

In [4]:
# Split the data into the response and the predictors

# Response (model output) - y - income
y = data['Income']

# Predictors (model inputs) that are continuous or binary - age, education, smoker
predictor_columns = ['Age', 'Education', 'Smoker']
# add_constant adds a column of ones (shown as `const`) to the predictors
x = sm.add_constant(data[predictor_columns])

In [5]:
# Preview the first five values of the response
y.head(n=5)

Unnamed: 0,Income
0,113
1,91
2,121
3,81
4,68


In [6]:
# Preliminary, unfitted OLS specification using the predictors defined so far.
# NOTE(review): MODEL is not referenced by any later visible cell - consider removing it.
MODEL = sm.OLS(y, x)

# The preview must be the last expression of the cell, otherwise the
# notebook does not display it.
x.head()

In [7]:
# One-hot encode Residence: one indicator (dummy) column per region
residence_column = data['Residence']
d_Residence = pd.get_dummies(residence_column)
d_Residence.head(n=20)

Unnamed: 0,East,North,South,West
0,False,False,True,False
1,False,False,False,True
2,False,False,False,True
3,False,False,False,True
4,False,False,False,True
5,False,False,False,True
6,False,False,False,True
7,False,False,False,True
8,False,True,False,False
9,False,True,False,False


In [8]:
# A categorical variable with k levels needs only k - 1 dummy variables;
# keeping all k would introduce multicollinearity.
# East is dropped here, so it becomes the reference level.
# drop() returns a new frame and errors='ignore' turns the call into a
# no-op when East is already gone, so this cell can be re-run safely.
d_Residence = d_Residence.drop(columns='East', errors='ignore')
d_Residence.head(20)

Unnamed: 0,North,South,West
0,False,True,False
1,False,False,True
2,False,False,True
3,False,False,True
4,False,False,True
5,False,False,True
6,False,False,True
7,False,False,True
8,True,False,False
9,True,False,False


In [9]:
# add the dummy variables to x
# Any dummy columns already present in x (e.g. from an earlier run of this
# cell) are removed first, so re-running the cell cannot duplicate them.
dummy_columns = ['North', 'South', 'West']
x = pd.concat(
    [x.drop(columns=dummy_columns, errors='ignore'), d_Residence[dummy_columns]],
    axis=1,
)
x.head()

Unnamed: 0,const,Age,Education,Smoker,North,South,West
0,1.0,69,12,1,False,True,False
1,1.0,52,18,0,False,False,True
2,1.0,65,14,0,False,False,True
3,1.0,58,12,0,False,False,True
4,1.0,31,16,1,False,False,True


In [11]:
# Specify the OLS model (it is fitted in the next cell).
# NOTE: sm.OLS takes the response (y) first and the predictors (x) second.
# Both are cast to float; the dummy columns in x are boolean.
y_float = y.astype(float)
x_float = x.astype(float)
model = sm.OLS(y_float, x_float)

In [12]:
# Fit the model, then print its regression summary
results = model.fit()
summary_table = results.summary()
print(summary_table)

                            OLS Regression Results                            
Dep. Variable:                 Income   R-squared:                       0.794
Model:                            OLS   Adj. R-squared:                  0.793
Method:                 Least Squares   F-statistic:                     957.3
Date:                Wed, 31 Jul 2024   Prob (F-statistic):               0.00
Time:                        19:39:50   Log-Likelihood:                -5438.1
No. Observations:                1500   AIC:                         1.089e+04
Df Residuals:                    1493   BIC:                         1.093e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          9.8758      1.659      5.954      0.0

The p-value for Smoker is 0.17 > 0.05, so this variable is not statistically significant at the 5% level: given the other predictors, it is a poor predictor of income.
We therefore remove it and refit the model.

In [None]:
# remove Smoker
# drop() returns a new frame and errors='ignore' turns the call into a
# no-op when Smoker is already gone, so this cell can be re-run safely.
x = x.drop(columns='Smoker', errors='ignore')
x.head()

In [None]:
# rebuild the model (Smoker has been removed from x)
# x still contains the boolean dummy columns, so both inputs are cast to
# float, exactly as for the first model; without the cast the mixed-dtype
# frame would reach statsmodels as non-numeric data.
model = sm.OLS(y.astype(float), x.astype(float))

In [None]:
# Fit the rebuilt model, then print its regression summary
results = model.fit()
summary_table = results.summary()
print(summary_table)

In [None]:
# In-sample prediction: read the fitted values stored on the results object
yp = results.fittedvalues
print('predicted y:')
print(yp)

In [None]:
# prediction (in sample) another way
# The dummy columns in x are boolean; cast to float so predict() receives
# purely numeric input, consistent with the float cast used when the
# first model was created.
yp = results.predict(x.astype(float))
print('predicted y:',yp,sep='\n')

In [None]:
# prediction (out of sample)
# Each row is one new observation: Age, Education, then the North / South /
# West dummies (all three equal to 0 selects the dropped level, East).
data_new = [ [25, 10, 1, 0, 0], [35, 15, 0, 1, 0], [50, 10, 0, 0, 0] ]
x_new = pd.DataFrame(data_new, columns = ['Age','Education', 'North', 'South', 'West' ])
# has_constant='add' always adds the column of ones. The default ('skip')
# returns the data unchanged when it already contains a constant column,
# which would leave the new data without the intercept column.
x_new = sm.add_constant(x_new, has_constant='add')

y_new = results.predict(x_new)
print('predicted y:',y_new,sep='\n')

In [None]:
# What income would the model predict for a newborn (Age = 0 and Education = 0) ?
# One row per case: North, South, and all dummies 0 (the dropped level, East).
data_new = [ [0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 0] ]
x_new = pd.DataFrame(data_new, columns = ['Age','Education', 'North', 'South', 'West' ])
# has_constant='add' always adds the column of ones. The default ('skip')
# returns the data unchanged when it already contains a constant column,
# which would leave the new data without the intercept column.
x_new = sm.add_constant(x_new, has_constant='add')

y_new = results.predict(x_new)
print('predicted y:',y_new,sep='\n')

In [None]:
# Surely this can't be right ! Or can it ?
# Print the regression summary again to inspect the fitted model
summary_table = results.summary()
print(summary_table)

In [None]:
# The intercept and all coefficients are positive, so for non-negative
# inputs the model will always predict positive values.
# We can "force" the intercept to be 0, i.e. make the model pass through
# the origin of the coordinate system.
# The current model input is:
x.head(n=5)

In [None]:
# To have intercept zero we have to remove the first (const) column.
# drop() returns a new frame and errors='ignore' turns the call into a
# no-op when const is already gone, so this cell can be re-run safely.
x = x.drop(columns='const', errors='ignore')
x.head()

In [None]:
# build the no-intercept model
# x still contains the boolean dummy columns, so both inputs are cast to
# float, exactly as for the first model; without the cast the mixed-dtype
# frame would reach statsmodels as non-numeric data.
model_0 = sm.OLS(y.astype(float), x.astype(float))

In [None]:
# Fit the no-intercept model, then print its regression summary
results_0 = model_0.fit()
summary_table_0 = results_0.summary()
print(summary_table_0)

In [None]:
# However, forcing a zero intercept is not the right thing to do.
# The model was trained on data for people whose ages lie between
age_column = x['Age']
(age_column.min(), age_column.max())

In [None]:
# ... and whose years of education lie between
education_column = x['Education']
(education_column.min(), education_column.max())

In [None]:
# Age = 0 and Education = 0 are far away from the data we used to
# train the model.
# A predictive model can be relied upon only when the input is
# within the range of the training data. It has not "learnt" what
# happens outside this range.