<a href="https://colab.research.google.com/github/deepanshuMeteor/DataAnalysis-Statitics/blob/main/19_Exercise_5_MLR_train_test_tutor_led_solution_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Multiple Linear Regression - TRAIN / TEST - income example with categorical data

In [None]:
# Import packages
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# The fundamental data structure of pandas is the DataFrame

In [None]:
# Load the income data set (known inputs and outputs) and preview the first rows
data = pd.read_csv(filepath_or_buffer='income_data_categ.csv')
data.head(n=5)

Unnamed: 0,Income,Age,Education,Smoker,Residence
0,113,69,12,1,South
1,91,52,18,0,West
2,121,65,14,0,West
3,81,58,12,0,West
4,68,31,16,1,West


In [None]:
# Store the Residence column with the pandas 'category' dtype
data = data.astype({'Residence': 'category'})

In [None]:
# Set up the regression variables.

# y: the dependent variable (output) - income
y = data.loc[:, 'Income']

# x: the continuous and binary independent variables (inputs) - age, education
# and smoker - with a leading column of ones ('const') added by add_constant
x = sm.add_constant(data.loc[:, ['Age', 'Education', 'Smoker']])

In [None]:
# Preview the first rows of the dependent variable
y.head(n=5)

Unnamed: 0,Income
0,113
1,91
2,121
3,81
4,68


In [None]:
# Preview the first rows of the independent variables (including the constant)
x.head(n=5)

Unnamed: 0,const,Age,Education,Smoker
0,1.0,69,12,1
1,1.0,52,18,0
2,1.0,65,14,0
3,1.0,58,12,0
4,1.0,31,16,1


In [None]:
# One indicator (dummy) column per level of the categorical variable Residence
d_Residence = pd.get_dummies(data=data['Residence'])
d_Residence.iloc[:20]

Unnamed: 0,East,North,South,West
0,False,False,True,False
1,False,False,False,True
2,False,False,False,True
3,False,False,False,True
4,False,False,False,True
5,False,False,False,True
6,False,False,False,True
7,False,False,False,True
8,False,True,False,False
9,False,True,False,False


In [None]:
# We need one fewer dummy variable than the number of levels of the categorical
# variable, otherwise we will introduce multicollinearity: together with the
# constant column, the full set of dummies is perfectly collinear.
# 'East' is dropped, which makes it the reference level.
# drop() returns a new frame and errors='ignore' makes this cell safe to re-run:
# the original in-place drop raised KeyError on a second execution.
d_Residence = d_Residence.drop(columns='East', errors='ignore')
d_Residence.head(20)

Unnamed: 0,North,South,West
0,False,True,False
1,False,False,True
2,False,False,True
3,False,False,True
4,False,False,True
5,False,False,True
6,False,False,True
7,False,False,True
8,True,False,False
9,True,False,False


In [None]:
# Add the dummy variables to x.
# Any dummy columns already present in x are removed first, so re-running this
# cell does not append a second copy of the dummy columns.
x = pd.concat(
    [x.drop(columns=list(d_Residence.columns), errors='ignore'), d_Residence],
    axis=1,
)
x.head()

Unnamed: 0,const,Age,Education,Smoker,North,South,West
0,1.0,69,12,1,False,True,False
1,1.0,52,18,0,False,False,True
2,1.0,65,14,0,False,False,True
3,1.0,58,12,0,False,False,True
4,1.0,31,16,1,False,False,True


In [None]:
# Hold out part of the data for testing: 80% of the rows go to the training set,
# the rest form the testing (hold out) set; the seed keeps the split repeatable
train_x, test_x, train_y, test_y = train_test_split(
    x, y, random_state=123, train_size=0.8
)

In [None]:
# Shapes of the four pieces, in the order train_x, train_y, test_x, test_y
tuple(part.shape for part in (train_x, train_y, test_x, test_y))

((1200, 7), (1200,), (300, 7), (300,))

In [None]:
# Set up the OLS model on the training set (train_y and train_x), cast to float.
# NOTE: sm.OLS takes the dependent variable (endog) first, then the inputs (exog)
model = sm.OLS(endog=train_y.astype(float), exog=train_x.astype(float))

In [None]:
# Fit the model and print the regression summary as text
results = model.fit()
print(results.summary().as_text())

                            OLS Regression Results                            
Dep. Variable:                 Income   R-squared:                       0.791
Model:                            OLS   Adj. R-squared:                  0.790
Method:                 Least Squares   F-statistic:                     751.3
Date:                Wed, 31 Jul 2024   Prob (F-statistic):               0.00
Time:                        19:38:47   Log-Likelihood:                -4355.4
No. Observations:                1200   AIC:                             8725.
Df Residuals:                    1193   BIC:                             8760.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          8.9313      1.878      4.755      0.0

The p-value for Smoker is 0.094 > 0.05, so this variable is not statistically significant at the 5% level and is a poor predictor of income.
It therefore has to be removed from the model.

In [None]:
# Remove Smoker from the inputs.
# drop() returns a new frame and errors='ignore' keeps this cell re-runnable:
# the original in-place drop raised KeyError when executed a second time.
x = x.drop(columns='Smoker', errors='ignore')

In [None]:
# Repeat the train / test (hold out) split on the current x, using the same
# proportion and seed as before, then show the shapes of the four pieces
train_x, test_x, train_y, test_y = train_test_split(
    x, y, random_state=123, train_size=0.8
)
tuple(part.shape for part in (train_x, train_y, test_x, test_y))

In [None]:
# Rebuild the model on the new training set.
# The dummy columns in train_x are boolean, so cast to float exactly as was done
# for the first model: without the cast the mixed numeric/boolean frame is
# converted to an object-dtype array, which sm.OLS rejects.
# NOTE: the first parameter is y, the second is x
model = sm.OLS(train_y.astype(float), train_x.astype(float))

In [None]:
# Fit the rebuilt model and print the regression summary as text
results = model.fit()
print(results.summary().as_text())

In [None]:
# Prediction using the test data.
# test_x contains the same boolean dummy columns as train_x, so it is cast to
# float in the same way as the training data; this keeps the inputs to predict()
# numeric instead of an object-dtype array.
test_y_pred = results.predict(test_x.astype(float))

In [None]:
# Mean Absolute Percentage Error (MAPE): the mean over the test set of
# |actual - predicted| / |actual|, left as a fraction (not multiplied by 100)
MAPE = ((test_y - test_y_pred).abs() / test_y.abs()).mean()
MAPE