# Predicting Consumer Demand using RidgeCV, LassoCV and ElasticNetCV

#### Import the necessary libraries and dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the customer extract from the mounted Drive folder and preview the first rows.
data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Machine Learnings/customers.csv"
)
data.head()

Unnamed: 0,CustomerKey,BirthDate,Education,Gender,MaritalStatus,Occupation,Sum of YearlyIncome,Sum of DiscountAmount,Sum of DiscountQuantity,Sum of SalesAmount,Sum of SalesQuantity,Average of UnitPrice
0,1,1966-04-08 00:00:00,Bachelors,M,M,Professional,90000,,,,,
1,2,1965-05-14 00:00:00,Bachelors,M,S,Professional,60000,,,,,
2,3,1965-08-12 00:00:00,Bachelors,M,M,Professional,60000,41.99,4.0,197.97,4.0,59.99
3,4,1968-02-15 00:00:00,Bachelors,F,S,Professional,70000,0.0,0.0,59.99,1.0,59.99
4,5,1968-08-08 00:00:00,Bachelors,F,S,Professional,80000,21.0,2.0,158.97,3.0,59.99


#### Exploratory Data Analysis

In [None]:
# Summarize column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18484 entries, 0 to 18483
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   CustomerKey              18484 non-null  int64  
 1   BirthDate                18484 non-null  object 
 2   Education                18484 non-null  object 
 3   Gender                   18484 non-null  object 
 4   MaritalStatus            18484 non-null  object 
 5   Occupation               18484 non-null  object 
 6   Sum of YearlyIncome      18484 non-null  int64  
 7   Sum of DiscountAmount    17647 non-null  float64
 8   Sum of DiscountQuantity  17647 non-null  float64
 9   Sum of SalesAmount       17647 non-null  float64
 10  Sum of SalesQuantity     17647 non-null  float64
 11  Average of UnitPrice     17647 non-null  float64
dtypes: float64(5), int64(2), object(5)
memory usage: 1.7+ MB


#### Feature engineering

* Rename the aggregated columns ('Sum of ...' and 'Average of ...') to short names
* Generate Age of the consumer
* Remove null values

In [None]:
# Map the verbose aggregated column names to short names used in the rest of the notebook.
newnames = {
    'Sum of YearlyIncome': 'Income',
    'Sum of DiscountAmount': 'DiscountAmt',
    'Sum of DiscountQuantity': 'DiscountQty',
    'Average of UnitPrice': 'Price',
    'Sum of SalesAmount': 'Sales',
    'Sum of SalesQuantity': 'Quantity',
}
data.rename(columns=newnames, inplace=True)

#### Generate age of the consumer

In [None]:
import datetime

def from_dob_to_age(born, today=None):
    """Return the age in completed years for a date of birth.

    Parameters
    ----------
    born : object with ``year``, ``month`` and ``day`` attributes
        Date of birth (e.g. ``datetime.date`` or ``pandas.Timestamp``).
    today : object with ``year``, ``month`` and ``day`` attributes, optional
        Reference date the age is measured at. Defaults to the current local
        date; pass a fixed date to make the derived ages reproducible.

    Returns
    -------
    int
        Number of birthdays that have occurred on or before ``today``.
    """
    if today is None:
        today = datetime.date.today()
    # The comparison is True (== 1) when this year's birthday has not happened yet,
    # in which case one year is taken off the plain year difference.
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_pending

In [None]:
# Parse the birth dates into timestamps, then derive each customer's age in years.
data['BirthDate'] = pd.to_datetime(data['BirthDate'])
data['Age'] = data['BirthDate'].apply(from_dob_to_age)

#### Drop missing values

In [None]:
# Keep only the rows with no missing value in any column, then re-check the schema.
df = data.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17647 entries, 2 to 18483
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   CustomerKey    17647 non-null  int64         
 1   BirthDate      17647 non-null  datetime64[ns]
 2   Education      17647 non-null  object        
 3   Gender         17647 non-null  object        
 4   MaritalStatus  17647 non-null  object        
 5   Occupation     17647 non-null  object        
 6   Income         17647 non-null  int64         
 7   DiscountAmt    17647 non-null  float64       
 8   DiscountQty    17647 non-null  float64       
 9   Sales          17647 non-null  float64       
 10  Quantity       17647 non-null  float64       
 11  Price          17647 non-null  float64       
 12  Age            17647 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(3), object(4)
memory usage: 1.9+ MB


#### Create features for machine learning

In [None]:
# create numerical and categorical variables

# Split the columns by dtype: object (string) columns vs. everything else.
df_obj = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

In [None]:
# Convert df_obj to dummy variables

# One-hot encode the object columns; the first level of each column is dropped
# and the new columns are named "<column>*<level>".
df_obj = pd.get_dummies(df_obj, prefix_sep='*', drop_first=True)

In [None]:
# Preview the dummy-encoded columns.
df_obj.head()

Unnamed: 0,Education*Graduate Degree,Education*High School,Education*Partial College,Education*Partial High School,Gender*M,MaritalStatus*S,Occupation*Management,Occupation*Manual,Occupation*Professional,Occupation*Skilled Manual
2,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,1,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0
5,0,0,0,0,1,1,0,0,1,0
6,0,0,0,0,0,1,0,0,1,0


In [None]:
# combine numerical and categorical data

# Place the non-object columns and the dummy columns side by side (aligned on the row index).
df_data = pd.concat(objs=[df_num, df_obj], axis='columns')

In [None]:
# (rows, columns) of the combined frame.
df_data.shape

(17647, 19)

In [None]:
# Check dtypes and non-null counts of the combined frame.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17647 entries, 2 to 18483
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   CustomerKey                    17647 non-null  int64         
 1   BirthDate                      17647 non-null  datetime64[ns]
 2   Income                         17647 non-null  int64         
 3   DiscountAmt                    17647 non-null  float64       
 4   DiscountQty                    17647 non-null  float64       
 5   Sales                          17647 non-null  float64       
 6   Quantity                       17647 non-null  float64       
 7   Price                          17647 non-null  float64       
 8   Age                            17647 non-null  int64         
 9   Education*Graduate Degree      17647 non-null  uint8         
 10  Education*High School          17647 non-null  uint8         
 11  Education*Parti

In [None]:
# Give the dummy columns short names; the result is a new frame, df_data is left untouched.
namesdf = {
    'Education*Graduate Degree': 'graduate',
    'Education*High School': 'HighSchool',
    'Education*Partial College': 'PartialCollege',
    'Education*Partial High School': 'PartialHighSc',
    'Gender*M': 'Male',
    'MaritalStatus*S': 'Single',
    'Occupation*Management': 'ManagerCdr',
    'Occupation*Manual': 'ManualCdr',
    'Occupation*Professional': 'ProfessionalCdr',
    'Occupation*Skilled Manual': 'SkilledManualCdr',
}

df_data1 = df_data.rename(columns=namesdf)

In [None]:
# Confirm the renamed columns.
df_data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17647 entries, 2 to 18483
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   CustomerKey       17647 non-null  int64         
 1   BirthDate         17647 non-null  datetime64[ns]
 2   Income            17647 non-null  int64         
 3   DiscountAmt       17647 non-null  float64       
 4   DiscountQty       17647 non-null  float64       
 5   Sales             17647 non-null  float64       
 6   Quantity          17647 non-null  float64       
 7   Price             17647 non-null  float64       
 8   Age               17647 non-null  int64         
 9   graduate          17647 non-null  uint8         
 10  HighSchool        17647 non-null  uint8         
 11  PartialCollege    17647 non-null  uint8         
 12  PartialHighSc     17647 non-null  uint8         
 13  Male              17647 non-null  uint8         
 14  Single            1764

In [None]:
# Target: the Sales column.
y = df_data1['Sales']
# Predictors: everything except the customer id, the raw birth date, the target itself
# and the two quantity columns.
X = df_data1.drop(columns=['CustomerKey','BirthDate','DiscountQty','Sales','Quantity'])

### Create interactions among the variables

In consumer behavior, demand is determined not only by each variable on its own but also by how it interacts with other socio-demographic variables.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Expand X with all pairwise interaction terms (degree 2, no squared terms, no bias column).
PolyFeat = PolynomialFeatures(degree=2,interaction_only=True,include_bias=False)
interaction_values = PolyFeat.fit_transform(X)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available
# and fall back to the old method on older installations.
if hasattr(PolyFeat, 'get_feature_names_out'):
    interaction_names = PolyFeat.get_feature_names_out(X.columns)
else:
    interaction_names = PolyFeat.get_feature_names(X.columns)
# Keep the original row index so X stays label-aligned with y.
# NOTE: this cell rebinds X, so re-running it without re-running the cell that defines X
# would build interactions of interactions.
X = pd.DataFrame(data=interaction_values,columns=interaction_names,index=X.index)

In [None]:
# (rows, columns) after adding the interaction terms.
X.shape

(17647, 105)

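We now have 105 columns for predicting sales. Next we fit regularized estimators (ridge, lasso and elastic net); the L1-penalized ones (lasso and elastic net) also perform feature selection by shrinking some coefficients exactly to zero.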

#### Split the data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

#### Transform the datasets

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler object used below to transform the train and test feature matrices.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply that same fitted transform
# to both splits so the test rows do not influence the scaling statistics.
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

#### Modelling

* create Ridge Regression
* create Lasso Regression
* Create ElasticNet

### Ridge Regression Modeling

In [None]:
from sklearn.linear_model import RidgeCV
# Ridge regression whose penalty strength is chosen among three candidate alphas,
# scored by (negated) mean absolute error.
ridge_cv_model = RidgeCV(
    alphas=(0.1, 1.0, 10.0),
    scoring='neg_mean_absolute_error',
)

In [None]:
ridge_cv_model.fit(X_train,y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), scoring='neg_mean_absolute_error')

In [None]:
r_train_predictions = ridge_cv_model.predict(X_train)
r_test_predictions = ridge_cv_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [None]:
# Ridge performance on the train and test splits.

# Mean absolute error
MAE_tr = mean_absolute_error(y_train, r_train_predictions)
MAE_tt = mean_absolute_error(y_test, r_test_predictions)

# Mean squared error
MSE_tr = mean_squared_error(y_train, r_train_predictions)
MSE_tt = mean_squared_error(y_test, r_test_predictions)

# Root mean squared error (square root of the MSE values above)
RMSEr_tr = np.sqrt(MSE_tr)
RMSEr_tt = np.sqrt(MSE_tt)

### Modelling with Lasso Regression

In [None]:
from sklearn.linear_model import LassoCV

In [None]:
lasso_cv_model = LassoCV(eps=0.1,n_alphas=100,cv=5)
lasso_cv_model.fit(X_train,y_train)

LassoCV(cv=5, eps=0.1)

In [None]:
l_train_predictions = lasso_cv_model.predict(X_train)
l_test_predictions = lasso_cv_model.predict(X_test)

In [None]:
# Performance Evaluation on Train and Test datasets

# Lasso performance on the train and test splits.

# Mean absolute error
MAE_tr_l = mean_absolute_error(y_train, l_train_predictions)
MAE_tt_l = mean_absolute_error(y_test, l_test_predictions)

# Mean squared error
MSE_tr_l = mean_squared_error(y_train, l_train_predictions)
MSE_tt_l = mean_squared_error(y_test, l_test_predictions)

# Root mean squared error (square root of the MSE values above)
RMSEr_tr_l = np.sqrt(MSE_tr_l)
RMSEr_tt_l = np.sqrt(MSE_tt_l)

### Modelling with Elasticnet model

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
elastic_model = ElasticNetCV(l1_ratio=[.1, .5, .7,.9, .95, .99, 1],tol=0.01)
elastic_model.fit(X_train,y_train)

ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], tol=0.01)

In [None]:
e_train_predictions = elastic_model.predict(X_train)
e_test_predictions = elastic_model.predict(X_test)

In [None]:
# Performance Evaluation on Train and Test datasets

# Elastic-net performance on the train and test splits.

# Mean absolute error
MAE_tr_e = mean_absolute_error(y_train, e_train_predictions)
MAE_tt_e = mean_absolute_error(y_test, e_test_predictions)

# Mean squared error
MSE_tr_e = mean_squared_error(y_train, e_train_predictions)
MSE_tt_e = mean_squared_error(y_test, e_test_predictions)

# Root mean squared error (square root of the MSE values above)
RMSEr_tr_e = np.sqrt(MSE_tr_e)
RMSEr_tt_e = np.sqrt(MSE_tt_e)

### Comparing performance metrics

In [None]:
# Mean absolute error per model: one row per model, one column per split.
metrics_mae = {
    'Train': [MAE_tr, MAE_tr_l, MAE_tr_e],
    'Test': [MAE_tt, MAE_tt_l, MAE_tt_e],
}
results_mae = pd.DataFrame(
    data=metrics_mae,
    index=['Ridgecv','Lassocv','Elasticnetcv'],
)
results_mae

Unnamed: 0,Train,Test
Ridgecv,3972.750248,3951.600908
Lassocv,78.582601,76.641858
Elasticnetcv,77.423902,75.773225


In [None]:
# Root mean squared error per model: one row per model, one column per split.
metrics_rmsq = {'Train':[RMSEr_tr,RMSEr_tr_l,RMSEr_tr_e],'Test':[RMSEr_tt,RMSEr_tt_l,RMSEr_tt_e]}
# Build the table from the RMSE dict (it was previously built from metrics_mae,
# which made this table a copy of the MAE table).
results_rmsq = pd.DataFrame(metrics_rmsq,index=['Ridgecv','Lassocv','Elasticnetcv'])
results_rmsq

Unnamed: 0,Train,Test
Ridgecv,3972.750248,3951.600908
Lassocv,78.582601,76.641858
Elasticnetcv,77.423902,75.773225


### Features and coefficients utilized by the models

In [None]:
## RidgeCV coefficients

# One row per feature, labelled with the feature name.
rcvcoef_df = pd.DataFrame(
    data=ridge_cv_model.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
# Keep only the positively weighted features, largest coefficient first.
positive_mask = rcvcoef_df['Coefficient'] > 0
rcv = rcvcoef_df[positive_mask].sort_values(by='Coefficient', ascending=False)

In [None]:
# Ten largest positive ridge coefficients.
rcv.head(10)

Unnamed: 0,Coefficient
ProfessionalCdr,947.759649
SkilledManualCdr,474.988969
PartialCollege,90.128788
HighSchool,86.95375
PartialHighSc,67.629471
PartialCollege ProfessionalCdr,46.854181
graduate ManualCdr,34.405723
graduate ProfessionalCdr,31.929519
Single,29.169763
PartialHighSc Single,22.486721


In [None]:
## LassoCV coefficients

# One row per feature, labelled with the feature name.
lassocoef_df = pd.DataFrame(lasso_cv_model.coef_,X.columns,columns=['Coefficient'])
# A feature is used by the lasso model when its coefficient is non-zero; filtering on
# "> 0" would hide the features that enter the model with a negative weight.
lasso_var = lassocoef_df[lassocoef_df['Coefficient']!=0]

In [None]:
# Show the lasso coefficient table built in the previous cell.
lasso_var

Unnamed: 0,Coefficient
Income DiscountAmt,6.2e-05


In [None]:
# Raw lasso coefficient vector, in the column order of X.
lasso_cv_model.coef_

array([-0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  6.22995297e-05, -1.95036439e-05,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  

In [None]:
## ElasticNetCV coefficients

# One row per feature, labelled with the feature name.
elastic_df = pd.DataFrame(elastic_model.coef_,X.columns,columns=['Coefficient'])
# A feature is used by the elastic-net model when its coefficient is non-zero; filtering
# on "> 0" would hide the features that enter the model with a negative weight.
elastic_var = elastic_df[elastic_df['Coefficient']!=0].sort_values('Coefficient',ascending=False)

In [None]:
# Show the elastic-net coefficient table built in the previous cell.
elastic_var

Unnamed: 0,Coefficient
Income HighSchool,0.000193
Income SkilledManualCdr,8.9e-05
Income DiscountAmt,7.4e-05
Income PartialCollege,2.9e-05
Income Age,1.7e-05
