In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso
import statsmodels.api as sm
from sklearn import metrics
from sklearn import linear_model
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

import warnings

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas / scikit-learn deprecation and convergence warnings.
warnings.filterwarnings("ignore")
np.random.seed(1907)

df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

print("The data size:", df.shape)

# Coerce TotalCharges to a numeric dtype; entries that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Encode Churn as 1 for 'Yes' (the event) and 0 for every other value
# (the censored observations).
df['Churn'] = (df['Churn'] == 'Yes').astype('int64')

# Impute the NaNs produced above with the column median.
# Assign the result back instead of calling fillna(inplace=True) on the
# attribute-accessed Series: that chained in-place form is deprecated and does
# not modify `df` once pandas Copy-on-Write is active.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

The data size: (7043, 21)


In [2]:
# Remove the customerID column before modelling, then preview the first rows.
df = df.drop(columns=['customerID'])

df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [3]:
# One-hot encode the object/category columns; numeric columns pass through.
# The dummy dtype is pinned to uint8 (the historical default): pandas >= 2.0
# emits bool dummies, and a frame mixing bool with numeric columns converts to
# an object array, which statsmodels' OLS refuses to fit.
df = pd.get_dummies(df, dtype=np.uint8)
print("The data size:", df.shape)

The data size: (7043, 46)


In [4]:
# Regression target is TotalCharges; it and MonthlyCharges are removed from
# the feature matrix.
labels = np.array(df['TotalCharges'])
df = df.drop(columns=['TotalCharges', 'MonthlyCharges'])

# shuffle=False gives a positional split: the first 75% of rows train the
# model and the last 25% are held out.
train_x, test_x, train_y, test_y = train_test_split(
    df, labels, test_size=0.25, shuffle=False
)

print(train_x.shape)
print(test_x.shape)

# has_constant='add' forces an intercept column onto both splits. With the
# default ('skip') the column is silently omitted whenever a split already
# contains a constant non-zero column, which would leave train and test with
# different design matrices.
train_xC = sm.add_constant(train_x, has_constant='add')
test_xC = sm.add_constant(test_x, has_constant='add')

# Ordinary least squares on every available feature.
reg1 = sm.OLS(train_y, train_xC)
results = reg1.fit()

print(results.summary())

# Evaluate on the held-out rows.
testpred = results.predict(test_xC)

print ("OLS Test MSE: ", metrics.mean_squared_error(test_y, testpred))

(5282, 44)
(1761, 44)
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.906
Model:                            OLS   Adj. R-squared:                  0.905
Method:                 Least Squares   F-statistic:                     2295.
Date:                Tue, 27 Oct 2020   Prob (F-statistic):               0.00
Time:                        17:36:52   Log-Likelihood:                -42073.
No. Observations:                5282   AIC:                         8.419e+04
Df Residuals:                    5259   BIC:                         8.434e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------

In [10]:
# Recursive feature elimination around a plain least-squares estimator,
# stopping once 28 features remain.
ols = LinearRegression()
rfe = RFE(estimator=ols, n_features_to_select=28)

# Fit the selector on the training split, then project that split onto the
# retained columns.
train_x_rfe = rfe.fit(train_x, train_y).transform(train_x)

# Boolean mask over train_x's columns: True marks a retained feature.
print(rfe.support_)

[False False False False False False False False False  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False False False False False False]


In [11]:
# Restrict both splits to the columns retained by RFE (boolean column mask).
train_x_RFE = train_x.loc[:, rfe.support_]
test_x_RFE = test_x.loc[:, rfe.support_]


print(train_x_RFE.shape)
print(test_x_RFE.shape)

# has_constant='add' guarantees that both splits receive the intercept column;
# the default ('skip') omits it when a split already holds a constant non-zero
# column, which would desynchronise the train and test design matrices.
train_xCRFE = sm.add_constant(train_x_RFE, has_constant='add')
test_xCRFE = sm.add_constant(test_x_RFE, has_constant='add')

# Refit OLS on the RFE-selected subset.
reg2 = sm.OLS(train_y, train_xCRFE)
results = reg2.fit()

print(results.summary())

# Evaluate on the held-out rows.
testpred2 = results.predict(test_xCRFE)

print ("OLS Test MSE: ", metrics.mean_squared_error(test_y, testpred2))


(5282, 28)
(1761, 28)
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.740
Method:                 Least Squares   F-statistic:                     1255.
Date:                Tue, 27 Oct 2020   Prob (F-statistic):               0.00
Time:                        17:45:22   Log-Likelihood:                -44742.
No. Observations:                5282   AIC:                         8.951e+04
Df Residuals:                    5269   BIC:                         8.960e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------

In [7]:
# Cross-validated recursive feature elimination: remove one feature per step
# and score each candidate subset by negative mean squared error.
rfecv = RFECV(estimator=ols, scoring='neg_mean_squared_error', step=1)

# Fit the selector on the training split, then keep only the chosen columns.
train_x_rfecv = rfecv.fit(train_x, train_y).transform(train_x)

# Boolean mask over train_x's columns: True marks a retained feature.
print(rfecv.support_)

[False  True  True False False False False False False  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False False  True False  True  True]


In [8]:
# Number of features kept by the fitted cross-validated selector `rfecv`.
rfecv.n_features_

33

In [9]:
# Restrict both splits to the columns retained by RFECV (boolean column mask).
train_x_RFECV = train_x.loc[:, rfecv.support_]
test_x_RFECV = test_x.loc[:, rfecv.support_]


print(train_x_RFECV.shape)
print(test_x_RFECV.shape)

# has_constant='add' guarantees that both splits receive the intercept column;
# the default ('skip') omits it when a split already holds a constant non-zero
# column, which would desynchronise the train and test design matrices.
train_xCRFECV = sm.add_constant(train_x_RFECV, has_constant='add')
test_xCRFECV = sm.add_constant(test_x_RFECV, has_constant='add')

# Refit OLS on the RFECV-selected subset.
reg3 = sm.OLS(train_y, train_xCRFECV)
results = reg3.fit()

print(results.summary())

# Evaluate on the held-out rows.
testpred3 = results.predict(test_xCRFECV)

print ("OLS Test MSE: ", metrics.mean_squared_error(test_y, testpred3))


(5282, 33)
(1761, 33)
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.906
Model:                            OLS   Adj. R-squared:                  0.905
Method:                 Least Squares   F-statistic:                     2971.
Date:                Tue, 27 Oct 2020   Prob (F-statistic):               0.00
Time:                        17:41:09   Log-Likelihood:                -42074.
No. Observations:                5282   AIC:                         8.418e+04
Df Residuals:                    5264   BIC:                         8.430e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------

In [None]:
# Tune the Ridge penalty strength with a 4-fold grid search scored by
# negative mean squared error.
ridge = Ridge()

parameters = {'alpha':[1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 3, 5, 10, 15, 20]}

ridge_regressor = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', cv=4)
ridge_regressor.fit(train_xC, train_y)

print(ridge_regressor.best_params_)
print(ridge_regressor.best_score_)

# Predict from the DataFrame rather than `.values`: the search was fitted on a
# DataFrame, so passing the same type keeps scikit-learn's feature-name
# consistency check active instead of bypassing it with a bare array.
ridge_pred = ridge_regressor.predict(test_xC)

# Held-out MAE, MSE and RMSE (MSE computed once and reused for the root).
ridge_mse = metrics.mean_squared_error(test_y, ridge_pred)
print(metrics.mean_absolute_error(test_y, ridge_pred))
print(ridge_mse)
print(np.sqrt(ridge_mse))

In [None]:
# Tune the Lasso penalty strength with a 4-fold grid search scored by
# negative mean squared error.
lasso = Lasso()

parameters = {'alpha':[1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 3, 5, 10, 15, 20]}

lasso_regressor = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv=4)
lasso_regressor.fit(train_xC, train_y)

print(lasso_regressor.best_params_)
print(lasso_regressor.best_score_)

# Predict from the DataFrame rather than `.values`: the search was fitted on a
# DataFrame, so passing the same type keeps scikit-learn's feature-name
# consistency check active instead of bypassing it with a bare array.
lasso_pred = lasso_regressor.predict(test_xC)

# Held-out MAE, MSE and RMSE (MSE computed once and reused for the root).
lasso_mse = metrics.mean_squared_error(test_y, lasso_pred)
print(metrics.mean_absolute_error(test_y, lasso_pred))
print(lasso_mse)
print(np.sqrt(lasso_mse))

In [None]:
# Tune the ElasticNet penalty strength with a 4-fold grid search scored by
# negative mean squared error (l1_ratio stays at the estimator's default).
elastic = ElasticNet()

parameters = {'alpha':[1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 3, 5, 10, 15, 20]}

elastic_regressor = GridSearchCV(elastic, parameters, scoring='neg_mean_squared_error', cv=4)
elastic_regressor.fit(train_xC, train_y)

print(elastic_regressor.best_params_)
print(elastic_regressor.best_score_)

# Predict from the DataFrame rather than `.values`: the search was fitted on a
# DataFrame, so passing the same type keeps scikit-learn's feature-name
# consistency check active instead of bypassing it with a bare array.
elastic_pred = elastic_regressor.predict(test_xC)

# Held-out MAE, MSE and RMSE (MSE computed once and reused for the root).
elastic_mse = metrics.mean_squared_error(test_y, elastic_pred)
print(metrics.mean_absolute_error(test_y, elastic_pred))
print(elastic_mse)
print(np.sqrt(elastic_mse))