In [19]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso
import statsmodels.api as sm
from sklearn import metrics
from sklearn import linear_model
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation notices; consider narrowing the filter.
warnings.filterwarnings("ignore")
np.random.seed(1907)

df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

print("The data size:", df.shape)

## Convert TotalCharges to numeric; values that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

## Encode Churn as 1 for 'Yes' (the event) and 0 for anything else (censored data).
df['Churn'] = (df['Churn'] == 'Yes').astype('int64')

## Impute the missing TotalCharges values with the column median.
## The result is assigned back to the column instead of calling
## fillna(inplace=True) on `df.TotalCharges`: an in-place call on a column
## accessor is a chained assignment, which pandas deprecates and which leaves
## `df` unchanged when Copy-on-Write is enabled.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

The data size: (7043, 21)


In [20]:
# Drop the customerID column before the features are encoded.
# errors='ignore' keeps this cell safe to re-run: without it a second run
# raises KeyError because the column has already been removed from `df`.
df = df.drop(columns='customerID', errors='ignore')

df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [21]:
# One-hot encode the non-numeric columns of `df`.
# The dummy dtype is pinned to uint8 (the default before pandas 2.0). Newer
# pandas emits bool dummies, and a frame mixing bool and numeric columns is
# converted to an object array, which statsmodels' OLS refuses to fit.
df = pd.get_dummies(df, dtype=np.uint8)
print("The data size:", df.shape)

The data size: (7043, 46)


In [22]:
# Regression target is TotalCharges; MonthlyCharges is removed from the
# feature matrix together with the target.
labels = np.array(df['TotalCharges'])
df = df.drop(columns=['TotalCharges', 'MonthlyCharges'])

# shuffle=False splits by row order: the first 75% of rows train the model and
# the last 25% are held out, so the split is deterministic without a seed.
train_x, test_x, train_y, test_y = train_test_split(df, labels, test_size=0.25, shuffle=False)

print(train_x.shape)
print(test_x.shape)

# has_constant='add' always appends the intercept column. The default ('skip')
# silently omits it when a feature happens to be constant within one split,
# which would leave the train and test design matrices with different widths.
train_xC = sm.add_constant(train_x, has_constant='add')
test_xC = sm.add_constant(test_x, has_constant='add')

# Ordinary least squares on all encoded features plus the intercept.
reg1 = sm.OLS(train_y, train_xC)
results = reg1.fit()

print(results.summary())

# Evaluate on the held-out rows.
testpred = results.predict(test_xC)

print ("OLS Test MSE: ", metrics.mean_squared_error(test_y, testpred))

(5282, 44)
(1761, 44)
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.906
Model:                            OLS   Adj. R-squared:                  0.905
Method:                 Least Squares   F-statistic:                     2295.
Date:                Wed, 28 Oct 2020   Prob (F-statistic):               0.00
Time:                        11:03:39   Log-Likelihood:                -42073.
No. Observations:                5282   AIC:                         8.419e+04
Df Residuals:                    5259   BIC:                         8.434e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# NOTE: this cell rebinds train_x / test_x to the scaled, PCA-projected arrays
# (with an intercept column). It is therefore not re-runnable on its own:
# re-run the train/test split cell first so these names hold the raw features.

# Number of principal components kept for the regression.
# Alternative: PCA(.80) keeps as many components as needed to explain 80% of
# the variance instead of a fixed count.
N_PCA_COMPONENTS = 25

# Standardise the features; the scaler is fitted on the training set only and
# then applied unchanged to the test set.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

# Fit the projection on the training set only, then apply it to both splits.
pca = PCA(n_components=N_PCA_COMPONENTS)
pca.fit(train_x)

train_x = pca.transform(train_x)
test_x = pca.transform(test_x)

# Add an intercept (beta_0). has_constant='add' guarantees the column is
# appended to both splits; the default ('skip') can omit it for one of them
# and leave the design matrices with mismatched widths.
train_x = sm.add_constant(train_x, has_constant='add')
test_x = sm.add_constant(test_x, has_constant='add')

lm4 = sm.OLS(train_y, train_x).fit()

print(lm4.summary())

y_pred4 = lm4.predict(test_x)

# Held-out error metrics, labeled like the baseline OLS cell's output.
mse4 = metrics.mean_squared_error(test_y, y_pred4)
print("PCA-OLS Test MAE: ", metrics.mean_absolute_error(test_y, y_pred4))
print("PCA-OLS Test MSE: ", mse4)
print("PCA-OLS Test RMSE: ", np.sqrt(mse4))


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.906
Model:                            OLS   Adj. R-squared:                  0.905
Method:                 Least Squares   F-statistic:                     2295.
Date:                Wed, 28 Oct 2020   Prob (F-statistic):               0.00
Time:                        11:03:39   Log-Likelihood:                -42073.
No. Observations:                5282   AIC:                         8.419e+04
Df Residuals:                    5259   BIC:                         8.434e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       2280.7611      9.607    237.401      0.0

In [25]:
# Print the fitted PCA loadings matrix: one row per principal component, one
# weight per input feature (NumPy elides the middle of large arrays with "...").
print(pca.components_)

[[-7.59070517e-02  1.35092048e-02 -9.81206267e-02 ...  9.90437175e-03
  -1.16457717e-01  1.11559986e-01]
 [-3.12261739e-02  3.36593421e-01 -1.69327604e-01 ...  1.20646819e-01
  -1.26679345e-01 -8.48430124e-02]
 [-1.03183827e-01 -6.37757956e-02 -8.71342354e-02 ...  1.42154133e-02
  -1.20821477e-01  1.23810974e-01]
 ...
 [ 6.22888165e-16  2.75420993e-16  1.91189060e-16 ... -8.11446183e-03
  -9.27167277e-03 -8.22338790e-03]
 [ 1.56905114e-16  8.98823645e-17 -1.96306987e-17 ...  3.71620565e-02
   4.24617719e-02  3.76609087e-02]
 [ 6.31199742e-17  9.75480762e-17 -1.26473866e-16 ...  2.85987628e-02
   3.26772589e-02  2.89826639e-02]]


In [26]:
# Display the first row of the loadings matrix, i.e. the weight each input
# feature receives in the first principal component.
pca.components_[0]

array([-0.07590705,  0.0135092 , -0.09812063, -0.00060987,  0.00060987,
       -0.02201186,  0.02201186, -0.06738636,  0.06738636, -0.06064077,
        0.06064077,  0.1070503 , -0.06064077, -0.07253896, -0.09466237,
       -0.15874299,  0.2993286 , -0.18128845,  0.2993286 , -0.07305524,
       -0.15220811,  0.2993286 , -0.10137178, -0.15373597,  0.2993286 ,
       -0.09975452, -0.18125866,  0.2993286 , -0.07308817, -0.13136484,
        0.2993286 , -0.12212628, -0.12952167,  0.2993286 , -0.12397503,
       -0.10125172,  0.03179714,  0.08787928,  0.12716037, -0.12716037,
        0.01010711,  0.00990437, -0.11645772,  0.11155999])