<a href="https://colab.research.google.com/github/dayanaviana/WGU/blob/main/208-PredictiveAnalysis/1_2_MultipleLinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Data

In [None]:
import pandas as pd

# Load the cleaned churn data set straight from GitHub into a DataFrame.
churn_data = "https://raw.githubusercontent.com/dayanaviana/WGU/main/datasources/churn_clean.csv"
# index_col=0 turns the first CSV column into the row index.
df = pd.read_csv(filepath_or_buffer=churn_data, index_col=0)
# Preview the first five rows.
print(df.head(n=5))

          Customer_id                           Interaction  \
CaseOrder                                                     
1             K409198  aa90260b-4141-4a24-8e36-b04ce1f4f77b   
2             S120509  fb76459f-c047-4a9d-8af9-e0f7d4ac2524   
3             K191035  344d114c-3736-4be5-98f7-c72c281e2d35   
4              D90850  abfa2b40-2d43-4994-b15a-989b8c79e311   
5             K662701  68a861fd-0d20-4e51-a587-8a90407ee574   

                                        UID         City State  \
CaseOrder                                                        
1          e885b299883d4f9fb18e39c75155d990  Point Baker    AK   
2          f2de8bef964785f41a2959829830fb8a  West Branch    MI   
3          f1784cfa9f6d92ae816197eb175d3c71      Yamhill    OR   
4          dc8a365077241bb5cd5ccd305136b05e      Del Mar    CA   
5          aabb64a116e83fdc4befc1fbab1663f9    Needville    TX   

                          County    Zip       Lat        Lng  Population  ...  \
CaseOrder     

# Sample data

In [None]:
import numpy as np

# Draw a reproducible random subset of the full data set; the models below
# are fitted on this sample.
rows = 1000
sample_data = df.sample(random_state=42, n=rows)
print(sample_data.head(n=5))

# DataFrame.sample quick reference (abridged signature):
#   DataFrame.sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None)
#   n            -- int, number of rows to draw; cannot be combined with frac.
#   frac         -- float, fraction of the rows to draw; cannot be combined with n.
#   replace      -- bool, True samples with replacement.
#   weights      -- optional sampling weights; rows are equally likely when omitted.
#   random_state -- int or numpy.random.RandomState; a fixed int returns the same rows on every run.
#   axis         -- 0 or 'index' samples rows, 1 or 'columns' samples columns.

          Customer_id                           Interaction  \
CaseOrder                                                     
6253          S231697  e7902b9a-7c4f-4e9f-a195-af71dd6ed55f   
4685          O841175  ca5b5c0d-4270-44bc-8896-9cc97e19f577   
1732          M980625  157e72d3-6e3c-42b8-bb2a-c9d81ff46d9e   
4743           C36783  4204e941-35f9-4f29-b8dc-1c9dfc67005a   
4522          T157033  da4eb7f1-49db-4a26-86a4-3d44a6f55a58   

                                        UID         City State     County  \
CaseOrder                                                                   
6253       c8956d79d1a0b5b21d8396f9022a2d8e      Atlanta    GA     Fulton   
4685       dca68833d5ee6129eaabaeb9b48f7d23  New Meadows    ID      Adams   
1732       941fdb93ceceeed3a3f2a901c9efa085     Stilwell    KS    Johnson   
4743       e2b008d0c9e7f661122f9e3e32732471       Perris    CA  Riverside   
4522       552c76f0138485928f17eecd6d86aa3c       Hamden    OH     Vinton   

             Zip  

In [None]:
# Show the dtype of every column: `object` columns hold text values,
# int64/float64 columns are numeric.
df.dtypes

Customer_id              object
Interaction              object
UID                      object
City                     object
State                    object
County                   object
Zip                       int64
Lat                     float64
Lng                     float64
Population                int64
Area                     object
TimeZone                 object
Job                      object
Children                  int64
Age                       int64
Income                  float64
Marital                  object
Gender                   object
Churn                    object
Outage_sec_perweek      float64
Email                     int64
Contacts                  int64
Yearly_equip_failure      int64
Techie                   object
Contract                 object
Port_modem               object
Tablet                   object
InternetService          object
Phone                    object
Multiple                 object
OnlineSecurity           object
OnlineBa

# Data Manipulation

In [None]:
# One-hot encode the two categorical columns in a single step.
# drop_first=True omits the first level of each column, so the remaining
# dummy columns are not perfectly collinear.
X = pd.get_dummies(data=df[['Gender', 'Marital']], drop_first=True)
X.head()

# Marital levels in the data (the first one is the dropped reference level):
# Divorced
# Married
# Never Married
# Separated
# Widowed

Unnamed: 0_level_0,Gender_Male,Gender_Nonbinary,Marital_Married,Marital_Never Married,Marital_Separated,Marital_Widowed
CaseOrder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,0,0,0,0,1
2,0,0,1,0,0,0
3,0,0,0,0,0,1
4,1,0,1,0,0,0
5,1,0,0,0,1,0


# Multiple Linear Regression


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols

# Continuous explanatory variables for the additive model.
continuous_columns = [
    "Income",
    # "Tenure",  # left out: Tenure is the response variable below
    "Outage_sec_perweek",
    "MonthlyCharge",
    "Bandwidth_GB_Year",
]
y_target = "Tenure"
# Right-hand side of the formula: "a + b + c + d" (main effects only).
x_predictors = " + ".join(continuous_columns)  # explanatory variables

# Fit an ordinary least squares model: y = b1*x1 + ... + bk*xk
# The trailing "+ 0" removes the global intercept from the formula.
# NOTE(review): for a model without a constant, statsmodels reports the
# *uncentered* R-squared, which is not comparable to R-squared from models
# that include an intercept.
formula = y_target + " ~ " + x_predictors + " + 0"
print("Formula = ", formula)
model = ols(formula, data=sample_data).fit()

print("\n")

print("Model =\n", model.params)

print("\n")

# Coefficient of determination:
# how well the regression fits the observed values (larger is better).
print("R-squared = ", model.rsquared)

# Adjusted coefficient of determination:
# adds a penalty as more predictors are added.
print("R-squared adjusted = ", model.rsquared_adj)

# Mean squared error of the residuals: RSS / residual degrees of freedom.
print("MSE = ", model.mse_resid)

# Residual sum of squares.
# The degrees of freedom are taken from the fitted model
# (model.df_resid = observations - estimated parameters) rather than being
# hard-coded, and the global DataFrame `df` is deliberately not reassigned
# here so the earlier cells can still be re-run.
sum_residuals_sq = (model.resid ** 2).sum()
print("RSS = ", sum_residuals_sq)

# Root mean squared error: square root of RSS / number of observations.
print("RMSE = ", np.sqrt(sum_residuals_sq / model.nobs))

# Residual standard error: square root of RSS / residual degrees of freedom.
# The typical size of the residuals (smaller is better).
print("RSE = ", np.sqrt(model.mse_resid))
# Predictions of the target are typically off by about RSE, in the target's units.


Formula =  Tenure ~ Income + Outage_sec_perweek + MonthlyCharge + Bandwidth_GB_Year + 0


Model =
 Income                0.000004
Outage_sec_perweek    0.005550
MonthlyCharge        -0.037656
Bandwidth_GB_Year     0.012049
dtype: float64


R-squared =  0.9951885200308901
R-squared adjusted =  0.9951691968181627
MSE =  9.269275003222926
RSE =  9.250699301813663
MRSE =  3.0384532089880922
RSE =  3.044548407107847


## 2-way interaction (Pairwise)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols

# Continuous explanatory variables for the pairwise-interaction model.
continuous_columns = [
    "Income",
    # "Tenure",  # left out: Tenure is the response variable below
    "Outage_sec_perweek",
    "MonthlyCharge",
    "Bandwidth_GB_Year",
]
y_target = "Tenure"
# Additive list of terms: "a + b + c + d".
x_predictors = " + ".join(continuous_columns)  # explanatory variables

# Fit an ordinary least squares model.
# "(a + b + c + d)**2" expands to every main effect plus every 2-way
# interaction; the trailing "+ 0" removes the global intercept.
# NOTE(review): for a model without a constant, statsmodels reports the
# *uncentered* R-squared, which is not comparable to R-squared from models
# that include an intercept.
formula = y_target + " ~ (" + x_predictors + ")**2 + 0"
print("Formula = ", formula)
model = ols(formula, data=sample_data).fit()

print("\n")

print("Model =\n", model.params)

print("\n")

# Coefficient of determination:
# how well the regression fits the observed values (larger is better).
print("R-squared = ", model.rsquared)

# Adjusted coefficient of determination:
# adds a penalty as more predictors are added.
print("R-squared adjusted = ", model.rsquared_adj)

# Mean squared error of the residuals: RSS / residual degrees of freedom.
print("MSE = ", model.mse_resid)

# Residual sum of squares.
# The degrees of freedom are taken from the fitted model
# (model.df_resid = observations - estimated parameters) rather than being
# hard-coded, and the global DataFrame `df` is deliberately not reassigned
# here so the earlier cells can still be re-run.
sum_residuals_sq = (model.resid ** 2).sum()
print("RSS = ", sum_residuals_sq)

# Root mean squared error: square root of RSS / number of observations.
print("RMSE = ", np.sqrt(sum_residuals_sq / model.nobs))

# Residual standard error: square root of RSS / residual degrees of freedom.
# The typical size of the residuals (smaller is better).
print("RSE = ", np.sqrt(model.mse_resid))
# Predictions of the target are typically off by about RSE, in the target's units.



Formula =  Tenure ~ (Income + Outage_sec_perweek + MonthlyCharge + Bandwidth_GB_Year)**2 + 0


Model =
 Income                                 -1.863940e-06
Outage_sec_perweek                      5.365319e-02
MonthlyCharge                          -3.210144e-02
Bandwidth_GB_Year                       1.206698e-02
Income:Outage_sec_perweek               3.781498e-07
Income:MonthlyCharge                    3.499814e-09
Income:Bandwidth_GB_Year               -2.272672e-11
Outage_sec_perweek:MonthlyCharge       -7.224177e-04
Outage_sec_perweek:Bandwidth_GB_Year    5.807571e-06
MonthlyCharge:Bandwidth_GB_Year        -5.592399e-07
dtype: float64


R-squared =  0.995218077526422
R-squared adjusted =  0.9951697752792141
MSE =  9.268165060556575
RSE =  9.193871152255513
MRSE =  3.0291060413843227
RSE =  3.0443661180213812


## Full-factorial interaction (main effects through the 4-way term)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols

# Continuous explanatory variables for the full-interaction model.
continuous_columns = [
    "Income",
    # "Tenure",  # left out: Tenure is the response variable below
    "Outage_sec_perweek",
    "MonthlyCharge",
    "Bandwidth_GB_Year",
]
y_target = "Tenure"
# Chaining the terms with "*" ("a * b * c * d") expands to every main effect
# and every interaction among them, up to and including the 4-way term.
x_predictors = " * ".join(continuous_columns)  # explanatory variables

# Fit an ordinary least squares model.
# The trailing "+ 0" removes the global intercept from the formula.
# NOTE(review): for a model without a constant, statsmodels reports the
# *uncentered* R-squared, which is not comparable to R-squared from models
# that include an intercept.
formula = y_target + " ~ " + x_predictors + " + 0"
print("Formula = ", formula)
model = ols(formula, data=sample_data).fit()

print("\n")

print("Model =\n", model.params)

print("\n")

# Coefficient of determination:
# how well the regression fits the observed values (larger is better).
print("R-squared = ", model.rsquared)

# Adjusted coefficient of determination:
# adds a penalty as more predictors are added.
print("R-squared adjusted = ", model.rsquared_adj)

# Mean squared error of the residuals: RSS / residual degrees of freedom.
print("MSE = ", model.mse_resid)

# Residual sum of squares.
# The degrees of freedom are taken from the fitted model
# (model.df_resid = observations - estimated parameters) rather than being
# hard-coded, and the global DataFrame `df` is deliberately not reassigned
# here so the earlier cells can still be re-run.
sum_residuals_sq = (model.resid ** 2).sum()
print("RSS = ", sum_residuals_sq)

# Root mean squared error: square root of RSS / number of observations.
print("RMSE = ", np.sqrt(sum_residuals_sq / model.nobs))

# Residual standard error: square root of RSS / residual degrees of freedom.
# The typical size of the residuals (smaller is better).
print("RSE = ", np.sqrt(model.mse_resid))
# Predictions of the target are typically off by about RSE, in the target's units.


Formula =  Tenure ~ Income * Outage_sec_perweek * MonthlyCharge * Bandwidth_GB_Year + 0


Model =
 Income                                                      -3.276007e-05
Outage_sec_perweek                                           8.495996e-02
Income:Outage_sec_perweek                                    2.856646e-06
MonthlyCharge                                               -3.748113e-02
Income:MonthlyCharge                                         3.251048e-07
Outage_sec_perweek:MonthlyCharge                            -3.451536e-04
Income:Outage_sec_perweek:MonthlyCharge                     -2.919166e-08
Bandwidth_GB_Year                                            1.281728e-02
Income:Bandwidth_GB_Year                                    -5.915347e-09
Outage_sec_perweek:Bandwidth_GB_Year                        -7.666500e-05
Income:Outage_sec_perweek:Bandwidth_GB_Year                  7.929841e-10
MonthlyCharge:Bandwidth_GB_Year                             -3.060970e-06
Income:Monthl

# MonthlyCharge 1st Run

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols

# First pass: every candidate predictor, numeric and categorical.
predictors_columns = [
    # Numeric predictors
    "Income",
    "Tenure",
    "Outage_sec_perweek",
    # "MonthlyCharge",  # left out: MonthlyCharge is the response variable below
    "Bandwidth_GB_Year",
    "Population",
    "Children",
    "Age",
    "Yearly_equip_failure",
    "Email",
    "Contacts",
    # Categorical predictors (dummy-coded by the formula interface)
    "Area", "Marital",
    "Churn", "Gender", "Techie", "InternetService", "Multiple", "OnlineBackup", "DeviceProtection",
    "StreamingTV", "StreamingMovies", "Port_modem", "Tablet", "OnlineSecurity", "TechSupport",
    "Contract", "PaperlessBilling", "PaymentMethod",
]
y_target = "MonthlyCharge"
# Right-hand side of the formula: "a + b + c + ..." (main effects only).
x_predictors = " + ".join(predictors_columns)  # explanatory variables

# Fit an ordinary least squares model.
# The trailing "+ 0" removes the global intercept; the fitted parameters then
# contain one coefficient per level of the first categorical term (Area)
# instead of a single intercept.
formula = y_target + " ~ " + x_predictors + " + 0"
print("Formula = ", formula)
model = ols(formula, data=sample_data).fit()

print("\n")

print("Model =\n", model.params)

print("\n")

# Coefficient of determination:
# how well the regression fits the observed values (larger is better).
print("R-squared = ", model.rsquared)

# Adjusted coefficient of determination:
# adds a penalty as more predictors are added.
print("R-squared adjusted = ", model.rsquared_adj)

# Mean squared error of the residuals: RSS / residual degrees of freedom.
print("MSE = ", model.mse_resid)

# Residual sum of squares.
# The degrees of freedom are taken from the fitted model
# (model.df_resid = observations - estimated parameters) rather than being
# hard-coded, and the global DataFrame `df` is deliberately not reassigned
# here so the earlier cells can still be re-run.
sum_residuals_sq = (model.resid ** 2).sum()
print("RSS = ", sum_residuals_sq)

# Root mean squared error: square root of RSS / number of observations.
print("RMSE = ", np.sqrt(sum_residuals_sq / model.nobs))

# Residual standard error: square root of RSS / residual degrees of freedom.
# The typical size of the residuals (smaller is better).
print("RSE = ", np.sqrt(model.mse_resid))
# Predictions of the target are typically off by about RSE, in the target's units.


Formula =  MonthlyCharge ~ Income + Tenure + Outage_sec_perweek + Bandwidth_GB_Year + Population + Children + Age + Yearly_equip_failure + Email + Contacts + Area + Marital + Churn + Gender + Techie + InternetService + Multiple + OnlineBackup + DeviceProtection + StreamingTV + StreamingMovies + Port_modem + Tablet + OnlineSecurity + TechSupport + Contract + PaperlessBilling + PaymentMethod + 0


Model =
 Area[Rural]                                 -88.605125
Area[Suburban]                              -88.591964
Area[Urban]                                 -88.374421
Marital[T.Married]                           -0.271989
Marital[T.Never Married]                     -0.362445
Marital[T.Separated]                          0.058168
Marital[T.Widowed]                           -0.490639
Churn[T.Yes]                                  0.311647
Gender[T.Male]                              -20.466172
Gender[T.Nonbinary]                           5.948513
Techie[T.Yes]                             

# MonthlyCharge 2nd Run

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols

# Second pass: a reduced set of predictors.
predictors_columns = [
    "Tenure",
    "Children",
    "Gender", "InternetService", "Multiple", "OnlineBackup", "DeviceProtection",
    "StreamingTV", "StreamingMovies", "OnlineSecurity", "TechSupport",
]
y_target = "MonthlyCharge"
# Right-hand side of the formula: "a + b + c + ..." (main effects only).
x_predictors = " + ".join(predictors_columns)  # explanatory variables

# Fit an ordinary least squares model.
# The trailing "+ 0" removes the global intercept; the fitted parameters then
# contain one coefficient per level of the first categorical term (Gender)
# instead of a single intercept.
formula = y_target + " ~ " + x_predictors + " + 0"
print("Formula = ", formula)
model = ols(formula, data=sample_data).fit()

print("\n")

print("Model =\n", model.params)

print("\n")

# Coefficient of determination:
# how well the regression fits the observed values (larger is better).
print("R-squared = ", model.rsquared)

# Adjusted coefficient of determination:
# adds a penalty as more predictors are added.
print("R-squared adjusted = ", model.rsquared_adj)

# Mean squared error of the residuals: RSS / residual degrees of freedom.
print("MSE = ", model.mse_resid)

# Residual sum of squares.
# The degrees of freedom are taken from the fitted model
# (model.df_resid = observations - estimated parameters) rather than being
# hard-coded, and the global DataFrame `df` is deliberately not reassigned
# here so the earlier cells can still be re-run.
sum_residuals_sq = (model.resid ** 2).sum()
print("RSS = ", sum_residuals_sq)

# Root mean squared error: square root of RSS / number of observations.
print("RMSE = ", np.sqrt(sum_residuals_sq / model.nobs))

# Residual standard error: square root of RSS / residual degrees of freedom.
# The typical size of the residuals (smaller is better).
print("RSE = ", np.sqrt(model.mse_resid))
# Predictions of the target are typically off by about RSE, in the target's units.


Formula =  MonthlyCharge ~ Tenure + Children + Gender + InternetService + Multiple + OnlineBackup + DeviceProtection + StreamingTV + StreamingMovies + OnlineSecurity + TechSupport + 0


Model =
 Gender[Female]                    84.353122
Gender[Male]                      83.754204
Gender[Nonbinary]                 83.311289
InternetService[T.Fiber Optic]    20.403765
InternetService[T.None]          -13.425721
Multiple[T.Yes]                   32.980789
OnlineBackup[T.Yes]               22.950731
DeviceProtection[T.Yes]           12.258646
StreamingTV[T.Yes]                42.190328
StreamingMovies[T.Yes]            51.508631
OnlineSecurity[T.Yes]              2.644356
TechSupport[T.Yes]                12.529013
Tenure                            -0.016030
Children                           0.093858
dtype: float64


R-squared =  0.9571656077638863
R-squared adjusted =  0.9566008541137144
MSE =  77.35775654357768
RSE =  76.42760315828428
MRSE =  8.733541546930873
RSE =  8.79532583498631

# MonthlyCharge 3rd Run

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols

# Third pass: only the service-related categorical predictors.
predictors_columns = [
    "InternetService", "Multiple", "OnlineBackup", "DeviceProtection",
    "StreamingTV", "StreamingMovies", "TechSupport",
]
y_target = "MonthlyCharge"
# Right-hand side of the formula: "a + b + c + ..." (main effects only).
x_predictors = " + ".join(predictors_columns)  # explanatory variables

# Fit an ordinary least squares model.
# The trailing "+ 0" removes the global intercept; the fitted parameters then
# contain one coefficient per level of the first categorical term
# (InternetService) instead of a single intercept.
formula = y_target + " ~ " + x_predictors + " + 0"
print("Formula = ", formula)
model = ols(formula, data=sample_data).fit()

print("\n")

print("Model =\n", model.params)

print("\n")

# Coefficient of determination:
# how well the regression fits the observed values (larger is better).
print("R-squared = ", model.rsquared)

# Adjusted coefficient of determination:
# adds a penalty as more predictors are added.
print("R-squared adjusted = ", model.rsquared_adj)

# Mean squared error of the residuals: RSS / residual degrees of freedom.
print("MSE = ", model.mse_resid)

# Residual sum of squares.
# The degrees of freedom are taken from the fitted model
# (model.df_resid = observations - estimated parameters) rather than being
# hard-coded, and the global DataFrame `df` is deliberately not reassigned
# here so the earlier cells can still be re-run.
sum_residuals_sq = (model.resid ** 2).sum()
print("RSS = ", sum_residuals_sq)

# Root mean squared error: square root of RSS / number of observations.
print("RMSE = ", np.sqrt(sum_residuals_sq / model.nobs))

# Residual standard error: square root of RSS / residual degrees of freedom.
# The typical size of the residuals (smaller is better).
print("RSE = ", np.sqrt(model.mse_resid))
# Predictions of the target are typically off by about RSE, in the target's units.


Formula =  MonthlyCharge ~ InternetService + Multiple + OnlineBackup + DeviceProtection + StreamingTV + StreamingMovies + TechSupport + 0


Model =
 InternetService[DSL]             84.796991
InternetService[Fiber Optic]    105.208760
InternetService[None]            71.284700
Multiple[T.Yes]                  32.841392
OnlineBackup[T.Yes]              22.970352
DeviceProtection[T.Yes]          12.258028
StreamingTV[T.Yes]               42.103257
StreamingMovies[T.Yes]           51.391454
TechSupport[T.Yes]               12.615963
dtype: float64


R-squared =  0.9560822879303803
R-squared adjusted =  0.9557277554414227
MSE =  78.91403036303544
RSE =  78.36052514004817
MRSE =  8.84329147375388
RSE =  8.883356930971278


# MonthlyCharge 4th Run : With Interactions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols

# Fourth pass: the same service-related predictors, now with interactions.
predictors_columns = [
    "InternetService", "Multiple", "OnlineBackup", "DeviceProtection",
    "StreamingTV", "StreamingMovies", "TechSupport",
]
y_target = "MonthlyCharge"
# Chaining the terms with "*" expands to every main effect and every
# interaction among the seven factors, up to and including the 7-way term.
# NOTE(review): that is a large number of parameters for the sampled rows;
# consider "(a + b + ...)**2" if only pairwise interactions are wanted.
x_predictors = " * ".join(predictors_columns)  # explanatory variables

# Fit an ordinary least squares model.
# The trailing "+ 0" removes the global intercept; the fitted parameters then
# contain one coefficient per level of the first categorical term
# (InternetService) instead of a single intercept.
formula = y_target + " ~ " + x_predictors + " + 0"
print("Formula = ", formula)
model = ols(formula, data=sample_data).fit()

print("\n")

print("Model =\n", model.params)

print("\n")

# Coefficient of determination:
# how well the regression fits the observed values (larger is better).
print("R-squared = ", model.rsquared)

# Adjusted coefficient of determination:
# adds a penalty as more predictors are added.
print("R-squared adjusted = ", model.rsquared_adj)

# Mean squared error of the residuals: RSS / residual degrees of freedom.
print("MSE = ", model.mse_resid)

# Residual sum of squares.
# The degrees of freedom are taken from the fitted model
# (model.df_resid = observations - estimated parameters) rather than being
# hard-coded, and the global DataFrame `df` is deliberately not reassigned
# here so the earlier cells can still be re-run.
sum_residuals_sq = (model.resid ** 2).sum()
print("RSS = ", sum_residuals_sq)

# Root mean squared error: square root of RSS / number of observations.
print("RMSE = ", np.sqrt(sum_residuals_sq / model.nobs))

# Residual standard error: square root of RSS / residual degrees of freedom.
# The typical size of the residuals (smaller is better).
print("RSE = ", np.sqrt(model.mse_resid))
# Predictions of the target are typically off by about RSE, in the target's units.


Formula =  MonthlyCharge ~ InternetService * Multiple * OnlineBackup * DeviceProtection * StreamingTV * StreamingMovies * TechSupport + 0


Model =
 InternetService[DSL]                                                                                                                                        93.084562
InternetService[Fiber Optic]                                                                                                                               113.534116
InternetService[None]                                                                                                                                       81.237704
Multiple[T.Yes]                                                                                                                                             32.505537
OnlineBackup[T.Yes]                                                                                                                                         22.810473
                     

# Testing Model

https://colab.research.google.com/drive/1L0HQPCV4s2ceW_cYNQ5VcbH_ujwKKutF?usp=sharing


In [None]:
# Predictions from `model`, i.e. whichever model was fitted most recently.
# NOTE(review): called without arguments this presumably returns in-sample
# predictions for the rows the model was fitted on -- confirm against the
# statsmodels documentation.
model.predict()