# Two-Way ANOVA
## Used to test how much each factor explains the variation in the dependent variable

    
    Bandwidth_GB_Year ~ OnlineBackup + OnlineSecurity + StreamingMovies + StreamingTV + InternetService + Churn

    Tenure ~ OnlineBackup + DeviceProtection + Churn

    MonthlyCharge ~ OnlineBackup + OnlineSecurity + DeviceProtection + StreamingMovies + StreamingTV + InternetService + TechSupport + Churn + Multiple

    
H0: A_µ = B_µ = C_µ (NO statistically significant difference between group means)

H1: At least one group mean is different
    
    If p-value <= alpha: significant result, reject H0. At least one mean is different.
    If p-value > alpha: not significant result, fail to reject H0. No evidence that the means differ.


## Load dataset

In [3]:
import pandas as pd

# Dataset: churn_clean_altered.csv, fetched through a Google Drive
# download URL assembled from the document id.
docId = "1-WjyGAwXhgkEMSGk1PHKMjIASkVgn-YO"
googleDriveFile = "".join(
    ["https://docs.google.com/uc?id=", docId, "&export=download"]
)

# Read the CSV into a data frame; the first column becomes the row index.
df = pd.read_csv(googleDriveFile, index_col=0)

# Create subsets for categorical and continuous data
## categorical_columns, continuous_columns

In [3]:
# Names of the columns treated as categorical factors.
categorical_columns = [
    "Churn",
    "Generation",
    "State",
    "Area",
    "Marital",
    "Gender",
    "Techie",
    "InternetService",
    "Multiple",
    "OnlineBackup",
    "DeviceProtection",
    "StreamingTV",
    "StreamingMovies",
    "Port_modem",
    "Tablet",
    "OnlineSecurity",
    "TechSupport",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "Item1",
    "Item2",
    "Item3",
]

# Names of the columns treated as continuous measurements.
continuous_columns = [
    "Income",
    "Tenure",
    "Outage_sec_perweek",
    "MonthlyCharge",
    "Bandwidth_GB_Year",
]
#Filter columns
# categorical_df = df.loc[:, categorical_columns]
# continuous_df = df.loc[:, continuous_columns]

# ANOVA sample

In [6]:
# Predictor factors -> target
# StreamingMovies + StreamingTV -> MonthlyCharge
#
# Only the two streaming factors are fitted in this cell. The full candidate
# model for MonthlyCharge is:
# MonthlyCharge ~ OnlineBackup + OnlineSecurity + DeviceProtection + StreamingMovies + StreamingTV + InternetService + TechSupport + Churn + Multiple

import statsmodels.api as sm
from statsmodels.formula.api import ols

formula = "MonthlyCharge ~ StreamingMovies + StreamingTV"

# Fit a least-squares model described by the formula, then derive the
# Type II ANOVA table from the fitted result.
model = ols(formula=formula, data=df).fit()
aov_table = sm.stats.anova_lm(model, typ=2)

print(aov_table)

# # Plot density graph 
# import plotnine as p9
# print(
#     p9.ggplot(df) +
#     p9.aes(x='Bandwidth_GB_Year', fill='Generation') +
#     p9.geom_density(alpha=0.05)
# )

                       sum_sq      df            F  PR(>F)
StreamingMovies  6.933963e+06     1.0  9606.497689     0.0
StreamingTV      4.404334e+06     1.0  6101.882552     0.0
Residual         7.215827e+06  9997.0          NaN     NaN


In [5]:
# Predictor factors -> target
# OnlineBackup + OnlineSecurity + DeviceProtection + StreamingMovies
# + StreamingTV + InternetService + TechSupport + Churn + Multiple
# -> MonthlyCharge

import statsmodels.api as sm
from statsmodels.formula.api import ols

# The literal is split across lines only for readability; the pieces
# concatenate to a single formula string.
formula = (
    "MonthlyCharge ~ OnlineBackup + OnlineSecurity + DeviceProtection"
    " + StreamingMovies + StreamingTV + InternetService"
    " + TechSupport + Churn + Multiple"
)

# Fit a least-squares model described by the formula, then derive the
# Type II ANOVA table from the fitted result.
model = ols(formula=formula, data=df).fit()
aov_table = sm.stats.anova_lm(model, typ=2)

print(aov_table)

                        sum_sq      df             F        PR(>F)
OnlineBackup      1.244328e+06     1.0  16254.138661  0.000000e+00
OnlineSecurity    1.683154e+04     1.0    219.863299  3.226676e-49
DeviceProtection  3.786558e+05     1.0   4946.221575  0.000000e+00
StreamingMovies   6.069675e+06     1.0  79285.607021  0.000000e+00
StreamingTV       4.084022e+06     1.0  53347.854234  0.000000e+00
InternetService   1.731842e+06     2.0  11311.159056  0.000000e+00
TechSupport       3.663906e+05     1.0   4786.005743  0.000000e+00
Churn             8.013346e+03     1.0    104.674964  1.899356e-24
Multiple          2.546524e+06     1.0  33264.170249  0.000000e+00
Residual          7.647035e+05  9989.0           NaN           NaN


In [6]:
# Predictor factors -> target
# Contract + PaperlessBilling + PaymentMethod -> MonthlyCharge

import statsmodels.api as sm
from statsmodels.formula.api import ols

formula = "MonthlyCharge ~ Contract + PaperlessBilling + PaymentMethod"

# Fit a least-squares model described by the formula, then derive the
# Type II ANOVA table from the fitted result.
model = ols(formula=formula, data=df).fit()
aov_table = sm.stats.anova_lm(model, typ=2)

print(aov_table)

                        sum_sq      df         F    PR(>F)
Contract          3.607978e+03     2.0  0.978053  0.376079
PaperlessBilling  3.568786e+01     1.0  0.019349  0.889375
PaymentMethod     3.670194e+03     3.0  0.663279  0.574539
Residual          1.843179e+07  9993.0       NaN       NaN


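# Insights
A factor affects the target when its p-value is <= alpha.

    MonthlyCharge: OnlineBackup, OnlineSecurity, DeviceProtection, StreamingMovies, StreamingTV, InternetService, TechSupport, Churn and Multiple are all significant (p-values close to 0).
    MonthlyCharge: Contract, PaperlessBilling and PaymentMethod are not significant (p-values between 0.37 and 0.89).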

