In [None]:
# Importing the libraries needed.

# Standard library.
import os
import statistics

# Data science libraries.
import numpy as np
import pandas as pd
from pandas import DataFrame

# Visualization libraries.
import seaborn as sns
import plotnine as p9
import matplotlib.pyplot as plt
%matplotlib inline

# Statistics packages.
import pylab as py
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

# Chisquare package.
from scipy.stats import chisquare
from scipy.stats import chi2_contingency

In [None]:
# Load the data.
# The CSV location can be overridden with the CHURN_DATA_PATH environment
# variable so the notebook is not tied to one machine's directory layout;
# when the variable is unset the original local path is used.
DATA_PATH = os.environ.get(
    'CHURN_DATA_PATH',
    '/Users/jillrivera/Documents/WGU/206 Data Cleaning/Churn Data/churn_clean.csv',
)
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# List the column labels of the raw frame `df`; as the cell's last
# expression, the result is shown as the cell output.
df.columns

In [None]:
# Build the working frame `df2` by removing the columns that are not used
# in this analysis; `df` itself is left unchanged.
df2 = df.drop(
    columns=[
        'Unnamed: 0', 'Customer_id', 'Interaction',
        'City', 'State', 'County', 'Zip', 'Lat', 'Lng',
        'Population', 'Area', 'Timezone',
        'Job', 'Education', 'Employment',
    ]
)
df2.head()

In [None]:
# List the column labels of the working frame `df2`.
df2.columns

In [None]:
# View a correlation heatmap of the variables.
# Only numeric/boolean columns can be correlated. df2 still contains label
# columns (e.g. Churn holds "Yes"/"No"), and DataFrame.corr() raises on such
# columns with pandas >= 2.0, so restrict the frame to numeric dtypes first.
numeric_corr = df2.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(numeric_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12)
# Render the figure explicitly so the Text repr of set_title is not echoed.
plt.show()

In [None]:
# We can see that Tenure, Bandwidth_GB_Year, Timely Response and Timely Fixes are highly correlated.
# Options and Reliability, by contrast, are the most weakly correlated variables.

In [None]:
# Show a Q-Q plot of the highly correlated data, starting with Tenure.
# fit=True standardizes the sample (fitted loc subtracted, divided by fitted
# scale) before forming the quantiles, which is what makes the 45-degree
# reference line a valid comparison against the standard normal. Without it
# the raw values are plotted against N(0, 1) quantiles and the line is
# meaningless.
sm.qqplot(df2['Tenure'], line='45', fit=True)
plt.title('Q-Q plot of Tenure')
plt.show()

In [None]:
# Q-Q plot of Bandwidth_GB_Year against the normal distribution.
# fit=True standardizes the sample so the 45-degree reference line is a
# valid comparison; without it the raw values are plotted against N(0, 1).
sm.qqplot(df2['Bandwidth_GB_Year'], line='45', fit=True)
plt.title('Q-Q plot of Bandwidth_GB_Year')
plt.show()

In [None]:
# Q-Q plot of TimelyResponse against the normal distribution.
# fit=True standardizes the sample so the 45-degree reference line is a
# valid comparison; without it the raw values are plotted against N(0, 1).
sm.qqplot(df2['TimelyResponse'], line='45', fit=True)
plt.title('Q-Q plot of TimelyResponse')
plt.show()

In [None]:
# Q-Q plot of TimelyFixes against the normal distribution.
# fit=True standardizes the sample so the 45-degree reference line is a
# valid comparison; without it the raw values are plotted against N(0, 1).
sm.qqplot(df2['TimelyFixes'], line='45', fit=True)
plt.title('Q-Q plot of TimelyFixes')
plt.show()

In [None]:
# From these Q-Q plots we determine that Timely Response and Timely Fixes follow a normal distribution most closely.

In [None]:
# Gender is handled as a categorical variable elsewhere in this notebook
# (value_counts / crosstab), so a normal Q-Q plot is not a meaningful view of
# it. NOTE(review): presumably the column holds string labels, in which case
# sm.qqplot cannot process it at all -- confirm against the data.
# Show the frequency of each category instead.
gender_ax = df2['Gender'].value_counts().plot(kind='bar', title='Gender distribution')
gender_ax.set(xlabel='Gender', ylabel='Count')
plt.show()

In [None]:
# Churn holds the labels "Yes"/"No" (it is compared against those strings in
# the ANOVA cells), so a normal Q-Q plot is undefined for it: sm.qqplot needs
# numeric data. Show the frequency of each category instead.
churn_ax = df2['Churn'].value_counts().plot(kind='bar', title='Churn distribution')
churn_ax.set(xlabel='Churn', ylabel='Count')
plt.show()

In [None]:
# Density of TimelyResponse, drawn as one semi-transparent filled curve per
# Churn value.
timely_response_density = (
    p9.ggplot(df2)
    + p9.aes(x='TimelyResponse', fill='Churn')
    + p9.geom_density(alpha=0.5)
)
print(timely_response_density)

In [None]:
# Box plot of TimelyFixes (y) for each Churn value (x).
plotChurnVTimelyFixes = (
    p9.ggplot(df2)
    + p9.aes(x='Churn', y='TimelyFixes')
    + p9.geom_boxplot()
)
print(plotChurnVTimelyFixes)

In [None]:
# Box plot of the continuous variable Tenure (y) for each Churn value (x).
plotChurnVTenure = (
    p9.ggplot(df2)
    + p9.aes(x='Churn', y='Tenure')
    + p9.geom_boxplot()
)
print(plotChurnVTenure)

In [None]:
# Density of Bandwidth_GB_Year, drawn as one semi-transparent filled curve
# per Churn value.
bandwidth_density = (
    p9.ggplot(df2)
    + p9.aes(x='Bandwidth_GB_Year', fill='Churn')
    + p9.geom_density(alpha=0.5)
)
print(bandwidth_density)

In [None]:
# Scatter plot of TimelyResponse (x) against TimelyFixes (y), with the points
# colored by Churn.
response_vs_fixes = (
    p9.ggplot(df2)
    + p9.aes(x='TimelyResponse', y='TimelyFixes', color='Churn')
    + p9.geom_point()
)
print(response_vs_fixes)

In [None]:
# Scatter plot of the continuous variables Bandwidth_GB_Year (x) and
# Tenure (y), with the points colored by Churn.
bandwidth_vs_tenure = (
    p9.ggplot(df2)
    + p9.aes(x='Bandwidth_GB_Year', y='Tenure', color='Churn')
    + p9.geom_point()
)
print(bandwidth_vs_tenure)

In [None]:
# One-way ANOVA comparing TimelyResponse between the Churn == "Yes" rows and
# the Churn == "No" rows.
ChurnVTimelyResponse = df2.loc[df2['Churn'] == "Yes", 'TimelyResponse']
NoChurnVTimelyResponse = df2.loc[df2['Churn'] == "No", 'TimelyResponse']

# f_oneway returns the F statistic together with its p-value.
anova = stats.f_oneway(ChurnVTimelyResponse, NoChurnVTimelyResponse)
print(anova)

In [None]:
# One-way ANOVA comparing TimelyFixes between the Churn == "Yes" rows and
# the Churn == "No" rows.
ChurnVTimelyFixes = df2.loc[df2['Churn'] == "Yes", 'TimelyFixes']
NoChurnVTimelyFixes = df2.loc[df2['Churn'] == "No", 'TimelyFixes']

# f_oneway returns the F statistic together with its p-value.
anova2 = stats.f_oneway(ChurnVTimelyFixes, NoChurnVTimelyFixes)
print(anova2)

In [None]:
# One-way chi-squared test on the Churn category counts. No expected
# frequencies are passed, so scipy compares the observed counts against
# equal frequencies in every category.
ChurnRatio = df2['Churn'].value_counts()
chi = stats.chisquare(ChurnRatio)
print(chi)

# Compare the p-value against the 5% significance level.
alpha = 0.05
print(
    "Difference between yes and no on churn is statistically significant"
    if chi.pvalue < alpha
    else "No significant difference between churn response yes and no"
)

In [None]:
# One-way chi-squared test on the TimelyResponse value counts. No expected
# frequencies are passed, so scipy compares the observed counts against
# equal frequencies in every category.
ResponseRatio = df2['TimelyResponse'].value_counts()
chi2 = stats.chisquare(ResponseRatio)
print(chi2)

# Compare the p-value against the 5% significance level.
alpha = 0.05
print(
    "Difference between timely responses is statistically significant"
    if chi2.pvalue < alpha
    else "No significant difference between timely responses"
)

In [None]:
# One-way chi-squared test on the TimelyFixes value counts. No expected
# frequencies are passed, so scipy compares the observed counts against
# equal frequencies in every category.
FixRatio = df2['TimelyFixes'].value_counts()
chi3 = stats.chisquare(FixRatio)
print(chi3)

# Compare the p-value against the 5% significance level.
alpha = 0.05
print(
    "Difference between timely fixes is statistically significant"
    if chi3.pvalue < alpha
    else "No significant difference between timely fixes"
)

In [None]:
# Tenure chi-squared goodness-of-fit test (observed bin counts vs. equal
# frequency in every bin).
# Tenure is treated as a continuous variable in this notebook, so counting
# each distinct value would create one category per value and generally
# leaves the categories too sparse for the chi-squared statistic to be
# meaningful. Group the values into equal-width intervals first.
TENURE_BINS = 10  # number of equal-width tenure intervals; adjust as needed
TenureRatio = pd.cut(df2['Tenure'], bins=TENURE_BINS).value_counts(sort=False)
chi4 = stats.chisquare(TenureRatio)
print(chi4)

# Test Significance
alpha = 0.05
if chi4.pvalue < alpha:
    print("Difference between tenure is statistically significant")
else:
    print("No significant difference between tenure")

In [None]:
# One-way chi-squared test on the Gender category counts. No expected
# frequencies are passed, so scipy compares the observed counts against
# equal frequencies in every category.
GenderRatio = df2['Gender'].value_counts()
chi5 = stats.chisquare(GenderRatio)
print(chi5)

# Compare the p-value against the 5% significance level.
alpha = 0.05
print(
    "Difference between gender is statistically significant"
    if chi5.pvalue < alpha
    else "No significant difference between gender"
)

In [None]:
# Chi-squared test of independence between Churn and Gender.

# Contingency table: Churn values as rows, Gender values as columns.
table = pd.crosstab(index=df2['Churn'], columns=df2['Gender'])
print(table)

# Last expression of the cell, so the test result is shown as the cell output.
chi2_contingency(table)