In [None]:
# EXERCISE 2
# Using telco_churn database, predict total charges

# a. Using the telco_churn database, extract a table for
#    each customer with a 2-year contract and include the
#    following information customer id, tenure, monthly
#    charges, and total charges.

# b. Export the table to a csv

# c. Create a jupyter notebook titled regression

# d. Prepare the Environment

# Wrangling
import pandas as pd

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Modeling
import statsmodels.api as sm

from scipy.stats import pearsonr

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error

In [None]:
# e. Read the information from the customers table into a dataframe.
# `path` is a home-relative directory (pandas expands the leading '~');
# presumably it holds the CSV exported in step b -- adjust per machine.
path = '~/Documents/_datasci/Github/regression-exercises/'
csv_file = path + "telco_churn.csv"
df = pd.read_csv(csv_file)

In [None]:
# Summarize the loaded frame: column names, dtypes, and non-null counts.
df.info()

In [None]:
# f. How many rows are in your dataframe? 1695
#    (presumably read off an earlier df.info() run -- re-verify if the CSV changes)

# g. What is the data type of the field containing the customer ids? object (string)

# h. Print to the notebook the last 10 rows of your dataframe.
df.tail(10)

In [None]:
# Parse the raw total_charges column into numbers. errors='coerce' turns any
# value that cannot be parsed into NaN instead of raising.
numeric_total = pd.to_numeric(df['total_charges'], errors='coerce')
df['num_total'] = numeric_total

# Re-inspect dtypes / non-null counts now that num_total exists.
df.info()

In [None]:
# i. What is the interquartile range (Q3 - Q1) of tenure?
#    monthly charges? total charges?

# Pull both quartiles in one quantile() call per column, then subtract.
tenure_q1, tenure_q3 = df.tenure.quantile([.25, .75])
iqr_tenure = tenure_q3 - tenure_q1

monthly_q1, monthly_q3 = df.monthly_charges.quantile([.25, .75])
iqr_monthly_charges = monthly_q3 - monthly_q1

# Total charges uses the numeric num_total column, not the raw text column.
total_q1, total_q3 = df.num_total.quantile([.25, .75])
iqr_total_charges = total_q3 - total_q1

In [None]:
# Print the three interquartile ranges, one per line, in the order
# tenure, monthly charges, total charges.
for iqr_value in (iqr_tenure, iqr_monthly_charges, iqr_total_charges):
    print(iqr_value)

In [None]:
# j. How many missing values are in each variable?
# isna() flags NaN/None per cell; summing the booleans gives a count per column.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# k. Fill any missing values with 0.
# num_total was built with pd.to_numeric(..., errors='coerce'), so its missing
# entries are NaN -- there is no '' left to replace. fillna() targets NaN
# directly, and assigning the result back avoids relying on an in-place
# mutation through attribute access.
df['num_total'] = df['num_total'].fillna(0)

In [None]:
# TODO: fix exercise 2k (the fill step in the cell above) -- after it runs,
# num_total should report 0 missing values in the counts printed here.
print(df.isnull().sum())

In [None]:
# l. Show the distribution of monthly charges through a histogram
plt.hist(df.monthly_charges)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.title('Distribution of monthly charges')
plt.xlabel('monthly_charges')
plt.ylabel('count')
# plt.show() renders the figure and keeps the raw hist() return value
# (bin counts / edges / patches) out of the cell output.
plt.show()

In [None]:
# m. Create a new dataframe that contains tenure, monthly
#    charges, and total charges, but not the customer id.

new_df = df.drop(columns=['customer_id'])
# NOTE(review): new_df still carries the raw text `total_charges` column next
# to the numeric `num_total` -- drop one of them if exactly three variables
# are wanted.

# Display the frame that was just built (not the original `df`), and only a
# preview of it rather than the whole table.
new_df.head()

In [None]:
# n. Create a box plot of each variable in your new dataframe.
#    This should be a single chart with 3 categorical plots,
#    1 each for tenure, monthly charges, total charges.

plt.figure(figsize=(8,4))
# Plot the new dataframe (customer_id already removed), as the exercise asks,
# rather than the original `df`. The trailing ';' hides the Axes repr.
sns.boxplot(data=new_df);

In [None]:
# o. Split your data into a test and train dataset.
#    Set the random_state to 123. You should end up with 4
#    dataframes: X_train, X_test, y_train, y_test.

# test_train_split
# Features: everything except the id and both forms of the target.
# `total_charges` is the raw text version of `num_total`; leaving it in X
# would leak the target into the features and would put a non-numeric column
# into X / train, which the correlation and modeling cells cannot use.
X = df.drop(['num_total', 'total_charges', 'customer_id'], axis=1)
# Keep y as a one-column DataFrame (not a Series) so it concatenates cleanly.
y = df[['num_total']]

# 80/20 split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.80, random_state=123)

# Re-attach the target so train/test can be explored as single frames.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# Sanity checks: features and target must have matching row counts per split,
# and both splits must expose the same number of columns.
if X_train.shape[0] == y_train.shape[0]:
    print("X & y train rows ARE equal")
else:
    print("X & y train rows ARE NOT equal")


if X_test.shape[0] == y_test.shape[0]:
    print("X & y test rows ARE equal")
else:
    print("X & y test rows ARE NOT equal")

if train.shape[1] == test.shape[1]:
    print("Number of columns in train & test ARE equal")
else:
    print("Number of columns in train & test ARE NOT equal")

# Report the realized split proportions (expected to be close to 0.80 / 0.20).
total_rows = train.shape[0] + test.shape[0]
train_split = train.shape[0] / total_rows
test_split = test.shape[0] / total_rows

print("Train Split: %.2f" % train_split)
print("Test Split: %.2f" % test_split)

In [None]:
# EXERCISE 3
# Using the in-sample data (X_train and y_train)

In [None]:
# a. Create a scatterplot for each combination of variables.
# Histograms go on the diagonal and pairwise scatterplots everywhere else.
# The map_* methods return the grid itself, so the calls can be chained.
g = (
    sns.PairGrid(train)
    .map_diag(plt.hist)
    .map_offdiag(plt.scatter)
);

In [None]:
# b. Create a heatmap containing the 3 variables.
plt.figure(figsize=(8,4))
# Correlate numeric columns only: DataFrame.corr() raises on string columns
# in pandas >= 2.0, and select_dtypes() works on every pandas version.
numeric_corr = train.select_dtypes(include='number').corr()
# The trailing ';' hides the Axes repr.
sns.heatmap(numeric_corr, cmap='Blues', annot=True);

In [None]:
# c. Compute pearson's correlation coefficient and print its value in the
#    sentence "Pearson's R is ____ with a significance p-value of ____"
columns = ['tenure', 'monthly_charges']
for col in columns:
    # pearsonr cannot handle NaN, so score only rows where both values exist.
    pair = train[[col, 'num_total']].dropna()
    r, p = stats.pearsonr(pair[col], pair['num_total'])

    with sns.axes_style('white'):
        # x / y are passed by keyword: seaborn >= 0.12 rejects them positionally.
        j = sns.jointplot(x=col, y="num_total", data=train, kind='reg', height=5)
        # JointGrid.annotate() was removed in seaborn 0.11, so write the
        # statistic onto the joint axes directly.
        j.ax_joint.annotate(
            "pearsonr = %.2g; p = %.2g" % (r, p),
            xy=(0.05, 0.92),
            xycoords='axes fraction',
        )
        plt.show()

In [None]:
# Report values computed from the training data instead of hand-typed numbers,
# so the sentence stays correct when the data or the split changes.
for col in ['tenure', 'monthly_charges']:
    # pearsonr cannot handle NaN, so score only rows where both values exist.
    pair = train[[col, 'num_total']].dropna()
    r, p = pearsonr(pair[col], pair['num_total'])
    print("Pearson's R is %.2f with a significance p-value of %.3g (%s vs num_total)" % (r, p, col))

In [None]:
# d. Train (aka fit) a linear regression model,
#    modeling total charges as a linear function of tenure.

# Build the estimator and show its configuration before fitting.
lm1 = LinearRegression()
print(lm1)

# fit() returns the estimator itself, so printing the return value shows the
# same object after training on the single `tenure` feature.
print(lm1.fit(X_train[['tenure']], y_train))

# Keep the learned parameters under their own names for the write-up below.
lm1_y_intercept, lm1_coefficients = lm1.intercept_, lm1.coef_
print(lm1_y_intercept)
print(lm1_coefficients)

In [None]:
# e. What is the y-intercept of the regression line?
#    Translate the intercept in plain english,
#    i.e. what it means in the context of the data.
# f. What is the slope of the regression line? Translate
#    the slope in plain english, i.e. what it means in
#    the context of the data.
# g. Write the linear function in the form of y = mx + b
#    using the parameters that were estimated from the
#    algorithm and the variable names for y and x specific
#    to your data.

# lm1 was fit on `tenure`, so the equation must name tenure as x.
# Reading: b is the predicted total charges at tenure 0; m is the predicted
# change in total charges for each additional unit of tenure.
# intercept_ / coef_ are arrays when y is a one-column DataFrame, so pull out
# the single scalar explicitly instead of relying on implicit array-to-float
# conversion inside '%.2f' (deprecated in recent NumPy).
intercept_value = lm1_y_intercept.ravel()[0]
slope_value = lm1_coefficients.ravel()[0]

print('Univariate - total_charges = b + m * tenure')
print('    y-intercept (b): %.2f' % intercept_value)
print('    coefficient (m): %.2f' % slope_value)