In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import warnings

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_curve, auc, confusion_matrix

from regression_module import *

# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides pandas / scikit-learn deprecation and
# chained-assignment warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the cell output
%matplotlib inline
# Load IPython's autoreload extension; mode 2 is presumably used so that edits to
# regression_module are picked up without restarting the kernel — TODO confirm
%load_ext autoreload
%autoreload 2

# 4) Logistic Regression

In [None]:
# Read the raw data from a CSV file (path is relative to the notebook's working directory)
df = pd.read_csv('data/reg_data.csv')

In [None]:
# Summarize the raw frame: column names, non-null counts and dtypes
df.info()

In [None]:
# Number of missing values in each column of the raw frame
df.isna().sum()

In [None]:
# Remove unnecessary columns from the beginning of the data ('Unnamed: 0', 'customerid')
# by keeping every row and all columns from position 2 onward
df2 = df.iloc[:,2:]
# Preview the first two rows of the trimmed frame
df2.head(2)

In [None]:
# Replace churn string values with numeric binary values ("Yes" -> 1, "No" -> 0).
# The previous chained call `df2.churn.replace(..., inplace = True)` acted on a column
# pulled out of a frame that is itself a slice of df; under pandas Copy-on-Write that
# never updates df2, and the notebook-wide warnings filter hides the warning about it.
# Building a new frame with an explicit assignment works on every pandas version and
# keeps this cell safe to re-run.
# astype(int) fails loudly if any label other than "Yes"/"No" is left unencoded.
df2 = df2.assign(churn = df2['churn'].replace({"Yes":1, "No":0}).astype(int))
# Utilize pandas dummy variable function to create dummy variable series for categorical data
dummy_df = pd.get_dummies(df2)
# Persist the encoded frame (to_csv also writes the index as the first column)
dummy_df.to_csv('data/final_df.csv')
dummy_df.info()

# 4.1) Visualizing Correlations

In [None]:
# Correlation of every column with the target feature, churn, sorted in descending order
churn_corr = dummy_df.corr()['churn'].sort_values(ascending = False)
# `kind` must be passed by keyword: Series.plot() raises a TypeError for positional
# arguments in pandas >= 1.0
churn_corr.plot(kind = 'bar', figsize = (20, 10), color = 'Navy')
plt.title('Feature Correlation w/ Churn', fontsize = 30, fontweight = 'bold')
plt.xticks(fontsize = 15, fontweight = 'bold')
plt.yticks(fontweight = 'bold', fontsize = 12)
# Tighten the layout BEFORE saving so the exported image matches the displayed figure
# (long tick labels are otherwise at risk of being clipped in the PNG)
plt.tight_layout()
plt.savefig('pics/Regression_pics/corr.png')


In [None]:
# Heatmap of the pairwise correlations between all columns of the encoded frame
feature_corr = dummy_df.corr()
plt.figure(figsize = (30, 15))
x = sns.heatmap(feature_corr, cmap = 'YlGnBu')

## From our heatmap and correlation bar plot, we see that a monthly contract and a lack of online security or tech support have the strongest *positive* correlations with churn. Two-year contracts and not having internet service are the most negatively correlated with churn.

# 4.2) Pre-Processing Data for Logistic Regression

In [None]:
# Feature frame: every column of the encoded data except the target
X = dummy_df.drop(columns = 'churn')
# Target: churn, extracted as a NumPy array
y = dummy_df['churn'].values
# Remember the feature names so they can be restored after min-max scaling
cols = X.columns

# 4.3) Scaling our Data

In [None]:
# Min-max scaler with scikit-learn's default settings
mm = MinMaxScaler()
# NOTE(review): the scaler is fitted on the full feature frame before the train/test
# split, so test rows influence the scaling — confirm this is acceptable.
# Scale every feature and rebuild a DataFrame that carries the original column names
X = pd.DataFrame(mm.fit_transform(X), columns = cols)

# 4.4) Train Test Split

In [None]:
# Hold out 25% of the rows as the test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = 33,
)

# 4.5) Building the Model

In [None]:
# Logistic regression without an intercept term, using the 'liblinear' solver.
# C is set to an arbitrarily large number (1e12).
logreg = LogisticRegression(
    fit_intercept = False,
    C = 1e12,
    solver = 'liblinear',
)
# Train on the training split; the fitted estimator is the cell's displayed output
logreg.fit(X_train, y_train)

In [None]:
# Predicted class labels for the training split and the test split, in that order
y_hat_train, y_hat_test = (logreg.predict(features) for features in (X_train, X_test))

# 4.6) Evaluating Model Performance

## How many times was the classifier correct on the training set?

In [None]:
# Absolute difference between actual and predicted training labels
# (with 0/1 labels: 0 = correct prediction, 1 = incorrect prediction)
residuals = np.abs(y_train - y_hat_train)
train_residuals = pd.Series(residuals)
# Raw counts of each residual value
print(train_residuals.value_counts())
print('----------------------------------')
# The same counts expressed as proportions of the training set
print(train_residuals.value_counts(normalize = True))

## **Train Set Results:**
* 4270 Correct (1012 Incorrect)
* 80.8% Accuracy

## How many times was the classifier correct on the test set?


In [None]:
# Absolute difference between actual and predicted test labels
# (with 0/1 labels: 0 = correct prediction, 1 = incorrect prediction)
residuals = np.abs(y_test - y_hat_test)
test_residuals = pd.Series(residuals)
# Raw counts of each residual value
print(test_residuals.value_counts())
print('---------------------------------')
# The same counts expressed as proportions of the test set
print(test_residuals.value_counts(normalize = True))

## **Test Set Results:**
* 1409 Correct (352 Incorrect)
* 80.01% Accuracy

## Confusion Matrix

In [None]:
# Confusion matrix (sklearn.metrics.confusion_matrix) of actual vs. predicted test labels
cnf_matrix = confusion_matrix(y_true = y_test, y_pred = y_hat_test)
print('Confusion Matrix: \n', cnf_matrix)

In [None]:
# Print 4 main logistic model metrics for training and test sets (Precision, Recall, Accuracy, F1).
# print_metrics is not defined in this notebook; presumably it comes from the
# `from regression_module import *` star import — verify against that module.
print_metrics(y_train, y_hat_train, y_test, y_hat_test)

In [None]:
# Plot the 4 main logistic model metrics as scatter plots across multiple test-size values
# to visualize the effect of the train/test split size on model performance.
# NOTE(review): print_metric_comparisons is not defined in this notebook (presumably from
# regression_module); it receives the full scaled X and y, so it presumably performs its
# own train/test splits and model fits — confirm in that module.
print_metric_comparisons(X, y)

## Plot ROC Curves
We will now calculate probability scores for the train and test sets, then use the resulting false positive and true positive rates to plot an ROC (receiver operating characteristic) curve for each set.

In [None]:
# Plot ROC curves for the fitted model on the train and test splits.
# NOTE(review): plot_auc is not defined in this notebook (presumably from regression_module);
# the name suggests it also reports AUC — confirm in that module.
plot_auc(logreg, X_train, X_test, y_train, y_test)