# Classification Project: What Causes Telco Churn?

## --- Pipeline phase 1: Planning: ---

 ### In this notebook we will explore where drivers for churn or attrition might be based on a dataset snapshot of customers for a telecommunications service.  We will investigate differences between groups of customers and what might make one group or class different from another, and if that has any bearing on their propensity to leave the company.
 
 ### Please reference data_dictionary.py for explicit details on features.

In [8]:
# setting up our environment: 

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
import graphviz
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier


from acquire import get_telco_data
from telco_prep import *
from matplotlib import cm
from matplotlib.ticker import FormatStrFormatter


from scipy.stats import ttest_ind as ttest
from scipy.stats import pearsonr

## --- Pipeline phase 2: Acquisition: ---

In [9]:
# call our function to pull our dataframe using mySQL:
df = get_telco_data()

## --- Pipeline Phase 3: Preparation: ---

In [10]:
# call function to prepare dataframe based on parameters outlined
# in prepare.py and curriculum instructions
df = prep_telco_data(df)

In [11]:
def drop_totals(df):
    df = df[pd.notna(df.total_charges)]
    return df
    


In [12]:
df = drop_totals(df)
train = drop_totals(train)
train.info()

NameError: name 'train' is not defined

## --- Pipeline phase 4: Exploration: ---

In [13]:
# Numeric Scaling: scale the monthly_charges and total_charges data. 
# Make sure that the parameters for scaling are learned from the training data set.

# split the dataframe
X = df.drop(['churn'], axis = 1)
y = df[['churn']]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.70, random_state=123)

#concatinate our X and y together to make a single test and train df
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# scale our monthly and total charges
scaler = MinMaxScaler()
scaler.fit(train[['monthly_charges', 'total_charges']])

train[['monthly_charges', 'total_charges']] = scaler.transform(train[['monthly_charges', 'total_charges']])
test[['monthly_charges', 'total_charges']] = scaler.transform(test[['monthly_charges', 'total_charges']])


In [14]:
X_train = X_train.drop(columns=(['gender_e', 'phone_id', 'streaming_services', 'online_security_backup',
                                  'senior_citizen',]))
X_test = X_test.drop(columns=(['gender_e', 'phone_id', 'streaming_services', 'online_security_backup',
                                  'senior_citizen',]))

In [15]:
logit = LogisticRegression(C=1, class_weight='balanced', random_state=123, solver='newton-cg')

In [16]:
logit.fit(X_train, y_train)

ValueError: could not convert string to float: 'Electronic check'

In [17]:
print('Coefficient1: \n', logit.coef_)
print('Intercept1: \n', logit.intercept_)

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [18]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

print('Accuracy of Logistic Regression classifier  on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

NotFittedError: This LogisticRegression instance is not fitted yet

In [19]:
print(confusion_matrix(y_train, y_pred))

tpp = confusion_matrix(y_train, y_pred)

print(tpp[1][1] / (tpp[1][1] + tpp[1][0]))


NameError: name 'y_pred' is not defined

In [20]:
print(classification_report(y_train, y_pred))

NameError: name 'y_pred' is not defined

In [21]:
print('Accuracy of Logistic Regression classifier on test set: {:.2f}'
     .format(logit.score(X_test, y_test)))

NotFittedError: This LogisticRegression instance is not fitted yet

In [None]:
y_pred_proba = [i[1] for i in y_pred_proba]
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(y_pred_proba, y_pred)