# Telco Project

## Project planning
- acquire Telco data via acquire.py file
- clean and prepare data via prepare.py file 
- define the hypothesis and what counts as a true positive (TP), true negative (TN), false positive (FP), and false negative (FN)
- establish baseline accuracy
- train classification models 
- choose the best performing model
- document conclusions

In [14]:
#ignore warnings
import warnings
warnings.filterwarnings("ignore")


#calculations
import numpy as np
import pandas as pd 
from scipy import stats

#dataset
from pydataset import data

#sklearn stuff 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

#graphing
import matplotlib.pyplot as plt
import seaborn as sns

#acquire and clean data
import acquire 
import prepare

### step 1: acquire the data 

In [2]:
# Load the Telco data through the project's acquire module.
df = acquire.get_telco_data()
# Preview the first rows to confirm the load worked.
df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


### - data acquired successfully

### step 2: prep the data

In [4]:
# Run the project's prepare step and rebind df to the prepared frame.
df = prepare.prep_telco_data(df)

In [5]:
# Transposed view: each column is shown as a row, which makes the column list easier to scan.
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7033,7034,7035,7036,7037,7038,7039,7040,7041,7042
gender,Female,Male,Male,Male,Female,Female,Female,Male,Female,Female,...,Male,Male,Female,Female,Female,Female,Male,Male,Male,Male
senior_citizen,0,0,0,1,1,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0
partner,Yes,No,No,Yes,Yes,No,Yes,Yes,No,Yes,...,No,Yes,No,No,No,No,Yes,No,Yes,Yes
dependents,Yes,No,No,No,No,Yes,No,No,No,Yes,...,No,Yes,No,No,No,No,No,No,Yes,Yes
tenure,9,9,4,13,3,9,71,63,7,65,...,1,47,7,1,4,13,22,2,67,63
phone_service,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No
multiple_lines,No,Yes,No,No,No,No,No,Yes,No,Yes,...,No,No,No,No,No,No,Yes,No,No,No phone service
online_security,No,No,No,No,No,No,Yes,Yes,Yes,Yes,...,No internet service,No,No,No,No internet service,Yes,No,No,Yes,Yes
online_backup,Yes,No,No,Yes,No,No,Yes,No,No,Yes,...,No internet service,Yes,Yes,No,No internet service,No,No,Yes,No,Yes
device_protection,No,No,Yes,Yes,No,No,Yes,No,No,Yes,...,No internet service,No,No,No,No internet service,No,No,No,Yes,Yes


In [12]:
# Count missing values per column.
df.isna().sum()

gender                                   0
senior_citizen                           0
partner                                  0
dependents                               0
tenure                                   0
phone_service                            0
multiple_lines                           0
online_security                          0
online_backup                            0
device_protection                        0
tech_support                             0
streaming_tv                             0
streaming_movies                         0
paperless_billing                        0
monthly_charges                          0
total_charges                            0
churn                                    0
contract_type                            0
internet_service_type                    0
payment_type                             0
gender_Male                              0
partner_Yes                              0
dependents_Yes                           0
phone_servi

### - data prepared, no nulls reported

### step 3: split the data

In [18]:
def train_validate_test_split(df, target, seed=123, *,
                              test_size=0.2, validate_size=0.3):
    """Split ``df`` into train, validate and test sets, stratified on ``target``.

    The split is done in two passes. First ``test_size`` of all rows is held
    out as the test set; then ``validate_size`` of the *remaining* rows is
    held out as the validate set. With the defaults (0.2 and 0.3) this gives
    roughly 56% / 24% / 20% train / validate / test.

    Parameters
    ----------
    df : DataFrame
        The data to split.
    target : str
        Name of the column in ``df`` to stratify on.
    seed : int, default 123
        Passed as ``random_state`` to both splits so the result is
        reproducible.
    test_size : float, default 0.2
        Fraction of ``df`` held out as the test set.
    validate_size : float, default 0.3
        Fraction of the non-test rows held out as the validate set.

    Returns
    -------
    tuple
        ``(train, validate, test)``.
    """
    # Pass 1: carve the test set off the full data.
    train_validate, test = train_test_split(
        df,
        test_size=test_size,
        random_state=seed,
        stratify=df[target],
    )
    # Pass 2: split what is left; stratify on the remainder's own target
    # column so both resulting sets keep the same target proportion.
    train, validate = train_test_split(
        train_validate,
        test_size=validate_size,
        random_state=seed,
        stratify=train_validate[target],
    )
    return train, validate, test


In [23]:

# Split the prepared data, stratifying on the churn_Yes column.
train, validate, test = train_validate_test_split(df, target='churn_Yes')
# (rows, columns) of each split.
train.shape, validate.shape, test.shape


((3937, 45), (1688, 45), (1407, 45))

In [27]:
# Stratifying on the target keeps the churn proportion (mean of churn_Yes)
# approximately the same in every split; the three values below should match closely.
train.churn_Yes.mean(), validate.churn_Yes.mean(), test.churn_Yes.mean()


(0.2656845313690627, 0.2659952606635071, 0.2658137882018479)

### get the percentage of customers that churned in the train split


In [34]:
# Scale to a percentage first, then round. Rounding the fraction and
# multiplying by 100 afterwards can reintroduce float error in the printed
# value (e.g. 0.57 * 100 -> 56.99999999999999).
churn_rate = round(train.churn_Yes.mean() * 100, 2)
print('the churn rate of the train data was' , churn_rate, '%')

the churn rate of the train data was 26.57 %
