In [4]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import graphviz
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")
import prep

In [5]:
# Load the raw customer/churn CSV and pass it straight through the project's
# prep step; `df` is the prepared frame every later cell works from.
df = prep.prep_telco(pd.read_csv('telco_churn_customers.csv'))

In [6]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

Unnamed: 0,customer_id,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,online_backup,...,total_charges,churn,tenure_year,household_type_id,streaming_services,phone_id,gender,device_protection,tech_support,paperless_billing
0,0002-ORFBO,0,Yes,Yes,9,Yes,No,1,No,Yes,...,593.3,0,0.75,3,3,1,0,0,2,1
1,0003-MKNFE,0,No,No,9,Yes,Yes,1,No,No,...,542.4,0,0.75,0,0,2,1,0,0,0
2,0004-TLHLJ,0,No,No,4,Yes,No,2,No,No,...,280.85,1,0.33,0,0,1,1,2,0,1
3,0011-IGKFF,1,Yes,No,13,Yes,No,2,No,Yes,...,1237.85,1,1.08,2,2,1,1,2,0,1
4,0013-EXCHZ,1,Yes,No,3,Yes,No,2,No,No,...,267.4,1,0.25,2,2,1,0,0,2,1


Write a function, peekatdata(dataframe), that takes a dataframe as input and computes and returns the following:

creates dataframe object head_df (df of the first 5 rows) and prints contents to screen

creates dataframe object tail_df (df of the last 5 rows) and prints contents to screen

creates tuple object shape_tuple (tuple of (nrows, ncols)) and prints tuple to screen

creates dataframe object describe_df (summary statistics of all numeric variables) and prints contents to screen.

prints to screen information about the DataFrame, including the index dtype, column dtypes, non-null counts, and memory usage.

In [7]:
def peekatdata(dataframe):
    """Print a quick structural overview of ``dataframe``.

    The following sections are written to stdout, separated by dashed rules:
    the first and last five rows, the ``(nrows, ncols)`` shape, summary
    statistics from ``describe()``, the index, the column dtypes, the
    per-column null counts, and the per-column memory usage in bytes.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    wide_rule = '--' * 55
    short_rule = '--' * 20

    # Operate on the argument, not on whatever global `df` happens to exist.
    head_df = dataframe.head()
    tail_df = dataframe.tail()
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # previews the same way. For frames of five rows or fewer the head and
    # tail overlap, so rows appear twice.
    head_tail = pd.concat([head_df, tail_df])
    print('Head and Tail\n\n', head_tail)
    print(wide_rule)

    shape_tuple = dataframe.shape
    print('Shape\n\n', shape_tuple)
    print(short_rule)

    describe_df = dataframe.describe()
    print('Describe\n\n', describe_df)
    print(short_rule)

    print('Index\n\n', dataframe.index)
    print(short_rule)

    print('Data Types\n\n', dataframe.dtypes)
    print(short_rule)

    print('Null Value Count\n\n', dataframe.isnull().sum())
    print(wide_rule)

    # memory_usage must be *called*; printing the attribute only shows the
    # bound-method repr instead of the byte counts.
    print('Memory\n\n', dataframe.memory_usage())

In [8]:
def df_value_counts(df):
    """Print value counts for every column of ``df``.

    Columns whose dtype is ``int64`` or ``float64`` and that hold more than
    ten distinct values are summarised into ten bins (printed in bin order,
    preceded by a ``<column>:`` header line). Every other column gets a plain
    ``value_counts()`` listing. Each column's output is followed by blank
    lines. Nothing is returned.
    """
    binnable_dtypes = ['int64', 'float64']
    for name in df.columns:
        series = df[name]
        distinct = len(series.unique())
        if distinct > 10 and series.dtype in binnable_dtypes:
            print('%s:' % name)
            # distinct > 10 here, so this always resolves to ten bins.
            counts = series.value_counts(bins=min(distinct, 10), sort=False)
        else:
            counts = series.value_counts()
        print(counts)
        print('\n')

# df_value_counts(df)

In [9]:
# Hold out 30% of the rows for testing; the remaining 70% are used for training.
# random_state is fixed so the same split is produced on every run.
y = df[['churn']]                  # target, kept as a one-column DataFrame
X = df.drop(columns=['churn'])     # all remaining columns

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=123
)

X_train.head()

Unnamed: 0,customer_id,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,online_backup,...,monthly_charges,total_charges,tenure_year,household_type_id,streaming_services,phone_id,gender,device_protection,tech_support,paperless_billing
1479,2187-PKZAY,0,No,No,12,Yes,No,2,No,No,...,79.95,1043.4,1.0,0,0,1,1,0,0,1
2377,3402-XRIUO,1,Yes,No,22,Yes,Yes,1,Yes,Yes,...,63.55,1381.8,1.83,2,2,2,0,0,2,1
6613,9397-TZSHA,0,No,No,69,Yes,Yes,3,No internet service,No internet service,...,24.6,1678.05,5.75,0,0,2,0,1,1,0
6468,9153-BTBVV,0,Yes,No,71,Yes,Yes,3,No internet service,No internet service,...,25.0,1753.0,5.92,2,2,2,0,1,1,0
2668,3793-MMFUH,1,No,No,13,Yes,Yes,2,No,No,...,95.05,1290.0,1.08,0,0,2,0,0,0,1


In [10]:
# Min-max scale the two charge columns. The scaler's parameters are learned
# from the training split only and then applied to both splits, so nothing
# about the test rows influences the scaling.
charge_cols = ['monthly_charges', 'total_charges']
scaler = MinMaxScaler()
scaler.fit(X_train[charge_cols])

for split in (X_train, X_test):
    split[charge_cols] = scaler.transform(split[charge_cols])

In [11]:
# Preview the training features after the charge columns were scaled.
X_train.head(n=5)

Unnamed: 0,customer_id,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,online_backup,...,monthly_charges,total_charges,tenure_year,household_type_id,streaming_services,phone_id,gender,device_protection,tech_support,paperless_billing
1479,2187-PKZAY,0,No,No,12,Yes,No,2,No,No,...,0.61393,0.120141,1.0,0,0,1,1,0,0,1
2377,3402-XRIUO,1,Yes,No,22,Yes,Yes,1,Yes,Yes,...,0.450746,0.159106,1.83,2,2,2,0,0,2,1
6613,9397-TZSHA,0,No,No,69,Yes,Yes,3,No internet service,No internet service,...,0.063184,0.193217,5.75,0,0,2,0,1,1,0
6468,9153-BTBVV,0,Yes,No,71,Yes,Yes,3,No internet service,No internet service,...,0.067164,0.201847,5.92,2,2,2,0,1,1,0
2668,3793-MMFUH,1,No,No,13,Yes,Yes,2,No,No,...,0.764179,0.148535,1.08,0,0,2,0,0,0,1
