In [16]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

from acquire import get_telco_data
from telco_prep import drop_cols

In [17]:
df = get_telco_data()

In [18]:
'''
misc functions for working with the titanic and iris dbase
'''
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# binning and value_counts: 
def value_counts(dataframe):
    for col in dataframe.drop(columns=('customer_id')):
        if np.issubdtype(dataframe[col].dtype, np.number) and dataframe[col].nunique() > 10:
            print(dataframe[col].value_counts(bins=10, sort=False))
        else: 
            print(dataframe[col].value_counts(sort=False))


def handle_missing_values(dataframe):
    return dataframe.assign(
        total_charges = dataframe.total_charges.dropna(0)
    )

def churn_num(dataframe):
    return dataframe.assign(
        churn = dataframe['churn'].map({'No': 0, 'Yes': 1})

    )

def tenure_year(dataframe):
    dataframe['tenure_year'] = (dataframe.tenure / 12).astype(int) + 1
    return dataframe

choices_lns = [2, 1, 0]
choices = [3, 2, 1, 0]




def conditional_encodes(df):
    conditions_secback = [
    (df['online_security'] == 'Yes') & (df['online_backup'] == 'Yes'),
    (df['online_security'] == 'No') & (df['online_backup'] == 'Yes'), 
    (df['online_security'] == 'Yes') & (df['online_backup'] == 'No'),
    (df['online_security'] == 'No') & (df['online_backup'] == 'No')
    ]

    conditions_strm = [
    (df['streaming_tv'] == 'Yes') & (df['streaming_movies'] == 'Yes'),
    (df['streaming_tv'] == 'No') & (df['streaming_movies'] == 'Yes'), 
    (df['streaming_tv'] == 'Yes') & (df['streaming_movies'] == 'No'),
    (df['streaming_tv'] == 'No') & (df['streaming_movies'] == 'No')
    ]

    conditions_pdep = [
    (df['partner'] == 'Yes') & (df['dependents'] == 'Yes'),
    (df['partner'] == 'No') & (df['dependents'] == 'Yes'), 
    (df['partner'] == 'Yes') & (df['dependents'] == 'No'),
    (df['partner'] == 'No') & (df['dependents'] == 'No')
    ]

    conditions_lns = [
    (df['phone_service'] == 'Yes') & (df['multiple_lines'] == 'Yes'),
    (df['phone_service'] == 'Yes') & (df['multiple_lines'] == 'No'),
    (df['phone_service'] == 'No') & (df['multiple_lines'] == 'No')
    ]

    df['multiple_lines'] = np.select(conditions_lns, choices_lns)
    df['household_type_id'] = np.select(conditions_pdep, choices)
    df['streaming_services'] = np.select(conditions_strm, choices)
    df['online_security_backup'] = np.select(conditions_secback, choices)
    return df

def encode_gender(df):
    encoder=LabelEncoder()
    encoder.fit(df.gender)
    return df.assign(gender_e = encoder.transform(df.gender))

def encode_tech(df):
    encoder=LabelEncoder()
    encoder.fit(df.tech_support)
    return df.assign(tech_support_e = encoder.transform(df.tech_support))

def encode_paperless(df):
    encoder=LabelEncoder()
    encoder.fit(df.paperless_billing)
    return df.assign(paperless_billing_e = encoder.transform(df.paperless_billing))

def encode_device_protection(df):
    encoder=LabelEncoder()
    encoder.fit(df.device_protection)
    return df.assign(device_protection_e = encoder.transform(df.device_protection))

def format_totals(df):
    df['total_charges'] = df['total_charges'].convert_objects(convert_numeric=True)
    df.total_charges.dropna(0, inplace=True)
    return df

def drop_cols(df):
    return df.drop(columns=(['customer_id', 'partner', 'dependents', 'phone_service',
    'multiple_lines', 'online_security', 'online_backup',
    'streaming_tv', 'gender','streaming_movies', 'contract_type',
    'internet_service_type', 'payment_type', 'tech_support', 'paperless_billing', 'device_protection']))

def prep_telco_data(df):
    return df.pipe(handle_missing_values)\
    .pipe(churn_num)\
    .pipe(tenure_year)\
    .pipe(conditional_encodes)\
    .pipe(encode_gender)\
    .pipe(encode_device_protection)\
    .pipe(encode_tech)\
    .pipe(encode_paperless)\
    .pipe(format_totals)


In [19]:
df = prep_telco_data(df)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.


In [20]:
df.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,internet_service_type,payment_type,tenure_year,household_type_id,streaming_services,online_security_backup,gender_e,device_protection_e,tech_support_e,paperless_billing_e
0,0003-MKNFE,Male,0,No,No,9,Yes,2,1,No,...,DSL,Mailed check,1,0,2,0,1,0,0,0
1,0013-MHZWF,Female,0,No,Yes,9,Yes,1,1,No,...,DSL,Credit card (automatic),1,2,3,0,0,0,2,1
2,0015-UOCOJ,Female,1,No,No,7,Yes,1,1,Yes,...,DSL,Electronic check,1,0,0,1,0,0,0,1
3,0023-HGHWL,Male,1,No,No,1,No,0,1,No,...,DSL,Electronic check,1,0,0,0,1,0,0,1
4,0032-PGELS,Female,0,Yes,Yes,1,No,0,1,Yes,...,DSL,Bank transfer (automatic),1,3,0,1,0,0,0,0


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 32 columns):
customer_id                 7043 non-null object
gender                      7043 non-null object
senior_citizen              7043 non-null int64
partner                     7043 non-null object
dependents                  7043 non-null object
tenure                      7043 non-null int64
phone_service               7043 non-null object
multiple_lines              7043 non-null int64
internet_service_type_id    7043 non-null int64
online_security             7043 non-null object
online_backup               7043 non-null object
device_protection           7043 non-null object
tech_support                7043 non-null object
streaming_tv                7043 non-null object
streaming_movies            7043 non-null object
contract_type_id            7043 non-null int64
paperless_billing           7043 non-null object
payment_type_id             7043 non-null int64
monthly_charges  

In [23]:
df = drop_cols(df)

In [24]:
df.head()

Unnamed: 0,senior_citizen,tenure,internet_service_type_id,contract_type_id,payment_type_id,monthly_charges,total_charges,churn,tenure_year,household_type_id,streaming_services,online_security_backup,gender_e,device_protection_e,tech_support_e,paperless_billing_e
0,0,9,1,1,2,59.9,542.4,0,1,0,2,0,1,0,0,0
1,0,9,1,1,4,69.4,571.45,0,1,2,3,0,0,0,2,1
2,1,7,1,1,1,48.2,340.35,0,1,0,0,1,0,0,0,1
3,1,1,1,1,1,25.1,25.1,1,1,0,0,0,1,0,0,1
4,0,1,1,1,3,30.5,30.5,1,1,3,0,1,0,0,0,0


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
senior_citizen              7043 non-null int64
tenure                      7043 non-null int64
internet_service_type_id    7043 non-null int64
contract_type_id            7043 non-null int64
payment_type_id             7043 non-null int64
monthly_charges             7043 non-null float64
total_charges               7032 non-null float64
churn                       7043 non-null int64
tenure_year                 7043 non-null int64
household_type_id           7043 non-null int64
streaming_services          7043 non-null int64
online_security_backup      7043 non-null int64
gender_e                    7043 non-null int64
device_protection_e         7043 non-null int64
tech_support_e              7043 non-null int64
paperless_billing_e         7043 non-null int64
dtypes: float64(2), int64(14)
memory usage: 880.5 KB


In [26]:
# 11: Numeric Scaling: scale the monthly_charges and total_charges data. 
# Make sure that the parameters for scaling are learned from the training data set.

# ~~~~~SPLIT~~~~~
# ~~This is also #9 ~~
X = df.drop(['churn'], axis = 1)
y = df[['churn']]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.70, random_state=123)

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

scaler = MinMaxScaler()
scaler.fit(train[['monthly_charges', 'total_charges']])

train[['monthly_charges', 'total_charges']] = scaler.transform(train[['monthly_charges', 'total_charges']])
test[['monthly_charges', 'total_charges']] = scaler.transform(test[['monthly_charges', 'total_charges']])




In [27]:
train.head()

Unnamed: 0,senior_citizen,tenure,internet_service_type_id,contract_type_id,payment_type_id,monthly_charges,total_charges,tenure_year,household_type_id,streaming_services,online_security_backup,gender_e,device_protection_e,tech_support_e,paperless_billing_e,churn
1479,0,52,1,2,1,0.502488,0.399729,5,3,1,2,1,2,2,1,0
2377,0,59,1,3,4,0.716915,0.59719,5,3,3,3,0,2,2,0,0
6613,0,46,3,3,3,0.0199,0.100571,4,3,0,0,0,1,1,0,0
6468,0,55,3,3,3,0.074129,0.164418,5,1,0,0,1,1,1,0,0
2668,0,10,2,1,1,0.61393,0.096746,1,1,1,0,1,0,0,1,1


In [None]:
# 10: Variable Encoding: encode the values in each non-numeric 
# feature such that they are numeric.