In [1]:
import pandas as pd
import numpy as np
import os

# visualize
import seaborn as sns


# turn off pink warning boxes
import warnings
warnings.filterwarnings("ignore")

# acquire
from pydataset import data

In [2]:
# *************************************  connection url **********************************************

# Create helper function to get the necessary connection url.
def get_connection(db_name):
    '''
    This function uses my info from my env file to
    create a connection url to access the Codeup db.
    '''
    from env import host, username, password
    return f'mysql+pymysql://{username}:{password}@{host}/{db_name}'


In [3]:
# acquire data

In [4]:
#acquire data for the first time
def get_new_telco_churn():
    '''
    This function reads in the telco_churn data from the Codeup db
    and returns a pandas DataFrame with all columns and joined with other tables.
    '''
    sql_query = '''
    SELECT * FROM customers
    JOIN contract_types USING (contract_type_id)
    JOIN internet_service_types USING (internet_service_type_id)
    JOIN payment_types USING (payment_type_id)
    '''
    return pd.read_sql(sql_query, get_connection('telco_churn'))


In [5]:
#acquire data 
def get_telco_churn():
    '''
    This function reads in telco_churn data from Codeup database, writes data to
    a csv file if a local file does not exist, and returns a df.
    '''
    if os.path.isfile('telco_churn.csv'):
        
        # If csv file exists, read in data from csv file.
        df = pd.read_csv('telco_churn.csv', index_col=0)
        
    else:
        
        # Read fresh data from db into a DataFrame.
        df = get_new_telco_churn()
        
        # Write DataFrame to a csv file.
        df.to_csv('telco_churn.csv')
        
    return df

In [6]:
#acquire data with my function
df= get_telco_churn()

In [7]:
# check the information before preparation process
df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,3,0016-QLJIS,Female,0,Yes,Yes,65,Yes,...,Yes,Yes,Yes,Yes,90.45,5957.9,No,Two year,DSL,Mailed check
1,4,1,3,0017-DINOC,Male,0,No,No,54,No,...,Yes,Yes,No,No,45.2,2460.55,No,Two year,DSL,Credit card (automatic)
2,3,1,3,0019-GFNTW,Female,0,No,No,56,No,...,Yes,No,No,No,45.05,2560.1,No,Two year,DSL,Bank transfer (automatic)
3,4,1,3,0056-EPFBG,Male,0,Yes,Yes,20,No,...,Yes,No,No,Yes,39.4,825.4,No,Two year,DSL,Credit card (automatic)
4,3,1,3,0078-XZMHT,Male,0,Yes,No,72,Yes,...,Yes,Yes,Yes,Yes,85.15,6316.2,No,Two year,DSL,Bank transfer (automatic)


In [8]:
df.shape

(7043, 24)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

# preparation 

In [10]:
#checking nulls
df.isnull().sum()

payment_type_id             0
internet_service_type_id    0
contract_type_id            0
customer_id                 0
gender                      0
senior_citizen              0
partner                     0
dependents                  0
tenure                      0
phone_service               0
multiple_lines              0
online_security             0
online_backup               0
device_protection           0
tech_support                0
streaming_tv                0
streaming_movies            0
paperless_billing           0
monthly_charges             0
total_charges               0
churn                       0
contract_type               0
internet_service_type       0
payment_type                0
dtype: int64

In [11]:
df.isna().sum()

payment_type_id             0
internet_service_type_id    0
contract_type_id            0
customer_id                 0
gender                      0
senior_citizen              0
partner                     0
dependents                  0
tenure                      0
phone_service               0
multiple_lines              0
online_security             0
online_backup               0
device_protection           0
tech_support                0
streaming_tv                0
streaming_movies            0
paperless_billing           0
monthly_charges             0
total_charges               0
churn                       0
contract_type               0
internet_service_type       0
payment_type                0
dtype: int64

In [12]:
#checking the type of  the columns. I notice total_charges is an object type
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

In [13]:
#we cannot conver total_charges to float because there are ' '
df['total_charges'].astype('float')

ValueError: could not convert string to float: ''

In [14]:
#take a look at the rows with space
df['total_charges'][df['total_charges']== ' ']

85       
156      
236      
255      
339      
5681     
5717     
5727     
5798     
6007     
6257     
Name: total_charges, dtype: object

In [16]:
#we will replace space with '0' for only these columns
df['total_charges'][df['total_charges']== ' ']= df['total_charges'][df['total_charges']== ' '].replace(' ', '0')

In [17]:
#checking if we replace it
df['total_charges'][df['total_charges']== ' ']

Series([], Name: total_charges, dtype: object)

In [None]:
#in order to conver total_charges to float, I need will add '0' to ' '
#df['total_charges'] = df['total_charges'] + '0'

In [18]:
#convert to a float type
df['total_charges'] = df['total_charges'].astype('float')
df['total_charges'].dtype

dtype('float64')

In [19]:
#checking for duplicates
df.duplicated().sum()

0

In [20]:
#checking again
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

In [21]:
#check the columns yes?no columns
df.head(1)

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,3,0016-QLJIS,Female,0,Yes,Yes,65,Yes,...,Yes,Yes,Yes,Yes,90.45,5957.9,No,Two year,DSL,Mailed check


In [22]:
#all the columns that have yes/no
col_list = ['partner', 'dependents','phone_service', 'paperless_billing','churn' ]

In [23]:
#check all the values for these columns
for col in col_list:
    print(col)
    print(df[col].value_counts())
    print('__________________________')

partner
No     3641
Yes    3402
Name: partner, dtype: int64
__________________________
dependents
No     4933
Yes    2110
Name: dependents, dtype: int64
__________________________
phone_service
Yes    6361
No      682
Name: phone_service, dtype: int64
__________________________
paperless_billing
Yes    4171
No     2872
Name: paperless_billing, dtype: int64
__________________________
churn
No     5174
Yes    1869
Name: churn, dtype: int64
__________________________


In [24]:
#checking if we can change no/yes to 0/1 and compare above
(df.churn == 'Yes').astype(int).value_counts()

0    5174
1    1869
Name: churn, dtype: int64

In [25]:
#using this code we can change it
(df[col_list] == 'Yes').astype(int)

Unnamed: 0,partner,dependents,phone_service,paperless_billing,churn
0,1,1,1,1,0
1,0,0,0,0,0
2,0,0,0,0,0
3,1,1,0,1,0
4,1,0,1,1,0
...,...,...,...,...,...
7038,1,1,1,0,0
7039,0,0,1,0,0
7040,0,0,1,0,0
7041,0,0,1,0,0


In [28]:
# changgin the value no/yes to 0/1
df[col_list] = (df[col_list] == 'Yes').astype(int)

In [29]:
df[col_list].head()

Unnamed: 0,partner,dependents,phone_service,paperless_billing,churn
0,1,1,1,1,0
1,0,0,0,0,0
2,0,0,0,0,0
3,1,1,0,1,0
4,1,0,1,1,0


In [30]:
#check all the values for these columns to make sure it is correct
for col in col_list:
    print(col)
    print(df[col].value_counts(dropna = False))
    print('__________________________')

partner
0    3641
1    3402
Name: partner, dtype: int64
__________________________
dependents
0    4933
1    2110
Name: dependents, dtype: int64
__________________________
phone_service
1    6361
0     682
Name: phone_service, dtype: int64
__________________________
paperless_billing
1    4171
0    2872
Name: paperless_billing, dtype: int64
__________________________
churn
0    5174
1    1869
Name: churn, dtype: int64
__________________________


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   int64  
 7   dependents                7043 non-null   int64  
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   int64  
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

In [32]:
#selecting all the columns that are object type
df.select_dtypes('object').columns

Index(['customer_id', 'gender', 'multiple_lines', 'online_security',
       'online_backup', 'device_protection', 'tech_support', 'streaming_tv',
       'streaming_movies', 'contract_type', 'internet_service_type',
       'payment_type'],
      dtype='object')

In [None]:
#make a list of these type of columns , excluding 'customer_id'
list(df.select_dtypes('object').columns)[1:]

In [33]:
#checking the value of each columns
col_list = list(df.select_dtypes('object').columns)[1:]
for col in col_list:
    print(col)
    print(df[col].value_counts())
    print('__________________________')

gender
Male      3555
Female    3488
Name: gender, dtype: int64
__________________________
multiple_lines
No                  3390
Yes                 2971
No phone service     682
Name: multiple_lines, dtype: int64
__________________________
online_security
No                     3498
Yes                    2019
No internet service    1526
Name: online_security, dtype: int64
__________________________
online_backup
No                     3088
Yes                    2429
No internet service    1526
Name: online_backup, dtype: int64
__________________________
device_protection
No                     3095
Yes                    2422
No internet service    1526
Name: device_protection, dtype: int64
__________________________
tech_support
No                     3473
Yes                    2044
No internet service    1526
Name: tech_support, dtype: int64
__________________________
streaming_tv
No                     2810
Yes                    2707
No internet service    1526
Name: streamin

In [35]:
#the columns 'online_security','online_backup','device_protection','tech_support','streaming_tv','streaming_movies'
#have the same values 
col_list[2:8]

['online_security',
 'online_backup',
 'device_protection',
 'tech_support',
 'streaming_tv',
 'streaming_movies']

In [36]:
#create a dicttionary that have the values that we are going to change
var= {
    'No':0,
    'Yes':1,
    'No internet service':3
}

In [37]:
#using .map and for loop to change the values of each column on the list

In [38]:
for col in col_list[2:8]:
      df[col]= df[col].map(var) 


In [39]:
#check the changes
df[col_list[2:8]].head()

Unnamed: 0,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies
0,1,1,1,1,1,1
1,1,0,0,1,1,0
2,1,1,1,1,0,0
3,1,0,1,1,0,0
4,0,1,1,1,1,1


In [41]:
#checking again the list of columns as 'object type'
df.select_dtypes('object').columns

Index(['customer_id', 'gender', 'multiple_lines', 'contract_type',
       'internet_service_type', 'payment_type'],
      dtype='object')

In [42]:
#I decide to convert this column to a values
df['multiple_lines'].unique()

array(['Yes', 'No phone service', 'No'], dtype=object)

In [43]:
#replace the values for numeric
df.replace({'multiple_lines': {'No':1, 'Yes':2, 'No phone service': 0}}, inplace=True)


In [44]:
df['multiple_lines'].unique()

array([2, 0, 1])

In [47]:
# make a list of the columns that are object type , exclude "customer_id"
col_list = list(df.select_dtypes('object').columns)[1:]

In [48]:
col_list

['gender', 'contract_type', 'internet_service_type', 'payment_type']

In [50]:
#create a dummy df of each column on the list
for col in col_list:
    dummy_df = pd.get_dummies(df[col], drop_first=True)
    df = pd.concat([df, dummy_df], axis=1)

In [51]:
# drop the columns that we already use to create dummy_df
df.drop(columns=col_list, inplace=True)

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   senior_citizen            7043 non-null   int64  
 5   partner                   7043 non-null   int64  
 6   dependents                7043 non-null   int64  
 7   tenure                    7043 non-null   int64  
 8   phone_service             7043 non-null   int64  
 9   multiple_lines            7043 non-null   int64  
 10  online_security           7043 non-null   int64  
 11  online_backup             7043 non-null   int64  
 12  device_protection         7043 non-null   int64  
 13  tech_support              7043 non-null   int64  
 14  streamin

In [53]:
#there is a column named None, that means no internet service. 
#0 = false (no internet service)
#1 = true (no internet service)

df['None'].value_counts()

0    5517
1    1526
Name: None, dtype: int64

In [54]:
# I will rename the column as has_internet
df.rename(columns={'None':'has_internet'}, inplace= True )

In [58]:
#checking the change
df['has_internet'].value_counts()

0    5517
1    1526
Name: has_internet, dtype: int64

In [59]:
#to match the values we want 
#0  = false (has internet)
#1 = true (has_internet)
df['has_internet'] = df['has_internet'].replace({0: 1, 1: 0})

In [60]:
df['has_internet'].value_counts()

1    5517
0    1526
Name: has_internet, dtype: int64

In [89]:
#creating my function

def clean_data(df):
    '''
    This function will drop payment_type_id', 'internet_service_type_id','contract_type_id', 
    convert all the columns that have yes/no to 0/1, 
    create dummy vars from 'gender', 'contract_type', 'internet_service_type', 'payment_type',
    change total_charges to a float type. 
    '''

    #clean data
    # conver total_charges to float
    df['total_charges'][df['total_charges']== ' ']= df['total_charges'][df['total_charges']== ' '].replace(' ', '0')
    df['total_charges'] = df['total_charges'].astype('float')
    
    #convert all the columns that have yes/no to 0/1
    col_list = ['partner', 'dependents','phone_service', 'paperless_billing','churn' ]
    df[col_list] = (df[col_list] == 'Yes').astype(int)
    
    #change columns to 0,1,2
    #getting a list of the  columns that I want to change
    col_list = list(df.select_dtypes('object').columns)[1:]
    #create a dicttionary to change the value
    var= {
        'No':0,
        'Yes':1,
        'No internet service':3
    }
    #use a for loop to change every column
    for col in col_list[2:8]:
      df[col]= df[col].map(var) 
    
    #replace the values of multiple_lines
    df.replace({'multiple_lines': {'No':1, 'Yes':2, 'No phone service': 0}}, inplace=True)
    
    #create a dummy df
    col_list = list(df.select_dtypes('object').columns)[1:]
    #create a dummy df
    for col in col_list:
        dummy_df = pd.get_dummies(df[col])
         ## Concatenate the dummy_df dataframe above with the original df
        df = pd.concat([df, dummy_df], axis=1)
    # drop the columns that we already use to create dummy_df
    df.drop(columns=col_list, inplace=True)
    
    #drop duplicates columns
    df.drop(columns = ['payment_type_id', 'internet_service_type_id','contract_type_id'], inplace=True)
    
    #  rename the column as has_internet
    df.rename(columns={'None':'has_internet'}, inplace= True )
    #changing the values to undestand better the meaning
    df['has_internet'] = df['has_internet'].replace({0: 1, 1: 0})
    # columns name change
    df_clean.columns = [col.lower().replace(' ', '_').replace('-','_') for col in df_clean]
    df_clean.columns
    return df


In [122]:
#acquire data
df = get_telco_churn()

In [123]:
df.shape

(7043, 24)

In [124]:
df.columns

Index(['payment_type_id', 'internet_service_type_id', 'contract_type_id',
       'customer_id', 'gender', 'senior_citizen', 'partner', 'dependents',
       'tenure', 'phone_service', 'multiple_lines', 'online_security',
       'online_backup', 'device_protection', 'tech_support', 'streaming_tv',
       'streaming_movies', 'paperless_billing', 'monthly_charges',
       'total_charges', 'churn', 'contract_type', 'internet_service_type',
       'payment_type'],
      dtype='object')

In [125]:
#use my clean function
df_clean = clean_data(df)

In [126]:
df_clean.shape

(7043, 29)

In [119]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customer_id                7043 non-null   object 
 1   senior_citizen             7043 non-null   int64  
 2   partner                    7043 non-null   int64  
 3   dependents                 7043 non-null   int64  
 4   tenure                     7043 non-null   int64  
 5   phone_service              7043 non-null   int64  
 6   multiple_lines             7043 non-null   int64  
 7   online_security            7043 non-null   int64  
 8   online_backup              7043 non-null   int64  
 9   device_protection          7043 non-null   int64  
 10  tech_support               7043 non-null   int64  
 11  streaming_tv               7043 non-null   int64  
 12  streaming_movies           7043 non-null   int64  
 13  paperless_billing          7043 non-null   int64

In [132]:
# columns name change
df_clean.columns = [col.lower().replace(' ', '_').replace('-','_') for col in df_clean]
df_clean.columns

Index(['customer_id', 'senior_citizen', 'partner', 'dependents', 'tenure',
       'phone_service', 'multiple_lines', 'online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
       'paperless_billing', 'monthly_charges', 'total_charges', 'churn',
       'female', 'male', 'month_to_month', 'one_year', 'two_year', 'dsl',
       'fiber_optic', 'has_internet', 'bank_transfer_(automatic)',
       'credit_card_(automatic)', 'electronic_check', 'mailed_check'],
      dtype='object')

In [133]:
#calculate corr
df_corr = df_clean.corr()

In [134]:
df_corr

Unnamed: 0,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,...,month_to_month,one_year,two_year,dsl,fiber_optic,has_internet,bank_transfer_(automatic),credit_card_(automatic),electronic_check,mailed_check
senior_citizen,1.0,0.016479,-0.211185,0.016567,0.008576,0.113791,-0.208709,-0.170002,-0.172926,-0.217566,...,0.13836,-0.046262,-0.117,-0.108322,0.255338,0.182742,-0.016159,-0.024135,0.171718,-0.153477
partner,0.016479,1.0,0.452676,0.379697,0.017706,0.117307,0.056157,0.05954,0.064584,0.04742,...,-0.280865,0.082783,0.248091,-0.000851,0.000304,-0.000615,0.110706,0.082029,-0.083852,-0.095125
dependents,-0.211185,0.452676,1.0,0.159712,-0.001762,-0.019657,0.179614,0.161106,0.157003,0.173036,...,-0.23172,0.068368,0.204613,0.05201,-0.165818,-0.139812,0.052021,0.060267,-0.150642,0.059071
tenure,0.016567,0.379697,0.159712,1.0,0.008448,0.258958,0.0855,0.107643,0.107656,0.084902,...,-0.645561,0.20257,0.558533,0.013274,0.01972,0.039062,0.24351,0.233006,-0.208363,-0.233852
phone_service,0.008576,0.017706,-0.001762,0.008448,1.0,0.67507,0.146522,0.16454,0.156631,0.145215,...,-0.000742,-0.002791,0.003519,-0.452425,0.289999,-0.172209,0.007556,-0.007721,0.003062,-0.003319
multiple_lines,0.113791,0.117307,-0.019657,0.258958,0.67507,1.0,-0.074857,-0.034674,-0.038667,-0.074777,...,-0.068119,-0.004199,0.083266,-0.361806,0.414749,0.08256,0.061513,0.04259,0.065663,-0.176117
online_security,-0.208709,0.056157,0.179614,0.0855,0.146522,-0.074857,1.0,0.884432,0.8832,0.902492,...,-0.327446,0.079135,0.305769,-0.278888,-0.505673,-0.930746,0.034667,0.045975,-0.3456,0.309323
online_backup,-0.170002,0.05954,0.161106,0.107643,0.16454,-0.034674,0.884432,1.0,0.879558,0.885567,...,-0.304844,0.07595,0.282497,-0.346101,-0.434998,-0.923087,0.033914,0.038887,-0.308409,0.275201
device_protection,-0.172926,0.064584,0.157003,0.107656,0.156631,-0.038667,0.8832,0.879558,1.0,0.892017,...,-0.33024,0.0837,0.304677,-0.350834,-0.43056,-0.923197,0.03226,0.047479,-0.309516,0.269657
tech_support,-0.217566,0.04742,0.173036,0.084902,0.145215,-0.074777,0.902492,0.885567,0.892017,1.0,...,-0.343202,0.077659,0.325508,-0.281927,-0.50232,-0.93021,0.037218,0.046797,-0.34713,0.307727


In [98]:
df_corr.shape

(28, 28)

In [99]:
# I just wat to see churn
df_corr['churn']

senior_citizen               0.150889
partner                     -0.150448
dependents                  -0.164221
tenure                      -0.352229
phone_service                0.011942
multiple_lines               0.036310
online_security             -0.307989
online_backup               -0.280770
device_protection           -0.273951
tech_support                -0.306032
streaming_tv                -0.223123
streaming_movies            -0.224214
paperless_billing            0.191825
monthly_charges              0.193356
total_charges               -0.198324
churn                        1.000000
Female                       0.008612
Male                        -0.008612
Month-to-month               0.405103
One year                    -0.177820
Two year                    -0.302253
DSL                         -0.124214
Fiber optic                  0.308020
has_internet                 0.227890
Bank transfer (automatic)   -0.117937
Credit card (automatic)     -0.134302
Electronic c

In [101]:
df_corr['churn'].sort_values(ascending=False)

churn                        1.000000
Month-to-month               0.405103
Fiber optic                  0.308020
Electronic check             0.301919
has_internet                 0.227890
monthly_charges              0.193356
paperless_billing            0.191825
senior_citizen               0.150889
multiple_lines               0.036310
phone_service                0.011942
Female                       0.008612
Male                        -0.008612
Mailed check                -0.091683
Bank transfer (automatic)   -0.117937
DSL                         -0.124214
Credit card (automatic)     -0.134302
partner                     -0.150448
dependents                  -0.164221
One year                    -0.177820
total_charges               -0.198324
streaming_tv                -0.223123
streaming_movies            -0.224214
device_protection           -0.273951
online_backup               -0.280770
Two year                    -0.302253
tech_support                -0.306032
online_secur

In [104]:
df_clean.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
senior_citizen,7043.0,0.162147,0.368612,0.0,0.0,0.0,0.0,1.0
partner,7043.0,0.483033,0.499748,0.0,0.0,0.0,1.0,1.0
dependents,7043.0,0.299588,0.45811,0.0,0.0,0.0,1.0,1.0
tenure,7043.0,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
phone_service,7043.0,0.903166,0.295752,0.0,1.0,1.0,1.0,1.0
multiple_lines,7043.0,1.325004,0.64273,0.0,1.0,1.0,2.0,2.0
online_security,7043.0,0.936675,1.165986,0.0,0.0,1.0,1.0,3.0
online_backup,7043.0,0.994889,1.142491,0.0,0.0,1.0,1.0,3.0
device_protection,7043.0,0.993895,1.142921,0.0,0.0,1.0,1.0,3.0
tech_support,7043.0,0.940224,1.16465,0.0,0.0,1.0,1.0,3.0


In [108]:
df_clean['Female'].sum()

3488

In [109]:
df_clean['Male'].sum()

3555

In [110]:
df_clean['senior_citizen'].sum()

1142

In [113]:
df_clean.columns()

Index(['customer_id', 'senior_citizen', 'partner', 'dependents', 'tenure',
       'phone_service', 'multiple_lines', 'online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
       'paperless_billing', 'monthly_charges', 'total_charges', 'churn',
       'Female', 'Male', 'Month-to-month', 'One year', 'Two year', 'DSL',
       'Fiber optic', 'has_internet', 'Bank transfer (automatic)',
       'Credit card (automatic)', 'Electronic check', 'Mailed check'],
      dtype='object')