In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder


from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score


from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [2]:
# Load the project dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('1_-_Project_Data.csv')

In [3]:
# Lift the column display cap so wide frames render every column instead of eliding the middle ones.
pd.options.display.max_columns = None

In [4]:
# Preview the first rows to check that the CSV parsed into the expected columns.
df.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,No,No,Yes,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,No,No,Yes,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,No,Yes,Yes,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,No,No,Yes,49,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,Competitor had better devices


In [5]:
# (rows, columns) of the raw frame, as a baseline before any columns are dropped.
df.shape

(7043, 31)

In [6]:
# Inspect dtypes and non-null counts to see which columns are still text and need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         7043 non-null   object 
 1   Count              7043 non-null   int64  
 2   Country            7043 non-null   object 
 3   State              7043 non-null   object 
 4   City               7043 non-null   object 
 5   Zip Code           7043 non-null   int64  
 6   Lat Long           7043 non-null   object 
 7   Latitude           7043 non-null   float64
 8   Longitude          7043 non-null   float64
 9   Gender             7043 non-null   object 
 10  Senior Citizen     7043 non-null   object 
 11  Partner            7043 non-null   object 
 12  Dependents         7043 non-null   object 
 13  Tenure Months      7043 non-null   int64  
 14  Phone Service      7043 non-null   object 
 15  Multiple Lines     7043 non-null   object 
 16  Internet Service   7043 

In [30]:
# Print each column name followed by its number of distinct values (NaN counts
# as a value), to tell identifiers and continuous features apart from
# low-cardinality categoricals.
# NOTE(review): the saved output lists 'City Number', which only exists after
# cleaning_fct has run, so this cell was last executed after the cleaning step
# even though it sits above it; a top-to-bottom run reports the raw columns.
for x in df.columns:
    print(x)
    print(df[x].nunique(dropna=False))

CustomerID
7043
City Number
1129
Zip Code
1652
Lat Long
1652
Latitude
1652
Longitude
1651
Gender
2
Senior Citizen
2
Partner
2
Dependents
2
Tenure Months
73
Phone Service
2
Multiple Lines
3
Internet Service
3
Online Security
3
Online Backup
3
Device Protection
3
Tech Support
3
Streaming TV
3
Streaming Movies
3
Contract
3
Paperless Billing
2
Payment Method
4
Monthly Charges
1585
Total Charges
6531
Churn Value
2


In [8]:
def train_encode(df):
    """Fit a fresh ``LabelEncoder`` on the 'City' column and return it.

    Parameters
    ----------
    df : mapping of column name to values (e.g. a pandas DataFrame)
        Must provide a 'City' entry holding the city labels to learn.

    Returns
    -------
    LabelEncoder
        The fitted encoder; call ``transform`` on it to turn city labels
        into integer codes.
    """
    city_encoder = LabelEncoder()
    # ``fit`` hands back the encoder itself, so the fitted object is returned directly.
    return city_encoder.fit(df['City'])

In [9]:
# Fit the city encoder on the raw frame. cleaning_fct looks up this module-level
# `le`, so this cell has to run before the cleaning cell (which drops 'City').
le = train_encode(df)

In [19]:
def cleaning_fct(df, encoder=None):
    """Encode the categorical columns as integers and keep the modelling columns.

    The input frame is copied first, so the caller's frame is never modified;
    a call that fails part-way (for example on an already-cleaned frame that
    no longer has 'City') leaves the caller's data intact.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Must contain 'City' and every column named in ``cols``
        below except 'City Number', which is created here.
    encoder : object with a ``transform`` method, optional
        Fitted city encoder used to build 'City Number'. When omitted, the
        module-level ``le`` is used, which keeps ``cleaning_fct(df)`` working
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the 26 columns in ``cols``. Category values that
        are missing from a mapping come out as NaN (``Series.map`` behaviour).

    Raises
    ------
    KeyError
        If ``df`` lacks 'City' or any of the mapped/selected columns.
    """
    yes_no = {'No': 0, 'Yes': 1}
    internet_addon = {'No': 0, 'Yes': 1, 'No internet service': 2}

    # One entry per categorical column: original label -> integer code.
    value_maps = {
        'Gender': {'Female': 0, 'Male': 1},
        'Senior Citizen': yes_no,
        'Partner': yes_no,
        'Dependents': yes_no,
        'Phone Service': yes_no,
        'Multiple Lines': {'No': 0, 'Yes': 1, 'No phone service': 2},
        'Internet Service': {'DSL': 0, 'Fiber optic': 1, 'No': 2},
        'Online Security': internet_addon,
        'Online Backup': internet_addon,
        'Device Protection': internet_addon,
        'Tech Support': internet_addon,
        'Streaming TV': internet_addon,
        'Streaming Movies': internet_addon,
        # NOTE(review): these codes are not ordered by contract length
        # ('Two year' < 'One year'); kept as-is so existing results stay comparable.
        'Contract': {'Month-to-month': 0, 'Two year': 1, 'One year': 2},
        'Paperless Billing': yes_no,
        'Payment Method': {'Mailed check': 0, 'Electronic check': 1,
                           'Bank transfer (automatic)': 2,
                           'Credit card (automatic)': 3},
    }

    if encoder is None:
        # Backward-compatible default: the encoder fitted at notebook level.
        encoder = le

    # Work on a copy so the caller's frame is not encoded in place.
    cleaned = df.copy()
    for column, mapping in value_maps.items():
        cleaned[column] = cleaned[column].map(mapping)

    cleaned['City Number'] = encoder.transform(cleaned['City'])

    # NOTE(review): 'Total Charges' is passed through untouched; the notebook's
    # sorted preview shows blank entries for customers with 0 tenure months, so
    # it probably needs numeric conversion before modelling - confirm.
    cols = ['CustomerID', 'City Number', 'Zip Code',
       'Lat Long', 'Latitude', 'Longitude', 'Gender', 'Senior Citizen',
       'Partner', 'Dependents', 'Tenure Months', 'Phone Service',
       'Multiple Lines', 'Internet Service', 'Online Security',
       'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
       'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
       'Monthly Charges', 'Total Charges', 'Churn Value']
    return cleaned[cols]



In [21]:
# Replace the raw frame with its encoded version. The result no longer has the
# 'City' column that cleaning_fct reads, so this cell cannot be re-run on its
# own: reload the CSV (and re-run the cells above) first.
df = cleaning_fct(df)

In [23]:
# Check that the encoded columns are now numeric and that the mappings introduced no nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         7043 non-null   object 
 1   City Number        7043 non-null   int32  
 2   Zip Code           7043 non-null   int64  
 3   Lat Long           7043 non-null   object 
 4   Latitude           7043 non-null   float64
 5   Longitude          7043 non-null   float64
 6   Gender             7043 non-null   int64  
 7   Senior Citizen     7043 non-null   int64  
 8   Partner            7043 non-null   int64  
 9   Dependents         7043 non-null   int64  
 10  Tenure Months      7043 non-null   int64  
 11  Phone Service      7043 non-null   int64  
 12  Multiple Lines     7043 non-null   int64  
 13  Internet Service   7043 non-null   int64  
 14  Online Security    7043 non-null   int64  
 15  Online Backup      7043 non-null   int64  
 16  Device Protection  7043 

In [25]:
# List the columns kept by cleaning_fct.
df.columns

Index(['CustomerID', 'City Number', 'Zip Code', 'Lat Long', 'Latitude',
       'Longitude', 'Gender', 'Senior Citizen', 'Partner', 'Dependents',
       'Tenure Months', 'Phone Service', 'Multiple Lines', 'Internet Service',
       'Online Security', 'Online Backup', 'Device Protection', 'Tech Support',
       'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charges', 'Total Charges', 'Churn Value'],
      dtype='object')

In [27]:
# Preview the cleaned frame to spot-check the integer encodings against the raw preview.
df.head()

Unnamed: 0,CustomerID,City Number,Zip Code,Lat Long,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Value
0,3668-QPYBK,562,90003,"33.964131, -118.272783",33.964131,-118.272783,1,0,0,0,2,1,0,0,1,1,0,0,0,0,0,1,0,53.85,108.15,1
1,9237-HQITU,562,90005,"34.059281, -118.30742",34.059281,-118.30742,0,0,0,1,2,1,0,1,0,0,0,0,0,0,0,1,1,70.7,151.65,1
2,9305-CDSKC,562,90006,"34.048013, -118.293953",34.048013,-118.293953,0,0,0,1,8,1,1,1,0,0,1,0,1,1,0,1,1,99.65,820.5,1
3,7892-POOKP,562,90010,"34.062125, -118.315709",34.062125,-118.315709,0,0,1,1,28,1,1,1,0,0,1,1,1,1,0,1,1,104.8,3046.05,1
4,0280-XJGEX,562,90015,"34.039224, -118.266293",34.039224,-118.266293,1,0,0,1,49,1,1,1,0,1,1,0,1,1,0,1,2,103.7,5036.3,1


In [69]:
# Look at the 20 smallest 'Total Charges' entries. In the saved output the top
# rows have a blank 'Total Charges' and 0 tenure months.
# NOTE(review): blanks sorting first suggests the column is still text rather
# than numeric - confirm the dtype and convert before modelling.
df.sort_values('Total Charges').head(20)

Unnamed: 0,CustomerID,City Number,Zip Code,Lat Long,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Value
4331,7644-OMVMY,562,90029,"34.089953, -118.294824",34.089953,-118.294824,1,0,1,1,0,1,0,2,2,2,2,2,2,2,1,0,0,19.85,,0
6840,2775-SEFEE,1100,90744,"33.782068, -118.262263",33.782068,-118.262263,1,0,0,1,0,1,1,0,1,1,0,1,0,0,1,1,2,61.9,,0
6772,4075-WKNIU,66,90201,"33.970343, -118.171368",33.970343,-118.171368,0,0,1,1,0,1,1,0,0,1,1,1,1,0,1,0,0,73.35,,0
4687,3213-VVOLG,982,92585,"33.739412, -117.173334",33.739412,-117.173334,1,0,1,1,0,1,1,2,2,2,2,2,2,2,1,0,0,25.35,,0
2856,1371-DWPAZ,820,95569,"40.363446, -123.835041",40.363446,-123.835041,0,0,1,0,0,0,2,0,1,1,1,1,1,0,1,0,3,56.05,,0
2234,4472-LVYGI,865,92408,"34.084909, -117.258107",34.084909,-117.258107,0,0,1,0,0,0,2,0,1,0,1,1,1,0,1,1,2,52.55,,0
5104,2520-SGTTA,71,95005,"37.078873, -122.090386",37.078873,-122.090386,0,0,1,1,0,1,0,2,2,2,2,2,2,2,1,0,0,20.0,,0
5719,2923-ARZLG,501,91750,"34.144703, -117.770299",34.144703,-117.770299,1,0,1,1,0,1,0,2,2,2,2,2,2,2,2,1,0,19.7,,0
2438,3115-CZMZD,450,93526,"36.869584, -118.189241",36.869584,-118.189241,1,0,0,0,0,1,0,2,2,2,2,2,2,2,1,0,0,20.25,,0
2568,5709-LVOEQ,888,94401,"37.590421, -122.306467",37.590421,-122.306467,0,0,1,0,0,1,0,0,1,1,1,0,1,1,1,0,0,80.85,,0
