In [2]:
import pandas as pd 
from sklearn.cluster import KMeans

In [3]:
# Read the Telco customer churn CSV into a DataFrame
churn_csv_path = '../data/telco_customer_churn.csv'
customer_data = pd.read_csv(churn_csv_path)

In [5]:
# Preview the first rows of the loaded frame to sanity-check the columns and values
customer_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# (rows, columns) of the loaded frame
customer_data.shape

(7043, 21)

In [7]:
# List every column name available in the dataset
customer_data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [14]:
# Count NaN/None entries per column.
# NOTE(review): this only detects real NaN/None values; blank or otherwise
# non-numeric strings in a text column would not be counted here.
missing_per_column = customer_data.isna().sum()
print(missing_per_column)

# Show the distinct raw values stored in TotalCharges
total_charges_values = customer_data['TotalCharges'].unique()
print(total_charges_values)


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']


In [36]:
# Count missing values per column (isnull() is pandas' alias for isna())
customer_data.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [37]:
# Count missing (NaN/None) values in each column
customer_data.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [10]:
# Keep only the three columns that will be used as clustering features
clustering_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
customer_features = customer_data[clustering_columns]
customer_features

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,1,29.85,29.85
1,34,56.95,1889.5
2,2,53.85,108.15
3,45,42.30,1840.75
4,2,70.70,151.65
...,...,...,...
7038,24,84.80,1990.5
7039,72,103.20,7362.9
7040,11,29.60,346.45
7041,4,74.40,306.6


In [12]:
# Count missing values in each of the selected feature columns
customer_features.isnull().sum()

tenure            0
MonthlyCharges    0
TotalCharges      0
dtype: int64

In [16]:
# Print the distinct values of each candidate feature column, in turn,
# to spot entries that are not plain numbers
for column_name in ('tenure', 'MonthlyCharges', 'TotalCharges'):
    print(customer_data[column_name].unique())


[ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
[29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']


In [26]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
total_charges_numeric = pd.to_numeric(customer_data['TotalCharges'], errors='coerce')
customer_data['TotalCharges'] = total_charges_numeric

# Report how many TotalCharges entries are NaN after the conversion
n_missing_total_charges = customer_data['TotalCharges'].isna().sum()
print(n_missing_total_charges)


11


In [27]:
# Impute missing TotalCharges values with the column mean
# (Series.mean() skips NaN by default, so the mean is taken over the known values).
mean_total_charges = customer_data['TotalCharges'].mean()

# Assign the filled column back to the frame rather than calling
# fillna(..., inplace=True) on the customer_data['TotalCharges'] selection:
# that chained in-place form raises a FutureWarning in pandas 2.x and leaves
# customer_data unchanged once Copy-on-Write is enabled.
customer_data['TotalCharges'] = customer_data['TotalCharges'].fillna(mean_total_charges)

# Confirm that no missing values remain in TotalCharges
print(customer_data['TotalCharges'].isna().sum())


0


In [19]:
# Parse tenure as numbers; entries that cannot be parsed become NaN
tenure_numeric = pd.to_numeric(customer_data['tenure'], errors='coerce')
customer_data['tenure'] = tenure_numeric

# Report how many tenure entries are NaN after the conversion
n_missing_tenure = customer_data['tenure'].isna().sum()
print(n_missing_tenure)

0


In [28]:
# Parse MonthlyCharges as numbers; entries that cannot be parsed become NaN
monthly_charges_numeric = pd.to_numeric(customer_data['MonthlyCharges'], errors='coerce')
customer_data['MonthlyCharges'] = monthly_charges_numeric

# Report how many MonthlyCharges entries are NaN after the conversion
n_missing_monthly_charges = customer_data['MonthlyCharges'].isna().sum()
print(n_missing_monthly_charges)

0


In [30]:
# Show the dtype currently held by the TotalCharges column
total_charges_dtype = customer_data['TotalCharges'].dtype
print(total_charges_dtype)


float64


In [31]:
# Run the numeric conversion on TotalCharges again (unparseable entries -> NaN).
# NOTE(review): the column was already coerced with pd.to_numeric earlier in the
# notebook, so this pass looks redundant — confirm before removing.
total_charges_as_numbers = pd.to_numeric(customer_data['TotalCharges'], errors='coerce')
customer_data['TotalCharges'] = total_charges_as_numbers


In [42]:
# Report whether the TotalCharges column of customer_features is stored as
# Python objects (i.e. strings) rather than as a numeric dtype.
# NOTE(review): customer_features was sliced from customer_data before the
# numeric conversion, so at this point it may still hold the original strings
# even though customer_data['TotalCharges'] has been converted.
total_charges_is_object = customer_features['TotalCharges'].dtype == 'object'
print(
    'TotalCharges contains string values'
    if total_charges_is_object
    else 'TotalCharges does not contain string values'
)

TotalCharges contains string values


In [None]:
# customer_data = customer_data.replace('', 0)


In [43]:
# Re-select the clustering features from the current state of customer_data
customer_features = customer_data.loc[:, ['tenure', 'MonthlyCharges', 'TotalCharges']]
customer_features

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,1,29.85,29.85
1,34,56.95,1889.50
2,2,53.85,108.15
3,45,42.30,1840.75
4,2,70.70,151.65
...,...,...,...
7038,24,84.80,1990.50
7039,72,103.20,7362.90
7040,11,29.60,346.45
7041,4,74.40,306.60


In [44]:
# Build a k-means model with 3 clusters and fit it on every column of customer_features.
# - random_state pins the centroid initialisation so the cluster labels (0/1/2)
#   and cluster sizes are reproducible across kernel restarts.
# - n_init is set explicitly so the number of restarts does not depend on the
#   installed scikit-learn version (the default changed from 10 to 'auto').
# NOTE(review): the features are fitted unscaled; if their ranges differ a lot
# (e.g. tenure vs. TotalCharges) the largest-valued column will dominate the
# Euclidean distance — consider standardising before clustering.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(customer_features)



In [50]:
# Assign each row of customer_features to a cluster of the fitted model.
# The labels are derived from all columns of customer_features, not from
# MonthlyCharges alone, despite the variable name.
monthly_charges_cluster = kmeans.predict(customer_features)


In [51]:
# Pair each customer's MonthlyCharges with its cluster label in month_group.
# pd.concat(axis=1) aligns rows by index label, so the label Series is given
# customer_data's index explicitly. A bare pd.Series(array) would get a fresh
# 0..n-1 index and silently misalign (introducing NaNs) if customer_data ever
# carries a non-default index, e.g. after rows are dropped. A length mismatch
# now raises immediately instead of being padded with NaN.
cluster_labels = pd.Series(monthly_charges_cluster, index=customer_data.index)
month_group = pd.concat([customer_data['MonthlyCharges'], cluster_labels], axis=1)
month_group.columns = ['MonthlyCharges', 'Cluster']



In [52]:
# Re-wrap month_group in a DataFrame.
# NOTE(review): month_group comes from pd.concat(..., axis=1) and has its
# columns set just before this step, so it appears to be a DataFrame already
# and this call looks redundant — confirm before removing.
month_group = pd.DataFrame(month_group)



In [53]:
# Split the MonthlyCharges values by the cluster label assigned to each row
cluster_labels = month_group['Cluster']
cluster_0 = month_group.loc[cluster_labels == 0, 'MonthlyCharges']
cluster_1 = month_group.loc[cluster_labels == 1, 'MonthlyCharges']
cluster_2 = month_group.loc[cluster_labels == 2, 'MonthlyCharges']

In [54]:
# Show the MonthlyCharges values of the rows labelled as cluster 0
cluster_0

0       29.85
1       56.95
2       53.85
3       42.30
4       70.70
        ...  
7035    78.70
7036    60.65
7037    21.15
7040    29.60
7041    74.40
Name: MonthlyCharges, Length: 4157, dtype: float64

In [55]:
# Show the MonthlyCharges values of the rows labelled as cluster 1
cluster_1

12      100.35
13      103.70
15      113.25
17      106.70
28       90.25
         ...  
7022    104.95
7023    103.50
7034    102.95
7039    103.20
7042    105.65
Name: MonthlyCharges, Length: 1260, dtype: float64

In [56]:
# Show the MonthlyCharges values of the rows labelled as cluster 2
cluster_2


8       104.80
9        56.15
14      105.50
23       59.90
24       59.60
         ...  
7024     84.80
7028     64.10
7031     60.00
7033     69.50
7038     84.80
Name: MonthlyCharges, Length: 1626, dtype: float64