In [1]:
# Import core libraries
import numpy as np
import pandas as pd 
import os

In [2]:
# Import package
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
import matplotlib.cm as cm

import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder               # conversion
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.ensemble import RandomForestClassifier          # Random Forest
from sklearn.svm import SVC, LinearSVC                       # SVC
from sklearn.linear_model import LogisticRegression          # Logic Regression
from sklearn.neighbors import KNeighborsClassifier           # KNN
from sklearn.naive_bayes import GaussianNB                   # Naive Bayes
from sklearn.tree import DecisionTreeClassifier              # Decision Tree
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier     

from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import VotingClassifier

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import warnings
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
# `run_line_magic` is the supported replacement for the deprecated `magic()` helper.
get_ipython().run_line_magic('matplotlib', 'inline')


ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Read the Telco customer-churn CSV.
# The location can be overridden with the TELCO_CHURN_CSV environment variable;
# the default is the original path, so existing setups keep working unchanged.
DATA_PATH = os.environ.get("TELCO_CHURN_CSV", r"F:\data\WA_Fn-UseC_-Telco-Customer-Churn.csv")
telcom = pd.read_csv(DATA_PATH)

# A cell only renders its final expression, so the earlier inspections are
# shown explicitly (`display` is provided by the IPython kernel).
display(telcom.head(10))
print("Shape (rows, columns):", telcom.shape)
telcom.describe()

In [None]:
# Data Clean

# Missing values per column, class balance of the target, and a dtype overview.
# The first two results are printed explicitly: a cell renders only its final
# expression, so bare expressions in the middle of a cell are discarded.
print(telcom.isnull().sum())
print(telcom["Churn"].value_counts())
telcom.info()

# Force TotalCharges to a numeric dtype (strings holding numbers are parsed as
# well); values that cannot be converted become NaN.
# `Series.convert_objects` no longer exists in current pandas; `pd.to_numeric`
# with errors='coerce' gives the same "unparseable -> NaN" behaviour.
telcom['TotalCharges'] = pd.to_numeric(telcom['TotalCharges'], errors='coerce')
print(telcom["TotalCharges"].dtypes)

# Check missing values again, this time for TotalCharges only
print("Missing TotalCharges:", telcom["TotalCharges"].isnull().sum())
# Delete every row that contains a missing value in any column.
# Rebinding (instead of inplace=True) keeps the step explicit.
telcom = telcom.dropna()
print("Shape after dropping rows with missing values:", telcom.shape)


# Norms
# Encode the target as integers: Yes=1, No=0.
# The result is assigned back to the frame. Calling replace(inplace=True) on
# the selected column acts on a temporary object and leaves `telcom` unchanged
# when pandas Copy-on-Write is active. The original also did the encoding twice.
# Values that are already 0/1 pass through untouched, so re-running is harmless;
# any unexpected label makes astype(int) fail loudly rather than slip through.
churn_codes = {'Yes': 1, 'No': 0}
telcom['Churn'] = telcom['Churn'].map(lambda label: churn_codes.get(label, label)).astype(int)
telcom['Churn'].head()

In [None]:
#Data Visualization
# Check churn customer proportion
"""
Parameters for pie chart:
labels        - Explanation text displayed outside each slice
explode       - Distance from the center for each slice
startangle    - Starting angle for drawing; default is from x-axis positive direction counterclockwise. Setting to 90 starts from y-axis positive.
shadow        - Whether to add shadow
labeldistance - Label position relative to radius, if <1, displays inside pie chart
autopct       - Controls the format of percentage labels, e.g., '%1.1f' indicates one decimal place
pctdistance   - Position scale for autopct
radius        - Controls pie chart radius
"""
# Customers per churn class; the class values also serve as the slice labels
churnvalue = telcom["Churn"].value_counts()
labels = churnvalue.index

# This changes the global default figure size, not only this figure's
rcParams["figure.figsize"] = (6, 6)
plt.pie(
    churnvalue,
    labels=labels,
    colors=["whitesmoke", "yellow"],
    explode=(0.1, 0),
    autopct='%1.1f%%',
    shadow=True,
)
plt.title("Proportions of Customer Churn")
plt.show()

In [None]:

# Effects of gender, senior citizen, partner, dependents on churn rate
# Each entry: (column to count, x-axis label, panel title)
demographic_panels = [
    ("gender", "gender", "Churn by Gender"),
    ("SeniorCitizen", "senior citizen", "Churn by Senior Citizen"),
    ("Partner", "partner", "Churn by Partner"),
    ("Dependents", "dependents", "Churn by Dependents"),
]

f, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,10))

drawn_panels = []
for panel_position, (panel_column, panel_xlabel, panel_title) in enumerate(demographic_panels, start=1):
    plt.subplot(2, 2, panel_position)
    # palette sets the colours; Pastel2 is the theme used for all four panels
    drawn_panels.append(
        sns.countplot(x=panel_column, hue="Churn", data=telcom, palette="Pastel2")
    )
    plt.xlabel(panel_xlabel)
    plt.title(panel_title)

# Keep the per-panel names the original cell defined
gender, seniorcitizen, partner, dependents = drawn_panels


In [None]:

# Feature extraction: take the columns at positions 1 through 19
charges = telcom.iloc[:, 1:20]

# Encode features.
# Encoding for discrete features:
#   1. If values have no size meaning (e.g., color: [red, blue]), use one-hot encoding
#   2. If values have size meaning (e.g., size: [X, XL, XXL]), map to numbers {X:1, XL:2, XXL:3}
# Here every column is replaced by the integer codes that pd.factorize assigns
# to its distinct values.
corrDf = charges.apply(lambda feature_column: pd.factorize(feature_column)[0])
corrDf.head()

In [None]:
# Construct the correlation matrix of the encoded features
# (Pearson, which is also what DataFrame.corr uses when no method is given)
corr = corrDf.corr(method="pearson")
corr

In [None]:
# Display correlation matrix with heatmap
#   heatmap    - displays the coefficient matrix as a heatmap
#   linewidths - gap size between cells
#   annot      - whether to display the coefficient value in each cell
plt.figure(figsize=(20, 16))
feature_names = corr.columns
ax = sns.heatmap(
    corr,
    xticklabels=feature_names,
    yticklabels=feature_names,
    linewidths=0.2,
    cmap="YlGnBu",
    annot=True,
)
plt.title("Correlation between variables")
# Conclusion: From the above, there is a strong correlation between internet services, network security services, online backup services, device protection services, technical support services, streaming TV, and streaming movies. Multi-line business and phone service are also strongly correlated, and all show strong positive correlations.

In [None]:
# Apply one-hot encoding to the columns at positions 1 through 20
columns_to_encode = telcom.iloc[:, 1:21]
tel_dummies = pd.get_dummies(columns_to_encode)
tel_dummies.head()

In [None]:
# Correlation between telecom churn and various variables, strongest positive first
plt.figure(figsize=(15, 8))
churn_correlations = tel_dummies.corr()['Churn']
churn_correlations.sort_values(ascending=False).plot(kind='bar')
plt.title("Correlations between Churn and variables")


# From the chart, it can be seen that variables 'gender' and 'PhoneService' are close to zero in correlation, so they have minimal effect on predicting telecom churn and can be discarded.


In [None]:
# Impact of services on churn rate
covariables = ["OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16,10))
# Draw one count plot per service directly on the axes created above.
# The original re-selected each panel with plt.subplot and also incremented
# the enumerate() counter by hand, which had no effect.
for ax, item in zip(axes.flat, covariables):
    sns.countplot(x=item, hue="Churn", data=telcom, palette="Pastel2",
                  order=["Yes","No","No internet service"], ax=ax)
    ax.set_xlabel(item)
    ax.set_title("Churn by " + item)
plt.show()


In [None]:
# Impact of contract type on churn rate
# (bar height is the mean of the 0/1 Churn column per contract type)
contract_order = ['Month-to-month', 'One year', 'Two year']
sns.barplot(data=telcom, x="Contract", y="Churn", order=contract_order, palette="Pastel1")
plt.title("Churn by Contract type")


In [None]:
# Impact of payment method on churn rate
payment_order = [
    'Bank transfer (automatic)',
    'Credit card (automatic)',
    'Electronic check',
    'Mailed check',
]
plt.figure(figsize=(10, 5))
sns.barplot(data=telcom, x="PaymentMethod", y="Churn", order=payment_order, palette="Pastel1")
plt.title("Churn by PaymentMethod type")


In [None]:
# ## 5. Data Preprocessing

# As observed, 'CustomerID' is a random character string representing each customer and has no impact on modeling, so we choose to drop this column; 'gender' and 'PhoneService' have low correlation with churn rate, so we can ignore them.

# Take an explicit, independent copy of the selected feature columns.
# The later cells overwrite columns of `telcomvar` (scaling, value replacement,
# label encoding); working on a bare iloc slice triggers SettingWithCopy
# warnings and risks writing through to `telcom`.
telcomvar = telcom.iloc[:, 2:20].drop(columns="PhoneService").copy()

# Extract ID
telcom_id = telcom['customerID']

telcomvar.head()

In [None]:
# Standardize tenure, MonthlyCharges, and TotalCharges to mean of 0 and variance of 1.
# Standardizing ensures each feature has a variance of 1 and mean of 0, so
# prediction results are not dominated by features with large values.
#
# Only the scaler's statistics are learned here; the following cell applies
# them with transform(). The original used StandardScaler(copy=False) together
# with fit_transform() and threw the result away. With copy=False the input may
# be scaled in place, in which case the later transform() would scale the same
# values a second time.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(telcomvar[['tenure', 'MonthlyCharges', 'TotalCharges']])


In [None]:
# transform() centers and scales the data with the statistics held by `scaler`.
# Note: running this cell a second time would scale the already-scaled values again.
numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaled_values = scaler.transform(telcomvar[numeric_columns])
telcomvar[numeric_columns] = scaled_values

In [None]:
# Use a boxplot to check the three standardized numeric columns for outliers
plt.figure(figsize=(8, 4))
boxplot_data = telcomvar[['tenure', 'MonthlyCharges', 'TotalCharges']]
numbox = sns.boxplot(data=boxplot_data, palette="Set2")
plt.title("Check outliers of standardized tenure, MonthlyCharges, and TotalCharges")

In [None]:
# Check the values in object type fields
def uni(columnlabel):
    """Print the column name followed by the distinct values of that column in `telcomvar`."""
    distinct_values = telcomvar[columnlabel].unique()  # duplicates removed, each value listed once
    print(columnlabel, "--", distinct_values)

# Columns currently stored with object dtype; print the distinct values of each
telcomobject = telcomvar.select_dtypes(['object'])
for column_name in telcomobject.columns:
    uni(column_name)


# Based on previous results, there is a "No internet service" in six variables. Customers who do not use any internet products have a low churn rate, so "No internet service" can be treated the same as "No." Therefore, "No internet service" can be replaced with "No."

In [None]:
# Fold the two "no ... service" labels into a plain "No", then re-print the
# distinct values of the columns recorded in `telcomobject`
for absent_service_label in ('No internet service', 'No phone service'):
    telcomvar.replace(to_replace=absent_service_label, value='No', inplace=True)
for column_name in telcomobject.columns:
    uni(column_name)

In [None]:

# Use Scikit-learn's label encoding to convert categorical data into integer encoding
def labelencode(columnlabel):
    """Overwrite one column of `telcomvar` with its LabelEncoder integer codes.

    A fresh encoder is fitted for every call, so each column gets its own mapping.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(telcomvar[columnlabel])
    telcomvar[columnlabel] = encoded_values

# Encode every column recorded in `telcomobject`, then print the resulting codes
object_column_names = list(telcomobject.columns)

for column_name in object_column_names:
    labelencode(column_name)

for column_name in object_column_names:
    uni(column_name)

In [None]:
# ## 6. Model Building

# ### (1) Create Training and Testing Datasets
# We need to split the dataset into training and testing sets for validation.
# Since our dataset is unbalanced, it is best to use stratified cross-validation
# so that both the training and testing sets retain the proportions of each class.
# StratifiedShuffleSplit randomly splits the samples into training and testing
# sets based on the given ratio.
#   n_splits               - number of train/test splits to generate (default 10)
#   test_size / train_size - proportions of the test / train parts
#   random_state           - controls the random shuffling of samples

# Features and target
X = telcomvar
y = telcom["Churn"].values

sss = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=0,
)
print(sss)
split_count = sss.get_n_splits(X, y)
print("Number of splits for training and testing data:", split_count)

In [None]:
# Create training and testing sets.
# Every iteration overwrites the previous assignment, so the sets that remain
# after the loop come from the final split yielded by `sss`.
for train_index, test_index in sss.split(X, y):
    print("train:", train_index, "test:", test_index)
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

In [None]:
# Output the size of the datasets: one line for the features, one for the labels
feature_report = [
    'Original data features:', X.shape,
    'Training data features:', X_train.shape,
    'Testing data features:', X_test.shape,
]
print(*feature_report)

label_report = [
    'Original data labels:', y.shape,
    '   Training data labels:', y_train.shape,
    '   Testing data labels:', y_test.shape,
]
print(*label_report)

In [None]:
# ### (2) Select Machine Learning Algorithms
# Use classification algorithms, selecting 10 different classifiers here.
# Estimators that accept a seed are given a fixed one so the scores can be
# reproduced from run to run; the same value is used for the train/test splitter.
# The remaining estimators are left at their defaults.
RANDOM_STATE = 0
Classifiers=[["Random Forest",RandomForestClassifier(random_state=RANDOM_STATE)],
             ["Support Vector Machine",SVC()],
             ["LogisticRegression",LogisticRegression()],
             ["KNN",KNeighborsClassifier(n_neighbors=5)],
             ["Naive Bayes",GaussianNB()],
             ["Decision Tree",DecisionTreeClassifier(random_state=RANDOM_STATE)],
             ["AdaBoostClassifier", AdaBoostClassifier(random_state=RANDOM_STATE)],
             ["GradientBoostingClassifier", GradientBoostingClassifier(random_state=RANDOM_STATE)],
             ["XGB", XGBClassifier(random_state=RANDOM_STATE)],
             ["CatBoost", CatBoostClassifier(logging_level='Silent', random_seed=RANDOM_STATE)]
]

In [None]:
# ### (3) Train Models
# Fit every classifier on the training split, predict the test split and
# collect recall, precision and F1 per model.
#   Classify_result - one single-column DataFrame per model, rows in the order
#                     recall, precision, f1score
#   names           - model names (each wrapped in a Series, as the evaluation
#                     cell expects)
#   prediction      - test-set predictions per model
Classify_result=[]
names=[]
prediction=[]
for name, classifier in Classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    # The evaluation cell labels three rows (recall, precision, f1score).
    # F1 was never computed, so that index assignment failed with a
    # length-mismatch ValueError.
    f1score = f1_score(y_test, y_pred)
    class_eva = pd.DataFrame([recall, precision, f1score])
    Classify_result.append(class_eva)
    names.append(pd.Series(name))
    prediction.append(pd.Series(y_pred))

In [None]:
# ### (4) Evaluate Models

# Evaluate models
"""
Recall: the proportion of true positives among all actual positives (higher values are better, with 1 being ideal)
Precision: the proportion of true positives among all predicted positives (higher values are better, with 1 being ideal)
F1-Score: a metric that combines Precision and Recall into a single score
F1-Score ranges from 0 to 1, where 1 represents the best possible model and 0 represents the worst.
"""

# Flatten the collected name Series into a plain list of model names
names = pd.DataFrame(names)[0].tolist()

# One column per model, one row per metric
result = pd.concat(Classify_result, axis=1)
result.columns = names
result.index = ["recall", "precision", "f1score"]
result


# Conclusion: Among the 10 classification algorithms, Naive Bayes achieved the highest F1-Score of 63.31%, making it the best-performing model.


In [None]:
# ## 7. Implementation Plan

# No separate prediction dataset is provided, so the last 10 rows of the
# prepared features stand in for it, together with their customer IDs.
pred_X = telcomvar.tail(10)
pre_id = telcom_id.tail(10)

# Fit a Naive Bayes model on the training split and predict churn for those rows
model = GaussianNB()
model.fit(X_train, y_train)
pred_y = model.predict(pred_X)

# Prediction results: customer ID next to the predicted churn flag
predDf = pd.DataFrame(dict(customerID=pre_id, Churn=pred_y))
predDf