# Swanalytics - Swan Telecom 

Using Decision Trees and Random Forest to make predictions on customers most likely to churn 

In [None]:
# imports
# Core libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from time import time  # For timing model runs

# Sklearn - model selection & evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn import metrics  # Optional if you're using things like metrics.classification_report

# Decision Tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import tree

# Random Forest & Extra Trees
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import ExtraTreesClassifier as ET
from sklearn.model_selection import StratifiedKFold

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/gdiwa23/Swanalytics/refs/heads/main/1%20Project%20Data%20-%20Telco_Churn.csv')

### first look at data

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.dtypes

In [None]:
df.isnull().sum() / len(df) * 100

In [None]:
df.columns

*  After a first look at the data, the following actions must be taken to clean up the df, in preparation for train test splitting

In [None]:
df = df.drop(columns=["Count","City","Country","State","Lat Long","Churn Label","Churn Reason"]) # drop columns we don't want to use 

In [None]:
# Total charges needs to be cast to float

df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce') # casts to float
df['Total Charges'] = df['Total Charges'].fillna(0) # found NaN values after casting -> fill with 0
df['Total Charges'].isnull().sum() # check = 0 nulls

In [None]:
df.set_index('CustomerID', inplace=True) # set as index, checked all are unique first; in-place, so re-running this cell raises KeyError because the column is already gone

### Train test split

In [None]:
# set feature columns
feature_cols = df.columns.drop('Churn Value')
#feature_cols

In [None]:
# set X and y 
X = df[feature_cols].copy()  # features
y = df['Churn Value']   

In [None]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    random_state = 1)

In [None]:
# sanity checks
print(f'Train match: {len(X_train)==len(y_train)}')
print(f'Test match: {len(X_test)==len(y_test)}')

### Data Cleaning 

In [None]:
# func for cleaning
def clean_data(df):
  """Encode the raw Telco feature frame into an all-numeric frame.

  The input frame is never modified; a cleaned copy is returned.

  Steps:
    * One-hot encode 'Internet Service', 'Contract' and 'Payment Method'
      (prefixes 'Service', 'Contract', 'Payment_Method'). drop_first=True is
      used, so the first category level of each column gets no dummy and is
      represented by 0 in all of that column's dummies.
    * Map the Yes/No style columns to 1/0. 'No phone service' and
      'No internet service' are folded into 0. Values that are already 0/1
      pass through unchanged; any other value becomes NaN.

  Parameters
  ----------
  df : pd.DataFrame
      Feature frame containing the columns referenced below.

  Returns
  -------
  pd.DataFrame
      Encoded copy of ``df``; same rows and index, new dummy columns appended.

  Raises
  ------
  AssertionError
      If ``df`` is not a DataFrame.
  KeyError
      If one of the expected columns is missing.
  """

  assert isinstance(df, pd.DataFrame), 'Parameter needs to be a DataFrame'

  df_clean = df.copy() # df is X

  # OHE the service column. With drop_first=True the first level has no
  # column of its own, so 0 in every Service_ column means "the dropped level".
  # NOTE(review): for levels 'DSL', 'Fiber optic', 'No' the dropped level is
  # 'DSL' (not 'No') - confirm against the labels in the CSV.
  service_dummies = pd.get_dummies(df_clean['Internet Service'], prefix='Service', drop_first=True, dtype=int)
  df_clean = pd.concat([df_clean, service_dummies], axis=1)
  # Drop from df_clean, NOT from the original df: dropping from df would throw
  # away the Service_ dummies that were just concatenated.
  df_clean = df_clean.drop(columns=['Internet Service'])

  df_clean = pd.get_dummies(df_clean, columns=['Contract'], drop_first=True, prefix='Contract', dtype=int)
  df_clean = pd.get_dummies(df_clean, columns=['Payment Method'], drop_first=True, prefix='Payment_Method', dtype=int)

  # Label Encoding
  # The 0:0 / 1:1 entries let already-encoded values pass through unchanged.
  yes_no = {'No': 0, 'Yes': 1, 0: 0, 1: 1}
  yes_no_phone = {'No': 0, 'Yes': 1, 'No phone service': 0, 0: 0, 1: 1}
  yes_no_internet = {'No': 0, 'Yes': 1, 'No internet service': 0, 0: 0, 1: 1}

  df_clean['Gender'] = df_clean['Gender'].map({'Female': 0, 'Male': 1, 0: 0, 1: 1})

  for col in ['Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Paperless Billing']:
    df_clean[col] = df_clean[col].map(yes_no)

  df_clean['Multiple Lines'] = df_clean['Multiple Lines'].map(yes_no_phone)

  # Internet add-ons and streaming: 'No internet service' is combined with 'No'
  for col in ['Online Security', 'Online Backup', 'Device Protection',
              'Tech Support', 'Streaming TV', 'Streaming Movies']:
    df_clean[col] = df_clean[col].map(yes_no_internet)

  return df_clean

#### clean the data using the function

In [None]:
X_train_fe = clean_data(X_train)

In [None]:
X_test_fe = clean_data(X_test)

In [None]:
## no churn DATA TO PREDICT ON 
df_nochurn = df[df['Churn Value'] ==  0]
df_nochurn = df_nochurn.drop(columns=["Churn Value"])

In [None]:
df_nochurn = df_nochurn[feature_cols]
df_nochurn_fe = clean_data(df_nochurn)

In [None]:
# sanity checks - just to make sure all good

print(f'Train post split match: {len(X_train_fe)==len(y_train)}')
print(f'Test post split match: {len(X_test_fe)==len(y_test)}\n')

print(f'Train post data cleaning match: {X_train_fe.shape[0] == X_train.shape[0]}')
print(f'Test post data cleaning match: {X_test_fe.shape[0] == X_test.shape[0]}\n')

print(f'Churn data post cleaning match: {df_nochurn_fe.shape[0]==df_nochurn.shape[0]}')


### Decision Tree + Grid Search

* Start with the initial decision tree and use as benchmark.

* The decision tree achieves a solid 79% accuracy on the training set, but deeper analysis reveals that while precision is acceptable at 68%, recall is significantly lower at 42%. This means the model is usually right when it predicts churn, BUT it misses a large portion of actual churners. Improving recall and precision is key to maximising the model's ability to predict churners and to optimising the Marketing Team's efforts in targeting them.
* With a limited slot of just Top 500 to deliver to the Marketing Team, we need to focus on Precision and Recall as metrics to ensure that the model captures customers who are most likely to churn.
* This leads into the use of grid search to optimise parameters


> Decision Tree Key Metrics: Train Set  
> 
> Accuracy:   0.7903798367057153  
> Precision:  0.6801705756929638  
> Recall:     0.42001316655694537  
> F1:         0.5193325193325193

In [None]:
# first decision tree
dt = DecisionTreeClassifier(max_depth=4, # initially 3, but changed to 4 and saw improvement
                            random_state=2)
dt.fit(X_train_fe, y_train) #fit data to model

In [None]:
# plot the first tree
fig = plt.figure(figsize=(15,10))
# class_names must be listed in ascending order of the target values:
# Churn Value 0 = stayed, 1 = churned. Listing them the other way round
# labels every leaf with the opposite class.
tree_1 = tree.plot_tree(dt,
                   feature_names=list(X_train_fe.columns),  # plain list of the fitted column names
                   class_names=['Stayed', 'Churned'],
                   filled=True)
plt.show()

In [None]:
## accuracy score ## 0.79 and 0.79 , not bad
print(f'Score on training set: {dt.score(X_train_fe, y_train)}')
print(f'Score on testing set: {dt.score(X_test_fe, y_test)}')

In [None]:
def apr(y_real, y_pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    y_real : array-like
        The true labels. This is the FIRST argument.
    y_pred : array-like
        The predicted labels. This is the SECOND argument.

    Returns
    -------
    None
        The four scores are printed, not returned.
    """
    # Compute every score before printing so that a failure in any metric
    # leaves no partial output behind.
    scores = {
        "Accuracy:   ": metrics.accuracy_score(y_real, y_pred),
        "Precision:  ": metrics.precision_score(y_real, y_pred),
        "Recall:     ": metrics.recall_score(y_real, y_pred),
        "F1:         ": metrics.f1_score(y_real, y_pred),
    }

    for label, value in scores.items():
        print(f"{label}{value}")

In [None]:
# Decision tree 1 key metrics 

y_pred_dt = dt.predict(X_train_fe) # add y_pred col # decision tree pred
print('Decision Tree Key Metrics: Train Set\n')
apr(y_train, y_pred_dt)

#### Grid Search with DT
* To improve on the initial results of the decision tree, we use grid search to optimise the hyperparameters used in the model
* cv = 5 means that the grid search cross-validates every parameter combination across 5 folds of the training data
* grid search gives us `.best_estimator_`, the tree refit with the best-scoring parameters, which is the model evaluated below

* The grid-searched Decision Tree achieves an accuracy of 80.7% on the training set, indicating solid overall performance. With a precision of 63.8%, roughly two out of three customers it flags as churners actually churn. A recall of 65.1% shows the model captures a good portion of actual churn cases, better than the initial decision tree (42%). The F1 score of 64.5% reflects a balanced trade-off, making this model suitable for churn detection with moderate risk tolerance.

>  Decision Tree (+GridSearch) Key Metrics: Train Set
> * Accuracy:   0.8065317713880015
> * Precision:  0.6384764364105875
> * Recall:     0.6510862409479921
> * F1:         0.6447196870925684

In [None]:
# grid search over decision tree hyperparameters, scored on recall
grid = GridSearchCV(estimator = DecisionTreeClassifier(random_state=2),
                                   param_grid = {'max_depth': [5, 7, 10],                   # candidate tree depths
                                  'min_samples_split': [10, 50, 100, 150, 200], # candidate minimum samples needed to split a node
                                  'min_samples_leaf': [2, 3, 4, 5, 6, 7],        # candidate minimum samples per leaf
                                  'max_features':[2, 3, 4, 5, 6, 7, 8, 10, 12]},      # number of features considered at each split
                    cv = 5,              # 5-fold cross validation for every parameter combination
                    refit = True,         # after the search, retrain the best parameter combination on the full training data
                    verbose = 1,          # how much we get told about what went on in the gridsearch
                    scoring = 'recall') # rank parameter combinations by recall

In [None]:
# timer, to see how long model takes to finish
now = time()

# fit the model
grid.fit(X_train_fe, y_train)     # time between pressing run and model finishing
print(f' Time in seconds: {time() - now}')

In [None]:
# the best version of the DT according to gridsearch -> grid.best_estimator_

# The grid search was created with refit=True, so best_estimator_ has already
# been retrained on the whole of X_train_fe / y_train using the winning
# parameters. Calling .fit() on it again would only repeat that work.
dt_best_estimator = grid.best_estimator_
dt_best_estimator  # display the selected estimator and its hyperparameters

In [None]:
# dt_best_estimator tree plot

fig = plt.figure(figsize=(15,10))
# class_names must be listed in ascending order of the target values:
# Churn Value 0 = stayed, 1 = churned.
thing = tree.plot_tree(dt_best_estimator,
                   feature_names=list(X_train_fe.columns),  # plain list of the fitted column names
                   class_names=['Stayed', 'Churned'],
                   filled=True)
plt.show()

In [None]:
# score for training and test

print(f'Score on training set: {dt_best_estimator.score(X_train_fe, y_train)}')
print(f'Score on testing set: {dt_best_estimator.score(X_test_fe, y_test)}')

In [None]:
# Key metrics for Decision Tree + GridSearch

y_pred_dt_best_estimator = dt_best_estimator.predict(X_train_fe) # add y_pred col # decision tree pred
print('Decision Tree (+GridSearch) Key Metrics: Train Set\n')
apr(y_train, y_pred_dt_best_estimator)

#### DT feature importances

* Feature importances tell us which features had the most influence on the model's predictions.
> Top 10 Feature importances 
> * Monthly Charges: 0.2597
> * Payment Method: 0.1938
> * Dependents: 0.1410
> * Contract: 0.1328
> * Paperless Billing: 0.0932
> * Tenure Months: 0.0490
> * Streaming Movies: 0.0210
> * Device Protection: 0.0189
> * Latitude: 0.0177
> * Longitude: 0.0173

In [None]:
# view feature importance of the grid-searched decision tree

# Pair feature names with their importance values.
# feature_importances_ has one entry per column of the frame the model was
# fitted on (X_train_fe), in that column order. feature_cols holds the raw
# column names from BEFORE encoding, whose order and length differ, so zipping
# with it attaches scores to the wrong names.
importance = list(zip(X_train_fe.columns, dt_best_estimator.feature_importances_))

# Sort the list by importance in descending order
importance_sorted = sorted(importance, key=lambda x: x[1], reverse=True)

# Display
for feature, score in importance_sorted:
    print(f"{feature}: {score:.4f}")

## most important features according to the decision tree

In [None]:
importance_df = pd.DataFrame(importance_sorted, columns=["feature", "score"])
importance_df['score'] = round(importance_df['score'] * 100, 2)

In [None]:
# Plot only the ten most important features. importance_df is built from the
# descending-sorted list, so its first ten rows are the top ten. Passing an
# explicit ten-row frame avoids relying on index alignment between a sliced x
# Series and a full-length y column.
top10_dt_df = importance_df.iloc[:10]

features_10_dt = sns.barplot(
    data=top10_dt_df,
    x='feature',
    y='score',
    color='darkorange'
)

for spine in features_10_dt.spines.values(): # spine is border of plot
    spine.set_color('grey')
    spine.set_linewidth(1)

plt.xticks(rotation=45, color='grey')        # x-axis tick labels grey
plt.yticks(color='grey')                      # y-axis tick labels grey
plt.xlabel('Top Features', fontsize=12, color='grey')   # x-label grey
plt.ylabel('Score %', fontsize=12, color='grey')        # y-label grey
plt.title('Top 10 Feature Importances DT + Grid search (%)', fontsize=14, color='grey')  # title grey

plt.tight_layout()
#plt.savefig('features_10_dt_.png', dpi=300, bbox_inches='tight', transparent=False) # transparent !
plt.show()

### Random Forest

* Ensemble of Decision Trees: Combines multiple decision trees to improve accuracy and control overfitting.
* Randomness: Uses bootstrap samples of data + random subsets of features at each split.

In [None]:
# initialise the model
rf = RF(n_estimators=100, max_depth=9, random_state=2) # 100 trees, each limited to a depth of 9; fixed seed for reproducibility

In [None]:
# fit the model
rf.fit(X_train_fe, y_train)

In [None]:
# get the accuracy score via 10-fold cross validation on the training set (cv=10)
rf_score = cross_val_score(rf, X_train_fe, y_train, cv=10)

print(f'The accuracy of RF is: {rf_score}\n') # one score per fold, 10 in total
print(f'The mean accuracy of RF is: {rf_score.mean()}') # the mean of the 10 folds

In [None]:
# Hold-out accuracy of the forest that was fitted on the training set.
# cross_val_score(rf, X_test_fe, y_test) would clone rf and retrain it on
# folds of the TEST data, so it never evaluates the trained model on unseen
# data. Scoring the fitted model directly does.
rf_test_score = rf.score(X_test_fe, y_test)
print(f'The test accuracy of RF is: {rf_test_score}')

### Grid Search on RF 
* Grid Search on Random Forest to systematically optimise hyperparameters and improve model performance for churn prediction
* Random Forest is robust, but its performance depends heavily on how its hyperparameters are tuned
* The main motivation for using Grid Search was to balance recall and precision whilst minimising overfit
* Added `class_weight='balanced'` to the grid search so that the smaller class is weighted up instead of being under-represented during training
  
#### The final version of RF + GS model
* The model aims to balance overfitting while maximizing F1 (harmonizing precision and recall).
* Tree depth was limited to reduce model complexity.
* min_samples_split and min_samples_leaf were increased to encourage generalization and reduce sensitivity to noise.
* max_features was restricted to control model greediness and curb overfitting.
* class_weight='balanced' ensures fair treatment of the smaller class.
* The scoring metric was switched from accuracy to F1 to better reflect the goal of optimizing both precision and recall.


In [None]:
# stratified k fold sampling 
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)

In [None]:
# Grid search over random forest hyperparameters, scored on F1.
gs = GridSearchCV(
    estimator=RF(random_state=2),
    param_grid={
            'n_estimators': [100, 200], #number of trees in the forest
            'max_depth': [7,8,9],                              # previously tried: [7, 9, 12]
            'min_samples_split': [30,40],                                        # previously tried: [20, 50, 100]
            'min_samples_leaf': [6,8],
            'max_features': [0.65], ## fraction of features considered per split; previously tried: [0.4, 0.6, 0.7]
            'class_weight': ['balanced'] # weight classes inversely to their frequency
    },
    cv=skf,             # the StratifiedKFold splitter defined above (10 shuffled splits)
    refit=True,         # Refits the best model on the whole training set
    verbose=1,          # Output the progress
    scoring='f1',       # changed to F1 , want to balance precision and recall , guide gs to prioritise this
    n_jobs=-1           # parallel running
)

In [None]:
now = time()
gs.fit(X_train_fe, y_train)
print(f' Time in seconds: {time() - now}')

In [None]:
print(gs.best_score_)
print(gs.best_params_)

In [None]:
# get scores

rf_best_estimator = gs.best_estimator_
print(f'RF train accuracy: {rf_best_estimator.score(X_train_fe, y_train)}')
# Use it to score on the testing set
print(f'RF test accuracy: {rf_best_estimator.score(X_test_fe, y_test)}')

## accuracy is better than the single decision tree (0.79, train), but suggests model is overfit on test
## 0.837 , 0.777, ## less over fit

In [None]:
# Key metrics for Random Forest + GridSearch on the train set
X_train_rf_results = X_train_fe.copy()


y_pred_rf_best_estimator = rf_best_estimator.predict(X_train_fe) # tuned random forest predictions for the train set

X_train_rf_results['y_pred'] = y_pred_rf_best_estimator # keep the predictions next to the encoded features

print('Random Forest (+GridSearch) Key Metrics: Train Set\n')
apr(y_train, y_pred_rf_best_estimator)


In [None]:
# Key metrics for Random Forest + GridSearch on the test set
X_test_rf_results = X_test.copy() # NOTE(review): copies the raw X_test, while the train cell copies the encoded X_train_fe - confirm which is intended
y_pred_rf_best_estimator = rf_best_estimator.predict(X_test_fe) # tuned random forest predictions for the test set (overwrites the train predictions held in this name)
X_test_rf_results['y_pred'] = y_pred_rf_best_estimator
print('Random Forest (+GridSearch) Key Metrics: Test Set\n')
apr(y_test, y_pred_rf_best_estimator)

#### feature importances

In [None]:
## various tuning results -- 

# Random Forest (+GridSearch) Key Metrics: Train Set
# 
# Accuracy:   0.8452254171104011
# Precision:  0.7943585077343039
# Recall:     0.5747202106649111
# F1:         0.6669213139801375

#Random Forest (+GridSearch) Key Metrics: Train Set

#Accuracy:   0.818778842740504
#Precision:  0.6163551401869158
#Recall:     0.8683344305464121
#F1:         0.7209620114785461

#Random Forest (+GridSearch) Key Metrics: Train Set max feat = 0.6

#Accuracy:   0.8315583954561591
#Precision:  0.6371511068334937
#Recall:     0.8716260697827518
#F1:         0.7361690297470114

#Random Forest (+GridSearch) Key Metrics: Train Set max feat = 0.75 -- v overfit tho .. 
#
#Accuracy:   0.8358182463613774
#Precision:  0.6426512968299711
#Recall:     0.8808426596445029
#F1:         0.7431269091918912

#Random Forest (+GridSearch) Key Metrics: Train Set max feat = 0.7 -- v overfit tho .. 
#
#Accuracy:   0.8374156904508342
#Precision:  0.6440516005733397
#Recall:     0.8874259381171824
#F1:         0.7464008859357697

In [None]:
# view feature importance of the grid-searched random forest

# Pair feature names with their importance values.
# feature_importances_ has one entry per column of the frame the model was
# fitted on (X_train_fe), in that column order. feature_cols holds the raw
# column names from BEFORE encoding, whose order and length differ, so zipping
# with it attaches scores to the wrong names.
importance_rf = list(zip(X_train_fe.columns, rf_best_estimator.feature_importances_))

# Sort the list by importance in descending order
importance_sorted_rf = sorted(importance_rf, key=lambda x: x[1], reverse=True)

# Display
for feature, score in importance_sorted_rf:
    print(f"{feature}: {score:.4f}")

## most important features according to the random forest

In [None]:
# Importance scores as percentages, most important feature first
importance_rf_df = pd.DataFrame(importance_sorted_rf, columns=["feature", "score"])
importance_rf_df['score'] = round(importance_rf_df['score'] * 100, 2)

# Quick look at the top ten. An explicit ten-row frame is passed so x and y
# come from the same rows instead of relying on index alignment between a
# sliced x Series and a full-length y column.
sns.barplot(data=importance_rf_df.head(10), x='feature', y='score', color = 'powderblue')
plt.xticks(rotation=45)  # rotate x-axis labels so long feature names stay readable
plt.tight_layout()       # stop the rotated labels being clipped
plt.show()

In [None]:
## graph of top 10 features according to RF + grid search model !

top10_df = importance_rf_df.iloc[:10]

features_10_rf = sns.barplot(
    data=top10_df,
    x='feature',
    y='score',
    color='darkorange'
)

for spine in features_10_rf.spines.values():
    spine.set_color('grey')
    spine.set_linewidth(1) # grey plot boarder

plt.xticks(rotation=45, color='grey')        # x-axis tick labels grey
plt.yticks(color='grey')                      # y-axis tick labels grey
plt.xlabel('Top Features', fontsize=12, color='grey') # grey
plt.ylabel('Score %', fontsize=12, color='grey') # grey
plt.title('Top 10 Feature Importances RF + Grid search (%)', fontsize=14, color='grey') # grey

plt.tight_layout()
#plt.savefig('features_10_rf.png', dpi=300, bbox_inches='tight', transparent=False)
plt.show()

## Final Predictions using RF and Grid Search
* top 500 most likely to leave the service have been identified
* using a Random Forest model trained on X_train_fe, combined with Grid Search to optimise the hyperparameters





In [None]:
# copy df
rf_results = df_nochurn_fe.copy()
# predict on no churn
rf_results[['P No Churn','P Churn']] = rf_best_estimator.predict_proba(df_nochurn_fe)

In [None]:
# add y pred to results
y_pred_rf_best_estimator = rf_best_estimator.predict(df_nochurn_fe) # tuned random forest class predictions for customers who have not churned
rf_results['y_pred'] = y_pred_rf_best_estimator

In [None]:
# "y_pred","P Churn", "P No Churn"
rf_results = rf_results[["y_pred","P Churn", "P No Churn"]]
# sort values by 500 
rf_results = rf_results.sort_values(by=['P Churn'] , ascending=False)

In [None]:
rf_results.head()

In [None]:
# save results to csv - top 500 churners by probability from existing customers existing customers, model trained on X_train_fe

rf_results.head(500).to_csv('top500_RF.csv', index=True)