In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

In [2]:
# Read dataset

%%time
df_path = 'churn_results.csv'

try:
    df = pd.read_csv(df_path)
except pd.errors.ParserError as e:
    print(f'Error while parsing CSV file: {e}')



df.shape

CPU times: user 17.4 ms, sys: 799 µs, total: 18.2 ms
Wall time: 43.1 ms


(3151, 11)

In [3]:
df.head()

Unnamed: 0,toplevelcustomerid,client_account_number,start_date,last_transaction_date,total_rev_monthly_avg,total_spend_monthly_avg,total_transactions_monthly_avg,churned,churn_prob,churn_prob_knn,churn_prob_rf
0,230845,AS60222357,2023-10-16,2023-12-30,35,382.0,4.0,0,1.0,1.0,0.472028
1,230846,AS60222358,2023-10-16,2024-01-15,4,171.0,2.0,0,0.0,0.0,0.179686
2,230847,AS60222359,2023-10-16,2024-02-27,15,1111.0,14.0,0,0.0,0.0,0.11683
3,230848,AS60222360,2023-10-16,2024-03-12,10,769.0,10.0,0,0.0,0.0,0.050564
4,230851,AS60222363,2023-10-16,2024-02-23,20,1536.0,22.0,0,0.0,0.0,0.119955


In [4]:
# Method for calculation of seperate model probability score
def calc_churn_score(final_df):
    final_df['Churn_Score'] = (final_df['churn_prob_rf']*0.99) + (final_df['churn_prob']*0.88) + (final_df['churn_prob_knn']*0.86)

    def NormalizeData(data):
        return (data - np.min(data)) / (np.max(data) - np.min(data))

    final_df['Churn_Score'] = NormalizeData(final_df['Churn_Score'])
    return final_df

In [8]:
churn_df = df.groupby(['toplevelcustomerid', 'client_account_number', 'start_date',
                       'last_transaction_date', 'total_rev_monthly_avg', 'total_spend_monthly_avg',
                       'total_transactions_monthly_avg', 'churned'], as_index=False)[['churn_prob_rf', 'churn_prob', 'churn_prob_knn']].mean()
churn_df.head()


Unnamed: 0,toplevelcustomerid,client_account_number,start_date,last_transaction_date,total_rev_monthly_avg,total_spend_monthly_avg,total_transactions_monthly_avg,churned,churn_prob_rf,churn_prob,churn_prob_knn
0,230845,AS60222357,2023-10-16,2023-12-30,35,382.0,4.0,0,0.472028,1.0,1.0
1,230846,AS60222358,2023-10-16,2024-01-15,4,171.0,2.0,0,0.179686,0.0,0.0
2,230847,AS60222359,2023-10-16,2024-02-27,15,1111.0,14.0,0,0.11683,0.0,0.0
3,230848,AS60222360,2023-10-16,2024-03-12,10,769.0,10.0,0,0.050564,0.0,0.0
4,230851,AS60222363,2023-10-16,2024-02-23,20,1536.0,22.0,0,0.119955,0.0,0.0


In [9]:
churn_df2 = calc_churn_score(churn_df)
churn_df2.head()

Unnamed: 0,toplevelcustomerid,client_account_number,start_date,last_transaction_date,total_rev_monthly_avg,total_spend_monthly_avg,total_transactions_monthly_avg,churned,churn_prob_rf,churn_prob,churn_prob_knn,Churn_Score
0,230845,AS60222357,2023-10-16,2023-12-30,35,382.0,4.0,0,0.472028,1.0,1.0,0.810919
1,230846,AS60222358,2023-10-16,2024-01-15,4,171.0,2.0,0,0.179686,0.0,0.0,0.049965
2,230847,AS60222359,2023-10-16,2024-02-27,15,1111.0,14.0,0,0.11683,0.0,0.0,0.026633
3,230848,AS60222360,2023-10-16,2024-03-12,10,769.0,10.0,0,0.050564,0.0,0.0,0.002034
4,230851,AS60222363,2023-10-16,2024-02-23,20,1536.0,22.0,0,0.119955,0.0,0.0,0.027793


In [16]:
print('Total record number = ', churn_df2.shape[0])
print('Negative record number = ', churn_df2[churn_df2.churned == 0].shape[0])
print('True Negative size = ', churn_df2[(churn_df2.churned == 0) & (churn_df2.Churn_Score < 0.5)].shape[0])
print('Positive record number = ', churn_df2[churn_df2.churned == 1].shape[0])
print('True Positive size = ', churn_df2[(churn_df2.churned == 1) & (churn_df2.Churn_Score >= 0.5)].shape[0])
print('Possible churn size = ', churn_df2[(churn_df2.churned == 0) & (churn_df2.Churn_Score >= 0.5)].shape[0])

Total record number =  3151
Negative record number =  2974
True Negative size =  2599
Positive record number =  177
True Positive size =  177
Possible churn size =  375


In [17]:
print('Possible churn revenue = ', churn_df2[(churn_df2.churned == 0) &
 (churn_df2.Churn_Score >= 0.5)].total_rev_monthly_avg.sum())

Possible churn revenue =  4276


In [18]:
# Read dataset

%%time
data_mart_path = 'data_mart.csv'

try:
    data_mart_df = pd.read_csv(data_mart_path)
except pd.errors.ParserError as e:
    print(f'Error while parsing CSV file: {e}')



data_mart_df.shape



CPU times: user 1.29 s, sys: 244 ms, total: 1.53 s
Wall time: 1.67 s


(101350, 104)

In [20]:
data_mart_df.total_rev_monthly_avg.min(), data_mart_df.total_rev_monthly_avg.mean(), data_mart_df.total_rev_monthly_avg.max()

(-381, 60.54722249629995, 34584)

In [23]:
# Create bins for 'total_rev_monthly_avg'
bins = [data_mart_df['total_rev_monthly_avg'].min(),
        data_mart_df['total_rev_monthly_avg'].mean(),
        #data_mart_df['total_rev_monthly_avg'].mean() + data_mart_df['total_rev_monthly_avg'].std(),
        data_mart_df['total_rev_monthly_avg'].max()]

# Split dataset acording to the bins
bins_df = pd.cut(data_mart_df['total_rev_monthly_avg'], bins=bins)

# Create groups with bins
data_set_size = data_mart_df.groupby(bins_df).size().reset_index(name='count')

print("Bins and sizes:")
print(data_set_size)


Bins and sizes:
  total_rev_monthly_avg  count
0      (-381.0, 60.547]  78807
1     (60.547, 34584.0]  22542


In [28]:
print('Number of customers who made churned and had monthy average revenue less than average = ',
      data_mart_df[(data_mart_df.churned == 1) & (data_mart_df.total_rev_monthly_avg <
                                                  data_mart_df.total_rev_monthly_avg.mean())].shape[0])
print('Number of customers who made churned and had monthy average revenue more than average = ',
      data_mart_df[(data_mart_df.churned == 1) & (data_mart_df.total_rev_monthly_avg >=
                                                  data_mart_df.total_rev_monthly_avg.mean())].shape[0])
print('Number of customers who has not made churned and had monthy average revenue less than average = ',
      data_mart_df[(data_mart_df.churned == 0) & (data_mart_df.total_rev_monthly_avg <
                                                  data_mart_df.total_rev_monthly_avg.mean())].shape[0])
print('Number of customers who has not made churned and had monthy average revenue more than average = ',
      data_mart_df[(data_mart_df.churned == 0) & (data_mart_df.total_rev_monthly_avg >=
                                                  data_mart_df.total_rev_monthly_avg.mean())].shape[0])

Number of customers who made churned and had monthy average revenue less than average =  50190
Number of customers who made churned and had monthy average revenue more than average =  7060
Number of customers who has not made churned and had monthy average revenue less than average =  28618
Number of customers who has not made churned and had monthy average revenue more than average =  15482
