In [1]:
# Import Packages
import numpy as np
import pandas as pd
import pyodbc

# Get Current time
import datetime 

# For Graphics
import matplotlib as matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# For splitting data into test and train subsets
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV

# For logistic regression
from sklearn.linear_model import LogisticRegression

# For saving the model
from sklearn.externals import joblib 

# For Confusion Matrix
from sklearn import metrics

# For ROC Curve
from sklearn.metrics import roc_auc_score, roc_curve

# For Normalization
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# For Pipeline preparation
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# For Feature Selection
from sklearn.feature_selection import chi2, SelectKBest

# Settings to view all columns and rows
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

#to ignore the warning messages from being printed
import warnings
warnings.filterwarnings('ignore')



In [14]:
# Import Dataset
# Explicitly setting data type for columns which are string data type  

df_orig = pd.read_csv('agent_churn.csv')

In [15]:
# Check data sample
df_orig.head()

Unnamed: 0,Agent Number,Date_of_Appointment,Date_of_Termination,Commission Class,Pay Method,Pay Frequency,Agent Type,Agent Branch Code,Agent Reporting Level,Agent Date of Birth,Agent Gender,Number_of_Policies,Number_of_Claims,Total_Claim_Amount,Total_Payment_Amount,Total_Commission_Amount,Churn
0,50001187,20150526,,3,DC,12,AM,10,1,19910507.0,M,5,0,0.0,0.0,0.0,0
1,50002503,20151214,,3,CQ,12,AG,10,1,19830404.0,M,4,0,0.0,0.0,0.0,0
2,50004847,20200101,,3,CQ,12,BR,10,1,19800101.0,M,0,0,0.0,0.0,0.0,0
3,50040992,20230104,,3,CQ,12,AG,10,1,19750101.0,M,1,0,0.0,0.0,0.0,0
4,50041143,20230101,,2,CQ,12,AM,10,1,19880101.0,M,0,0,0.0,0.0,0.0,0


# Data Analysis

In [16]:
# Check data shape
df_orig.shape

(3631, 17)

In [17]:
# Check if there are any nulls
df_orig.isnull().sum()

Agent Number                  0
Date_of_Appointment           0
Date_of_Termination        3619
Commission Class              0
Pay Method                    0
Pay Frequency                 0
Agent Type                    0
Agent Branch Code             0
Agent Reporting Level         0
Agent Date of Birth          32
Agent Gender                  0
Number_of_Policies            0
Number_of_Claims              0
Total_Claim_Amount            0
Total_Payment_Amount          0
Total_Commission_Amount       0
Churn                         0
dtype: int64

In [18]:
# Check data attributes of the columns
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3631 entries, 0 to 3630
Data columns (total 17 columns):
Agent Number               3631 non-null int64
Date_of_Appointment        3631 non-null int64
Date_of_Termination        12 non-null float64
Commission Class           3631 non-null int64
Pay Method                 3631 non-null object
Pay Frequency              3631 non-null int64
Agent Type                 3631 non-null object
Agent Branch Code          3631 non-null int64
Agent Reporting Level      3631 non-null int64
Agent Date of Birth        3599 non-null float64
Agent Gender               3631 non-null object
Number_of_Policies         3631 non-null int64
Number_of_Claims           3631 non-null int64
Total_Claim_Amount         3631 non-null float64
Total_Payment_Amount       3631 non-null float64
Total_Commission_Amount    3631 non-null float64
Churn                      3631 non-null int64
dtypes: float64(5), int64(9), object(3)
memory usage: 482.3+ KB


In [31]:
df_orig['Agent Gender'].value_counts()

M    3287
      201
F     143
Name: Agent Gender, dtype: int64

# Data Treatment

In [19]:
# Create a copy of dataframe
df_work = df_orig.copy()

In [20]:
# Separate features into categorical and numerical category
#cat_vars = df_work.select_dtypes(include=['object']).copy()

#num_vars = df_work.select_dtypes(exclude=['object']).copy()
#num_vars = num_vars.drop('Date_of_Appointment', axis=1)
#num_vars = num_vars.drop('Date_of_Termination', axis=1)
#num_vars = num_vars.drop('Agent_Date_of_Birth', axis=1)

date_vars = ['Date_of_Appointment', 'Date_of_Termination', 'Agent Date of Birth' ]

In [21]:
# Convert date columns

for var in date_vars:
    #df_work[var] = df_work[var].astype('category')
    #df_work[var] = datetime.datetime.strptime(var, '%Y-%m-%d')
    df_work[var]=pd.to_datetime(df_work[var].astype(str), format='%Y%m%d')

In [32]:
# Get current date
current_time = datetime.datetime.now() 

In [33]:
# Create a new column 'Age' on the basis of date of birth of agent
df_work['age'] = current_time - df_work['Agent Date of Birth']

In [34]:
# Create a new column named 'service period' on the basis of difference between Agent Date of Registration and Termination.
# In case date of Termination is null we will replace it with current date and compute

service_period=[]

for Date_of_Termination,Date_of_Appointment in zip(df_work.iloc[:,2], df_work.iloc[:,1]):
    if Date_of_Termination is pd.NaT:
        service_period.append(abs(current_time-Date_of_Appointment))
    else:
        service_period.append(abs(Date_of_Termination-Date_of_Appointment))
    #print(Date_of_Termination)
    #print(Date_of_Appointment)


In [35]:
# Append the new column to dataframe
df_work['Service_Period']=service_period

# Remove the orginal columns
df_work=df_work.drop('Date_of_Appointment', axis=1)
df_work=df_work.drop('Date_of_Termination', axis=1)
df_work=df_work.drop('Agent Date of Birth', axis=1)

# Remove Agent Number since it is not critical for prediction
df_work=df_work.drop('Agent Number', axis=1)

In [36]:
df_work.head()

Unnamed: 0,Commission Class,Pay Method,Pay Frequency,Agent Type,Agent Branch Code,Agent Reporting Level,Agent Gender,Number_of_Policies,Number_of_Claims,Total_Claim_Amount,Total_Payment_Amount,Total_Commission_Amount,Churn,age,Service_Period
0,3,DC,12,AM,10,1,M,5,0,0.0,0.0,0.0,0,10588 days 11:48:12.942347,1803 days 11:48:12.942347
1,3,CQ,12,AG,10,1,M,4,0,0.0,0.0,0.0,0,13543 days 11:48:12.942347,1601 days 11:48:12.942347
2,3,CQ,12,BR,10,1,M,0,0,0.0,0.0,0.0,0,14732 days 11:48:12.942347,122 days 11:48:12.942347
3,3,CQ,12,AG,10,1,M,1,0,0.0,0.0,0.0,0,16558 days 11:48:12.942347,976 days 12:11:47.057653
4,2,CQ,12,AM,10,1,M,0,0,0.0,0.0,0.0,0,11810 days 11:48:12.942347,973 days 12:11:47.057653


In [39]:
df_work.shape

(3631, 15)

In [43]:
df_work.columns

Index(['Commission Class', 'Pay Method', 'Pay Frequency', 'Agent Type',
       'Agent Branch Code', 'Agent Reporting Level', 'Agent Gender',
       'Number_of_Policies', 'Number_of_Claims', 'Total_Claim_Amount',
       'Total_Payment_Amount', 'Total_Commission_Amount', 'Churn', 'age',
       'Service_Period'],
      dtype='object')

In [38]:
# Define encoder, imputer and classifier to be used in the pipeline
ohe = OneHotEncoder()
imp = SimpleImputer()
clf = LogisticRegression()

In [57]:
# trial
cat_vars = df_work.select_dtypes(include=['object']).copy()
cat_vars

Unnamed: 0,Pay Method,Agent Type,Agent Gender
0,DC,AM,M
1,CQ,AG,M
2,CQ,BR,M
3,CQ,AG,M
4,CQ,AM,M
5,CQ,AM,M
6,CQ,BR,M
7,CQ,BR,M
8,CQ,AG,M
9,DC,BM,M


In [61]:
# df_ct is the column transformed df_work
df_ct = make_column_transformer(
    (imp, ['Agent Gender']), 
    (ohe, ['Pay Method', 'Agent Type', 'Agent Gender']),
    remainder='passthrough')

In [62]:
# Prepare the pipeline with the column transformed df_ct and the model(LogReg)
pipe = make_pipeline(df_ct, clf)

In [68]:
# Create separate dataset for feature and target columns
x01 = df_work.drop('Churn', axis=1)
y01 = df_work['Churn']

AttributeError: 'ColumnTransformer' object has no attribute 'drop'

In [65]:
# Perform Test/Train Split
x01_train, x01_test, y01_train, y01_test=train_test_split(x01, y01, test_size=0.3,random_state=11)
x01_train.shape, x01_test.shape, y01_train.shape, y01_test.shape

((2541, 14), (1090, 14), (2541,), (1090,))

In [67]:
# To handle class imbalance performing SMOTE

from imblearn.over_sampling import SMOTE
SMO = SMOTE(random_state=11)

#X4_train, X4_test, Y4_train, Y4_test = train_test_split(X_train,Y_train, test_size=0.20)
x01_train_smo,y01_train_smo = SMO.fit_sample(x01_train, y01_train)
columns = x01_train.columns

x01_train_smo = pd.DataFrame(data = x01_train_smo, columns = columns )
y01_train_smo = pd.DataFrame(data = y01_train_smo, columns = ['Churn'])

# we can Check the numbers of our data
print("length of oversampled data is ",len(x01_train_smo))
print("Number of Churned agents in oversampled data",len(y01_train_smo[y01_train_smo['Churn']== 0 ]))
print("Number of Non Churned agents in oversampled data",len(y01_train_smo[y01_train_smo['Churn']== 1 ]))
print("Proportion of Non Churned agents in oversampled data is ",len(y01_train_smo[y01_train_smo['Churn']== 0])/len(x01_train_smo))
print("Proportion of Churned Agents in oversampled data is ",len(y01_train_smo[y01_train_smo['Churn']== 1 ])/len(x01_train_smo))

ValueError: could not convert string to float: 'CQ'