In [1]:
# Import Packages
import numpy as np
import pandas as pd
import pyodbc

# Get Current time
import datetime 

# For Graphics
import matplotlib as matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# For splitting data into test and train subsets
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV

# For logistic regression
from sklearn.linear_model import LogisticRegression

# For saving the model
from sklearn.externals import joblib 

# For Confusion Matrix
from sklearn import metrics

# For ROC Curve
from sklearn.metrics import roc_auc_score, roc_curve

# For Normalization
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# For Feature Selection
from sklearn.feature_selection import chi2, SelectKBest

# Settings to view all columns and rows
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

#to ignore the warning messages from being printed
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Import Dataset
# Explicitly setting data type for columns which are string data type  

df_orig = pd.read_csv('agent_churn.csv')

In [3]:
# Create a copy of dataframe
df_work = df_orig.copy()

In [4]:
# Separate features into categorical and numerical category
cat_vars = ['Commission Class','Pay Method','Pay Frequency','Agent Type','Agent Branch Code',
            'Agent Reporting Level','Agent Gender']

num_vars = ['Number_of_Policies', 'Number_of_Claims', 'Total_Claim_Amount','Total_Payment_Amount','Total_Commission_Amount']

date_vars = ['Date_of_Appointment', 'Date_of_Termination', 'Agent Date of Birth' ]

In [5]:
# Convert date columns

for var in date_vars:
    #df_work[var] = df_work[var].astype('category')
    #df_work[var] = datetime.datetime.strptime(var, '%Y-%m-%d')
    df_work[var]=pd.to_datetime(df_work[var].astype(str), format='%Y%m%d')

In [6]:
# Convert to 'category' object type
for var in cat_vars:
    df_work[var] = df_work[var].astype('category')

In [7]:
# Create Dummy Variables for categorical variables
for var in cat_vars:
    cat_list = 'var'+'_'+var
    cat_list = pd.get_dummies(df_work[var], prefix=var)
    dummy_data = df_work.join(cat_list)
    df_work = dummy_data

In [8]:
# Remove original columns for which dummy variables have been created
vars_remove = ['Commission Class','Pay Method','Pay Frequency','Agent Type','Agent Branch Code',
            'Agent Reporting Level','Agent Gender']

for var in vars_remove:
    df_work=df_work.drop(var, axis=1)

In [9]:
# Normalize numerical columns
scaler = MinMaxScaler()
df_work['Number_of_Policies'] = scaler.fit_transform(df_work['Number_of_Policies'].values.reshape(-1,1))
df_work['Number_of_Claims'] = scaler.fit_transform(df_work['Number_of_Claims'].values.reshape(-1,1))
df_work['Total_Claim_Amount'] = scaler.fit_transform(df_work['Total_Claim_Amount'].values.reshape(-1,1))
df_work['Total_Payment_Amount'] = scaler.fit_transform(df_work['Total_Payment_Amount'].values.reshape(-1,1))
df_work['Total_Commission_Amount'] = scaler.fit_transform(df_work['Total_Commission_Amount'].values.reshape(-1,1))

In [10]:
# Get current date
current_time = datetime.datetime.now() 

In [11]:
# Create a new column 'Age' on the basis of date of birth of agent
df_work['age'] = current_time - df_work['Agent Date of Birth']

In [12]:
# Create a new column named 'service period' on the basis of difference between Agent Date of Registration and Termination.
# In case date of Termination is null we will replace it with current date and compute

service_period=[]

for Date_of_Termination,Date_of_Appointment in zip(df_work.iloc[:,2], df_work.iloc[:,1]):
    if Date_of_Termination is pd.NaT:
        service_period.append(abs(current_time-Date_of_Appointment))
    else:
        service_period.append(abs(Date_of_Termination-Date_of_Appointment))
    #print(Date_of_Termination)
    #print(Date_of_Appointment)


In [13]:
# Append the new column to dataframe
df_work['Service_Period']=service_period

# Remove the orginal columns
df_work=df_work.drop('Date_of_Appointment', axis=1)
df_work=df_work.drop('Date_of_Termination', axis=1)
df_work=df_work.drop('Agent Date of Birth', axis=1)

# Remove Agent Number since it is not critical for prediction
df_work=df_work.drop('Agent Number', axis=1)

In [14]:
# Normalize the new columns 
scaler = MinMaxScaler()
df_work['age'] = scaler.fit_transform(df_work['age'].values.reshape(-1,1))
df_work['Service_Period'] = scaler.fit_transform(df_work['Service_Period'].values.reshape(-1,1))

In [15]:
# Create separate dataset for feature and target columns
x = df_work.drop('Churn', axis=1)
y = df_work['Churn']

In [17]:
# Perform Test/Train Split
#x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.3,random_state=11)
#x_train.shape, x_test.shape, y_train.shape, y_test.shape

((2541, 32), (1090, 32), (2541,), (1090,))

In [19]:
# To handle class imbalance performing SMOTE

from imblearn.over_sampling import SMOTE
SMO = SMOTE(random_state=11)

#X4_train, X4_test, Y4_train, Y4_test = train_test_split(X_train,Y_train, test_size=0.20)
x_smo,y_smo = SMO.fit_sample(x, y)
columns = x_train.columns

x_smo = pd.DataFrame(data = x_smo, columns = columns )
y_smo = pd.DataFrame(data = y_smo, columns = ['Churn'])

# we can Check the numbers of our data
print("length of oversampled data is ",len(x_smo))
print("Number of Churned agents in oversampled data",len(y_smo[y_smo['Churn']== 0 ]))
print("Number of Non Churned agents in oversampled data",len(y_smo[y_smo['Churn']== 1 ]))
print("Proportion of Non Churned agents in oversampled data is ",len(y_smo[y_smo['Churn']== 0])/len(x_smo))
print("Proportion of Churned Agents in oversampled data is ",len(y_smo[y_smo['Churn']== 1 ])/len(x_smo))

length of oversampled data is  7238
Number of Churned agents in oversampled data 3619
Number of Non Churned agents in oversampled data 3619
Proportion of Non Churned agents in oversampled data is  0.5
Proportion of Churned Agents in oversampled data is  0.5


In [21]:
bestfeatures=SelectKBest(score_func=chi2, k=10)
fit=bestfeatures.fit(x_smo,y_smo)
dfscores=pd.DataFrame(fit.scores_)
dfcolumns=pd.DataFrame(x_smo.columns)
#concatenate two dataframes
featureScores=pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns=['Specs','Score'] #naming the dataframe columns
print(featureScores.nlargest(10,'Score')) #print 10 best features
top10 = featureScores.nlargest(10,'Score')  #select the 10 best features with scores
top_features = top10['Specs']               #10 best feature cols 
top_features


                      Specs       Score
14            Agent Type_AM  421.830701
0        Number_of_Policies  386.071018
1          Number_of_Claims  280.271562
18            Agent Type_BR  251.136585
27           Agent Gender_   201.000000
4   Total_Commission_Amount  199.688720
5        Commission Class_1  187.000000
6        Commission Class_2  160.370031
28           Agent Gender_F  143.000000
2        Total_Claim_Amount  137.999453


14              Agent Type_AM
0          Number_of_Policies
1            Number_of_Claims
18              Agent Type_BR
27             Agent Gender_ 
4     Total_Commission_Amount
5          Commission Class_1
6          Commission Class_2
28             Agent Gender_F
2          Total_Claim_Amount
Name: Specs, dtype: object

In [22]:
# Create new dataset with only top features
x01=x[top_features]
y01=y

In [25]:
# Perform Test/Train Split
x01_train, x01_test, y01_train, y01_test=train_test_split(x_smo, y_smo, test_size=0.3,random_state=11)
x01_train.shape, x01_test.shape, y01_train.shape, y01_test.shape

((5066, 32), (2172, 32), (5066, 1), (2172, 1))

In [24]:
# Define grid
C = np.logspace(0,5,10)
param_grid = dict( C=C)

# Model
logreg=LogisticRegression(solver='lbfgs',max_iter=1000)

# Define Evaluation Procedure
cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=11)

# Define Grid Search
grid=GridSearchCV(estimator=logreg, param_grid=param_grid, n_jobs=1, cv=cv, scoring='roc_auc')

# Execute Grid Search
grid_result=grid.fit(x01_train, (y01_train_smo.values.ravel()))

NameError: name 'x01_train_smo' is not defined