In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import matplotlib
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_selector

matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "xelatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
})

In [2]:
churn = pd.read_csv('Churn_Modelling.csv', index_col=0)

In [3]:
plt.figure()
sn.heatmap(churn.corr())

<AxesSubplot:>

In [4]:
churn_clean = churn.copy()

In [5]:
categorical_cols = {'Geography', 'Gender'}
predict_cols = set((churn.columns)) - {'CustomerId', 'Exited', 'Surname'}
num_cols = predict_cols- categorical_cols

In [6]:
sclr = StandardScaler()
lblr = LabelEncoder()
sclr.fit(churn[num_cols])
for col in categorical_cols:
    churn_clean[col] = lblr.fit_transform(churn[col])
churn_clean[list(num_cols)] = sclr.transform(churn[num_cols])

In [7]:
plt.clf()
plt.cla()
plt.figure()
plt.title('Correlation Matrix')
# sn.set(font_scale=.3)
# sn.set(rc = {'figure.figsize':(2,2)})
plt.subplots_adjust(left=0.2, bottom=0.3)
sn.heatmap(churn_clean[predict_cols].corr())
plt.savefig('corr.pgf', format='pgf')

In [8]:
churn_clean.head(2)

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,-0.326221,0,0,0.293517,-1.04176,-1.225848,-0.911583,0.646092,0.970243,0.021886,1
2,15647311,Hill,-0.440036,2,0,0.198164,-1.387538,0.11735,-0.911583,-1.547768,0.970243,0.216534,0


In [9]:
pca = PCA().fit(churn_clean[predict_cols])

In [10]:
SMALL_SIZE = 12
MEDIUM_SIZE = 12
BIGGER_SIZE = 14

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
sn.set(font=)

SyntaxError: invalid syntax (659848046.py, line 12)

In [11]:
plt.clf()
plt.cla()
plt.figure()
plt.xticks(np.arange(pca.n_components_) + 1)
plt.title('Explained Variance')
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
# plt.subplots_adjust(left=0.3, right=0.9, bottom=0.3, top=0.9)
plt.savefig('explained_variance.pgf',format='pgf')

In [13]:
plt.clf()
plt.cla()
plt.figure()
plt.xticks(list(range(len(pca.components_))))
PC_values = np.arange(pca.n_components_) + 1
plt.plot(PC_values, pca.explained_variance_ratio_, linewidth=.5, color='blue')
plt.title('Scree Plot')
plt.xlabel('Principal component')
plt.ylabel('Explained variance')
# plt.subplots_adjust(left=0.3, right=0.9, bottom=0.3, top=0.9)
plt.savefig('scree.pgf',format='pgf')

In [None]:
pca.explained_variance_ratio_

In [None]:
churn_clean.head(1)

In [None]:
#########################
### FEATURE SELECTION ###
#########################

from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

### Initialize Boruta
forest = RandomForestClassifier(
   n_jobs = -1, 
   max_depth = 7, 
   verbose = 0
)
boruta = BorutaPy(
   estimator = forest, 
   n_estimators = 'auto',
   max_iter = 500, # number of trials to perform, 
   verbose = 0
)
### modify datatype for Boruta (it accepts np.array, not pd.DataFrame)

## NOTE: Omitted Surname and Customer ID
churn_data_x = churn_clean[list(predict_cols)]
churn_data_y = churn_clean[['Exited']]

churn_data_x_numpy = churn_data_x.to_numpy()
churn_data_y_numpy = churn_data_y.to_numpy()

## Boruta has already been run so for future runs of this notebook we avoid a re-run
## To re-run Boruta, switch runBortua to True
runBoruta = False

if runBoruta:
    boruta.fit(churn_data_x_numpy, churn_data_y_numpy)

    ## Green Area variables have been cleared as significant, blue area variables are still uncertain
    green_area = churn_data_x.columns[boruta.support_].to_list()
    blue_area = churn_data_x.columns[boruta.support_weak_].to_list()
    print('features in the green area:', green_area)
    print('features in the blue area:', blue_area)
else:
   green_area = ['Age', 'EstimatedSalary', 'CreditScore', 'Geography', 'NumOfProducts', 'IsActiveMember', 'Balance']
   blue_area = []

In [None]:
###########################################
### MODEL IMPLEMENTATION, RANDOM FOREST ###
###########################################

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
import seaborn as sns

x_train, x_test, y_train, y_test = train_test_split(churn_data_x[green_area], churn_data_y, test_size = 0.2, random_state = 20)

## Random Grid search has already been run, to re-run, turn runRandomGrid to True
runRandomGrid = False 

if runRandomGrid:
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 500, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 25, num = 10)]
    max_depth.append(None)
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4, 10]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}

    # Use the random grid to search for best hyperparameters
    rf = RandomForestClassifier(class_weight='balanced')
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=10, random_state=42, n_jobs = -1)

    rf_random.fit(x_train, y_train)
    rf_random.score(x_test, y_test)

    ### Best Params ###
    '''
    {'n_estimators': 1000,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth': 16,
    'bootstrap': False}
    '''

classifier = RandomForestClassifier(min_samples_leaf = 1, max_features = 'sqrt', max_depth=16, bootstrap = False, n_estimators = 1000, random_state= 1, class_weight='balanced')
classifier.fit(x_train, y_train)

print('------------------------------------------------')
print('Accuracy :', classifier.score(x_test,y_test))

y_preds = classifier.predict(x_test)
f1_score = metrics.f1_score(y_test, y_preds)
print('F1 :', f1_score)
print('------------------------------------------------')


for i,v in enumerate(classifier.feature_importances_):
	print('Feature: %0d, Score: %.5f' % (i,v))

In [None]:
###########################################
### MODEL IMPLEMENTATION, RANDOM FOREST ###
###########################################

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
import seaborn as sns

x_train, x_test, y_train, y_test = train_test_split(churn_data_x[green_area], churn_data_y, test_size = 0.2, random_state = 20)

## Random Grid search has already been run, to re-run, turn runRandomGrid to True
runRandomGrid = False 

if runRandomGrid:
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 500, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 25, num = 10)]
    max_depth.append(None)
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4, 10]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}

    # Use the random grid to search for best hyperparameters
    rf = RandomForestClassifier(class_weight='balanced')
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=10, random_state=42, n_jobs = -1)

    rf_random.fit(x_train, y_train)
    rf_random.score(x_test, y_test)

    ### Best Params ###
    '''
    {'n_estimators': 1000,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth': 16,
    'bootstrap': False}
    '''

classifier = RandomForestClassifier(min_samples_leaf = 1, max_features = 'sqrt', max_depth=16, bootstrap = False, n_estimators = 1000, random_state= 1, class_weight='balanced')
classifier.fit(x_train, y_train)

print('------------------------------------------------')
print('Accuracy :', classifier.score(x_test,y_test))

y_preds = classifier.predict(x_test)
f1_score = metrics.f1_score(y_test, y_preds)
print('F1 :', f1_score)
print('------------------------------------------------')


for i,v in enumerate(classifier.feature_importances_):
	print('Feature: %0d, Score: %.5f' % (i,v))

In [None]:
###########################################
### MODEL IMPLEMENTATION, RANDOM FOREST ###
###########################################

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
import seaborn as sns

x_train, x_test, y_train, y_test = train_test_split(churn_data_x[green_area], churn_data_y, test_size = 0.2, random_state = 20)

## Random Grid search has already been run, to re-run, turn runRandomGrid to True
runRandomGrid = False 

if runRandomGrid:
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 500, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 25, num = 10)]
    max_depth.append(None)
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4, 10]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}

    # Use the random grid to search for best hyperparameters
    rf = RandomForestClassifier(class_weight='balanced')
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=10, random_state=42, n_jobs = -1)

    rf_random.fit(x_train, y_train)
    rf_random.score(x_test, y_test)

    ### Best Params ###
    '''
    {'n_estimators': 1000,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth': 16,
    'bootstrap': False}
    '''

classifier = RandomForestClassifier(min_samples_leaf = 1, max_features = 'sqrt', max_depth=16, bootstrap = False, n_estimators = 1000, random_state= 1, class_weight='balanced')
classifier.fit(x_train, y_train)

print('------------------------------------------------')
print('Accuracy :', classifier.score(x_test,y_test))

y_preds = classifier.predict(x_test)
f1_score = metrics.f1_score(y_test, y_preds)
print('F1 :', f1_score)
print('------------------------------------------------')


for i,v in enumerate(classifier.feature_importances_):
	print('Feature: %0d, Score: %.5f' % (i,v))

In [None]:
featureImportanceDict = {'Features':green_area,'Importance':list(classifier.feature_importances_)}
featureImportanceDictDF = pd.DataFrame(featureImportanceDict, columns=['Features','Importance'])
plt.clf()
plt.cla()
sns.set(rc = {'figure.figsize':(8,4)})
plt.figure()
sns.barplot(y='Features', x='Importance', data=featureImportanceDictDF).set_title('Feature Importance')
plt.subplots_adjust(left=0.2)
plt.savefig('feature_importances.pgf',format='pgf')