In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [2]:
df = pd.read_csv('customer_data.csv')
df.columns = [col.lower() for col in df.columns]
df.columns = df.columns.str.replace(' ', '_')
df.drop(columns=['customerid', 'count'], axis = 1, inplace=True)

In [3]:
y = df['churn_value']
df.drop(columns=['churn_value', 'churn_label', 'churn_score', 'cltv', 'churn_reason'], inplace=True)
df.drop(columns=['zip_code', 'lat_long', 'latitude', 'longitude'], inplace=True)
X = df.loc[:]

In [4]:
oe = OrdinalEncoder()
oe.fit_transform(X)
for col in X.columns:
    if X.columns.dtype == 'object':
        X[col] = oe.fit_transform(X[[col]])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)
X.shape

(7043, 22)

In [6]:
sfs = SFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1), 
          k_features=(1, 9),
          forward=False,
          floating=False,
          verbose=2,
          scoring="accuracy",
          cv=4,
          n_jobs=-1
         ).fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  22 | elapsed:    2.3s remaining:   15.1s
[Parallel(n_jobs=-1)]: Done  15 out of  22 | elapsed:    2.9s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  22 out of  22 | elapsed:    3.6s finished

[2021-11-27 17:57:16] Features: 21/1 -- score: 0.8042246425172591[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:    2.3s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:    3.2s finished

[2021-11-27 17:57:19] Features: 20/1 -- score: 0.805821267702755[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:    2.3s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    3.0s finished

[2021-11-27 17:57:22] Features: 19/1 -- score: 0.8035142892283373[Parallel(n_jobs=-1)]: Using backen

In [7]:
sfs.k_feature_names_

('city',
 'gender',
 'dependents',
 'tenure_months',
 'online_security',
 'streaming_movies',
 'paperless_billing',
 'payment_method',
 'monthly_charges')

In [8]:
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
22,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.8019872249822569, 0.7977288857345636, 0.805...",0.799254,"(country, state, city, gender, senior_citizen,...",0.0080777,0.00503917,0.00290937
21,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.8034066713981547, 0.8034066713981547, 0.816...",0.804225,"(country, state, city, gender, senior_citizen,...",0.0125466,0.00782706,0.00451896
20,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.8161816891412349, 0.7984386089425124, 0.811...",0.805821,"(country, state, city, gender, senior_citizen,...",0.0133515,0.00832918,0.00480885
19,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 16,...","[0.8090844570617459, 0.7984386089425124, 0.806...",0.803514,"(country, state, city, gender, senior_citizen,...",0.00726368,0.00453135,0.00261618
18,"(0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 13, 16, 17...","[0.8026969481902059, 0.801277501774308, 0.8146...",0.803692,"(country, state, city, gender, senior_citizen,...",0.0108463,0.00676635,0.00390655
17,"(0, 1, 2, 3, 4, 6, 7, 9, 10, 11, 13, 16, 17, 1...","[0.808374733853797, 0.7970191625266146, 0.8125...",0.80458,"(country, state, city, gender, senior_citizen,...",0.00986671,0.00615523,0.00355372
16,"(0, 1, 2, 3, 4, 6, 7, 9, 10, 11, 13, 16, 18, 1...","[0.8126330731014905, 0.7970191625266146, 0.803...",0.802981,"(country, state, city, gender, senior_citizen,...",0.00987015,0.00615737,0.00355496
15,"(0, 1, 2, 3, 4, 6, 7, 10, 11, 13, 16, 18, 19, ...","[0.8034066713981547, 0.7998580553584103, 0.812...",0.803693,"(country, state, city, gender, senior_citizen,...",0.00856969,0.0053461,0.00308657
14,"(0, 1, 2, 3, 6, 7, 10, 11, 13, 16, 18, 19, 20,...","[0.8076650106458482, 0.8019872249822569, 0.799...",0.801561,"(country, state, city, gender, dependents, ten...",0.00619677,0.00386578,0.00223191
13,"(0, 1, 2, 3, 6, 7, 10, 11, 16, 18, 19, 20, 21)","[0.801277501774308, 0.7927608232789212, 0.8068...",0.800143,"(country, state, city, gender, dependents, ten...",0.00803588,0.00501309,0.00289431


In [9]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [10]:
efs = EFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
         min_features=1,
         max_features=2,
         scoring="accuracy",
         cv = None,
         n_jobs=-1,
         ).fit(X_train, y_train)

Features: 253/253

In [12]:
efs.best_score_

0.9996450124245652

In [14]:
efs.best_feature_names_

('city', 'total_charges')