In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [6]:
df = pd.read_csv('customer_data.csv')
df.columns = [col.lower() for col in df.columns]
df.columns = df.columns.str.replace(' ', '_')
df.drop(columns=['customerid', 'count'], axis = 1, inplace=True)

In [7]:
y = df['churn_value']
df.drop(columns=['churn_value', 'churn_label', 'churn_score', 'cltv', 'churn_reason'], inplace=True)
df.drop(columns=['zip_code', 'lat_long', 'latitude', 'longitude', 'city', 'country'], inplace=True)
X = df.loc[:]

In [8]:
oe = OrdinalEncoder()
oe.fit_transform(X)
for col in X.columns:
    if X.columns.dtype == 'object':
        X[col] = oe.fit_transform(X[[col]])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [10]:
sfs = SFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1), 
          k_features=(1,10),
          forward=False,
          floating=False,
          verbose=2,
          scoring="accuracy",
          cv=3,
          n_jobs=-1
         ).fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:    1.9s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    2.5s finished

[2021-12-03 13:03:09] Features: 19/1 -- score: 0.7965921192758253[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  19 | elapsed:    1.4s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  19 out of  19 | elapsed:    2.0s finished

[2021-12-03 13:03:11] Features: 18/1 -- score: 0.7962371317003906[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  18 | elapsed:    1.5s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    2.1s finished

[2021-12-03 13:03:13] Features: 17/1 -- score: 0.7962371317003906[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of

In [11]:
sfs.k_feature_names_

('gender',
 'senior_citizen',
 'dependents',
 'tenure_months',
 'online_security',
 'streaming_movies',
 'contract',
 'payment_method',
 'monthly_charges',
 'total_charges')

In [19]:
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
20,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.7944621938232161, 0.7960596379126731, 0.790...",0.793575,"(state, gender, senior_citizen, partner, depen...",0.00556327,0.0024722,0.00174811
19,"(0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14...","[0.7949946751863685, 0.7933972310969116, 0.801...",0.796592,"(state, gender, senior_citizen, partner, depen...",0.0077656,0.00345087,0.00244014
18,"(0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 1...","[0.7907348242811502, 0.7944621938232161, 0.803...",0.796237,"(state, gender, senior_citizen, partner, depen...",0.0120754,0.00536607,0.00379438
17,"(0, 1, 2, 3, 4, 5, 8, 9, 11, 12, 13, 14, 15, 1...","[0.7870074547390842, 0.7976570820021299, 0.804...",0.796237,"(state, gender, senior_citizen, partner, depen...",0.0158162,0.0070284,0.00496983
16,"(0, 1, 2, 4, 5, 8, 9, 11, 12, 13, 14, 15, 16, ...","[0.8003194888178914, 0.7971246006389776, 0.794...",0.797302,"(state, gender, senior_citizen, dependents, te...",0.00538846,0.00239452,0.00169318
15,"(0, 1, 2, 4, 5, 8, 9, 11, 13, 14, 15, 16, 17, ...","[0.7912673056443025, 0.7939297124600639, 0.798...",0.794462,"(state, gender, senior_citizen, dependents, te...",0.00641563,0.00285097,0.00201594
14,"(0, 1, 2, 4, 5, 9, 11, 13, 14, 15, 16, 17, 18,...","[0.7907348242811502, 0.7902023429179978, 0.795...",0.792155,"(state, gender, senior_citizen, dependents, te...",0.00538846,0.00239452,0.00169318
13,"(1, 2, 4, 5, 9, 11, 13, 14, 15, 16, 17, 18, 19)","[0.792332268370607, 0.7907348242811502, 0.7917...",0.791622,"(gender, senior_citizen, dependents, tenure_mo...",0.00149449,0.000664121,0.000469604
12,"(1, 2, 4, 5, 9, 13, 14, 15, 16, 17, 18, 19)","[0.7907348242811502, 0.7955271565495208, 0.794...",0.793575,"(gender, senior_citizen, dependents, tenure_mo...",0.00462361,0.00205464,0.00145285
11,"(1, 2, 4, 5, 9, 14, 15, 16, 17, 18, 19)","[0.7928647497337593, 0.7896698615548455, 0.793...",0.791977,"(gender, senior_citizen, dependents, tenure_mo...",0.00370406,0.00164601,0.0011639


In [40]:
type(X)

pandas.core.frame.DataFrame

In [45]:
new_features = pd.DataFrame.from_records(X, columns =sfs.k_feature_names_)
new_features

Unnamed: 0,gender,senior_citizen,dependents,tenure_months,online_security,streaming_movies,contract,payment_method,monthly_charges,total_charges
0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,3.0,436.0,157.0
1,0.0,0.0,1.0,2.0,0.0,0.0,0.0,2.0,729.0,925.0
2,0.0,0.0,1.0,8.0,0.0,2.0,0.0,2.0,1274.0,6104.0
3,0.0,0.0,1.0,28.0,0.0,2.0,0.0,2.0,1371.0,2646.0
4,1.0,0.0,1.0,49.0,0.0,2.0,0.0,0.0,1349.0,4265.0
...,...,...,...,...,...,...,...,...,...,...
7038,0.0,0.0,0.0,72.0,1.0,1.0,2.0,0.0,52.0,770.0
7039,1.0,0.0,1.0,24.0,2.0,2.0,1.0,3.0,991.0,1597.0
7040,0.0,0.0,1.0,72.0,0.0,2.0,1.0,1.0,1340.0,5698.0
7041,0.0,0.0,1.0,11.0,2.0,0.0,0.0,2.0,137.0,2994.0


In [64]:
X_train, X_test, y_train, y_test = train_test_split(new_features, y, test_size = 0.2, random_state=0)

In [65]:
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.7785663591199432