In [1]:
import pandas as pd

In [5]:
churn_data = pd.read_csv('https://raw.githubusercontent.com/zekelabs/data-science-complete-tutorial/master/Data/churn.csv.txt',
                         parse_dates=['last_trip_date','signup_date'] )

In [6]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null datetime64[ns]
phone                     49604 non-null object
signup_date               50000 non-null datetime64[ns]
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
luxury_car_user           50000 non-null bool
weekday_pct               50000 non-null float64
dtypes: bool(1), datetime64[ns](2), float64(6), int64(1), object(2)
memory usage: 4.2+ MB


In [7]:
churn_data.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4


In [8]:
churn_data.last_trip_date.max()

Timestamp('2014-07-01 00:00:00')

In [9]:
churn_data.last_trip_date.min()

Timestamp('2014-01-01 00:00:00')

In [10]:
import datetime
cutoff = churn_data.last_trip_date.max() - datetime.timedelta(30,0,0)

In [11]:
cutoff

Timestamp('2014-06-01 00:00:00')

In [14]:
churn_data['churn'] = churn_data.last_trip_date.map(lambda d: 1 if d > cutoff else 0)

In [15]:
churn_data.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,churn
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2,1
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0,0
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0,0
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0,1
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4,0


In [20]:
cat_cols = list(churn_data.select_dtypes('object').columns)

In [21]:
cat_cols

['city', 'phone']

In [22]:
num_cols = list(churn_data.select_dtypes('float64').columns)

In [24]:
num_cols.append('trips_in_first_30_days')

In [25]:
num_cols

['avg_dist',
 'avg_rating_by_driver',
 'avg_rating_of_driver',
 'avg_surge',
 'surge_pct',
 'weekday_pct',
 'trips_in_first_30_days']

#### Pipeline Creation for categorical columns & numerical columns

In [26]:
from sklearn.pipeline import make_pipeline

In [32]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [29]:
cat_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value='missing'),
                            OneHotEncoder())

In [35]:
cat_data_tf = cat_pipeline.fit_transform(churn_data[cat_cols]).toarray()

In [33]:
num_pipeline = make_pipeline(SimpleImputer(strategy='median'),
                             MinMaxScaler())                            

In [36]:
num_data_tf = num_pipeline.fit_transform(churn_data[num_cols])

In [37]:
import numpy as np

In [39]:
feature_data = np.hstack([cat_data_tf, num_data_tf, churn_data.luxury_car_user.reshape(-1,1)])

  """Entry point for launching an IPython kernel.


In [49]:
feature_data.shape

(50000, 14)

In [40]:
target_data = churn_data.churn

In [45]:
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [43]:
final_pipeline = make_pipeline(SelectKBest(k=8), RandomForestClassifier(n_estimators=20))

In [46]:
trainX, testX, trainY, testY = train_test_split(feature_data,target_data)

In [47]:
final_pipeline.fit(trainX,trainY)

Pipeline(memory=None,
     steps=[('selectkbest', SelectKBest(k=8, score_func=<function f_classif at 0x000001F05973DB70>)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decre...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [48]:
final_pipeline.score(testX,testY)

0.66072

In [50]:
from sklearn.model_selection import GridSearchCV

In [51]:
params = {
    'selectkbest__k':[5,8,12],
    'randomforestclassifier__n_estimators':[10,30,50]
}

In [52]:
grid_model = GridSearchCV(final_pipeline,param_grid=params, cv=5, n_jobs=-1)

In [53]:
grid_model.fit(trainX,trainY)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('selectkbest', SelectKBest(k=8, score_func=<function f_classif at 0x000001F05973DB70>)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decre...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'selectkbest__k': [5, 8, 12], 'randomforestclassifier__n_estimators': [10, 30, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [54]:
grid_model.best_params_

{'randomforestclassifier__n_estimators': 50, 'selectkbest__k': 12}

In [55]:
grid_model.best_score_

0.7619733333333333

In [56]:
grid_model.score(testX,testY)

0.7592

In [57]:
from sklearn.compose import ColumnTransformer

In [74]:
num_cols

['avg_dist',
 'avg_rating_by_driver',
 'avg_rating_of_driver',
 'avg_surge',
 'surge_pct',
 'weekday_pct',
 'trips_in_first_30_days']

#### Using column transformer

In [95]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)])

In [96]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps=[('preprocessor',preprocessor),
                           ('selectkbest', SelectKBest(k=8)),
                           ('classifier',RandomForestClassifier(n_estimators=20))])

In [97]:
#pipeline = make_pipeline(preprocessor, SelectKBest(k=8), RandomForestClassifier(n_estimators=20))

In [98]:
num_pipeline

Pipeline(memory=None,
     steps=[('simpleimputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)), ('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1)))])

In [99]:
params = {
    'preprocessor__num__simpleimputer__strategy':['mean','median'],
    'selectkbest__k':[12,13],
    'classifier__n_estimators':[50,70]
}

In [100]:
grid_model = GridSearchCV(pipeline, param_grid=params,cv=5, n_jobs=-1)

In [101]:
feature_data = churn_data.drop('churn',axis=1)
target_data = churn_data.churn
trainX, testX, trainY, testY = train_test_split(feature_data,target_data)

In [102]:
grid_model.fit(trainX,trainY)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('simpleimputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median',...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'preprocessor__num__simpleimputer__strategy': ['mean', 'median'], 'selectkbest__k': [12, 13], 'classifier__n_estimators': [50, 70]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [103]:
grid_model.best_score_

0.7528

In [104]:
grid_model.best_params_

{'classifier__n_estimators': 70,
 'preprocessor__num__simpleimputer__strategy': 'mean',
 'selectkbest__k': 12}