In [1]:
import pandas as pd

In [8]:
# Load the rider dataset; the two date columns are parsed into datetime values.
churn_data = pd.read_csv(
    'data/churn.csv.txt',
    parse_dates=['last_trip_date', 'signup_date'],
)

In [9]:
# Preview the first rows to sanity-check the parsed columns.
churn_data.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4


In [10]:
# Inspect dtypes and non-null counts per column to spot missing values.
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null datetime64[ns]
phone                     49604 non-null object
signup_date               50000 non-null datetime64[ns]
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
luxury_car_user           50000 non-null bool
weekday_pct               50000 non-null float64
dtypes: bool(1), datetime64[ns](2), float64(6), int64(1), object(2)
memory usage: 4.2+ MB


### Find cutoff date

In [12]:
import datetime
# The cutoff sits 30 days before the most recent trip date found in the data.
cutoff = churn_data['last_trip_date'].max() - datetime.timedelta(days=30)

In [13]:
# Display the computed cutoff timestamp.
cutoff

Timestamp('2014-06-01 00:00:00')

In [15]:
# Label a rider 1 (churned) when the last trip happened before the cutoff, else 0.
churn_data['churn'] = churn_data['last_trip_date'].lt(cutoff).astype(int)

In [16]:
# Confirm the new `churn` column appears alongside the original columns.
churn_data.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,churn
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2,0
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0,1
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0,1
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0,0
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4,1


In [17]:
# Re-check dtypes and non-null counts now that `churn` has been added.
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 13 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null datetime64[ns]
phone                     49604 non-null object
signup_date               50000 non-null datetime64[ns]
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
luxury_car_user           50000 non-null bool
weekday_pct               50000 non-null float64
churn                     50000 non-null int32
dtypes: bool(1), datetime64[ns](2), float64(6), int32(1), int64(1), object(2)
memory usage: 4.4+ MB


In [18]:
# Everything except the label column is a candidate feature.
feature_data = churn_data.drop(labels=['churn'], axis=1)

In [19]:
# The 0/1 churn label is the prediction target.
target_data = churn_data['churn']

In [21]:
# List the feature column names (the label column should be absent).
feature_data.columns

Index(['avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver', 'avg_surge',
       'city', 'last_trip_date', 'phone', 'signup_date', 'surge_pct',
       'trips_in_first_30_days', 'luxury_car_user', 'weekday_pct'],
      dtype='object')

In [23]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [26]:
# Standardise the seven numeric columns together, then integer-encode each
# categorical/boolean column with its own LabelEncoder instance.
mapper = DataFrameMapper(
    [
        (
            [
                'avg_dist',
                'avg_rating_by_driver',
                'avg_rating_of_driver',
                'avg_surge',
                'surge_pct',
                'trips_in_first_30_days',
                'weekday_pct',
            ],
            StandardScaler(),
        ),
    ]
    + [(column, LabelEncoder()) for column in ('city', 'phone', 'luxury_car_user')]
)

In [27]:
# Show the mapper's configuration (column selectors and their transformers).
mapper

DataFrameMapper(default=False, df_out=False,
        features=[(['avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver', 'avg_surge', 'surge_pct', 'trips_in_first_30_days', 'weekday_pct'], StandardScaler(copy=True, with_mean=True, with_std=True)), ('city', LabelEncoder()), ('phone', LabelEncoder()), ('luxury_car_user', LabelEncoder())],
        input_df=False, sparse=False)

In [29]:
# Discard every row that has a missing value in any column.
churn_data = churn_data.dropna(axis=0, how='any')

In [31]:
# Fit the mapper on the cleaned frame and check the shape of the encoded matrix
# (rows kept after dropna x 7 scaled + 3 encoded columns).
mapper.fit_transform(churn_data).shape

(41445, 10)

### Pipeline Creation

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier

In [70]:
# Preprocess with the mapper, keep the 6 best-scoring features, then classify
# with a 20-tree random forest.
pipe = Pipeline(steps=[
    ('mapper', mapper),
    ('selector', SelectKBest(k=6)),
    ('clf', RandomForestClassifier(n_estimators=20)),
])

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
# Rebuild features and target from the NaN-free frame so both line up with it row for row.
feature_data = churn_data.drop('churn',axis=1)
target_data = churn_data.churn
# Seed the shuffle so the train/test partition is the same on every
# Restart & Run All; without it each run evaluates on different rows.
trainX,testX,trainY,testY = train_test_split(feature_data,target_data,random_state=42)

In [50]:
# Fit every pipeline step on the training split only.
pipe.fit(trainX,trainY)

Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[(['avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver', 'avg_surge', 'surge_pct', 'trips_in_first_30_days', 'weekday_pct'], StandardScaler(copy=True, with_mean=True, with_std=True)), ('city', LabelEncoder()), ('ph...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [51]:
# Score the fitted pipeline on the held-out split.
pipe.score(testX,testY)

0.73759891912758158

In [63]:
class ChurnClassifiers(object):
    """Fit and score several classifiers behind the same preprocessing step.

    Parameters
    ----------
    classifiers_list : iterable of (name, estimator) pairs
        Each estimator becomes the final ``'clf'`` step of its own pipeline.

    Attributes
    ----------
    classifiers : the iterable passed to the constructor.
    pipelines : list of (name, Pipeline) pairs built by ``create_pipelines``.
    scores_ : dict mapping name -> score from the most recent ``score`` call.
    """
    def __init__(self,classifiers_list):
        self.classifiers = classifiers_list
        self.pipelines = []
        self.scores_ = {}

    def create_pipelines(self,mapper):
        """Build one ``mapper -> clf`` pipeline per classifier.

        Any previously built pipelines are replaced rather than appended to,
        so calling this method again (e.g. re-running a notebook cell) does
        not leave duplicate entries behind.
        """
        from copy import deepcopy

        # Each pipeline gets its own copy of the mapper: sharing one mutable
        # transformer would let fitting one pipeline overwrite the fitted
        # state used by all the others.
        self.pipelines = [
            (name, Pipeline([
                ('mapper', deepcopy(mapper)),
                ('clf', clf)
            ]))
            for name, clf in self.classifiers
        ]

    def fit(self,trainX,trainY):
        """Fit every pipeline on the same training data."""
        for _, pipe in self.pipelines:
            pipe.fit(trainX, trainY)

    def score(self,testX,testY):
        """Print ``name score`` for every pipeline and keep the values.

        The results are also stored in ``self.scores_`` so they can be
        compared or plotted later instead of being read off the printout.
        """
        self.scores_ = {}
        for name, pipe in self.pipelines:
            value = pipe.score(testX, testY)
            self.scores_[name] = value
            print(name, value)

In [64]:
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [65]:
# (name, estimator) pairs to compare; each ensemble is limited to 10 estimators.
classifiers = [
    ('lr', LogisticRegression()),
    ('rf', RandomForestClassifier(n_estimators=10)),
    ('ab', AdaBoostClassifier(n_estimators=10)),
    ('gbt', GradientBoostingClassifier(n_estimators=10)),
]

In [66]:
# Wrap the candidate classifiers so they can be fitted and scored together.
models = ChurnClassifiers(classifiers)

In [67]:
# Put the preprocessing mapper in front of every candidate classifier.
models.create_pipelines(mapper)

In [68]:
# Fit all candidate pipelines on the training split.
models.fit(trainX,trainY)

In [69]:
# Print each candidate's score on the held-out split for a side-by-side comparison.
models.score(testX,testY)

lr 0.673808145146
rf 0.730457440649
ab 0.747539085119
gbt 0.726404169079


#### GridSearch on Pipeline

In [71]:
# Same three-stage pipeline, but with library defaults for the selector and the
# forest: the grid search below supplies `k` and `n_estimators`.
pipe = Pipeline(steps=[
    ('mapper', mapper),
    ('selector', SelectKBest()),
    ('clf', RandomForestClassifier()),
])

In [72]:
from sklearn.model_selection import GridSearchCV

In [75]:
# Search space, keyed as <step name>__<parameter> so each value reaches the right pipeline step.
params = dict(
    selector__k=[4, 6, 8],
    clf__n_estimators=[10, 20],
)

In [76]:
# Exhaustively evaluate every parameter combination with 5-fold cross-validation.
grid = GridSearchCV(estimator=pipe, param_grid=params, cv=5)

In [77]:
# Run the search on the training split only; the test split stays untouched.
grid.fit(trainX,trainY)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[(['avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver', 'avg_surge', 'surge_pct', 'trips_in_first_30_days', 'weekday_pct'], StandardScaler(copy=True, with_mean=True, with_std=True)), ('city', LabelEncoder()), ('ph...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__n_estimators': [10, 20], 'selector__k': [4, 6, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [78]:
# Best score found by the grid search.
grid.best_score_

0.73213653765724029

In [79]:
# Parameter combination that achieved the best score.
grid.best_params_

{'clf__n_estimators': 20, 'selector__k': 8}