In [1]:
from sklearn_pandas import DataFrameMapper

In [2]:
import pandas as pd

In [6]:
churn_data = pd.read_csv('data/churn.csv.txt', 
parse_dates=['last_trip_date','signup_date'])

In [7]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null datetime64[ns]
phone                     49604 non-null object
signup_date               50000 non-null datetime64[ns]
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
luxury_car_user           50000 non-null bool
weekday_pct               50000 non-null float64
dtypes: bool(1), datetime64[ns](2), float64(6), int64(1), object(2)
memory usage: 4.2+ MB


In [8]:
churn_data.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4


In [13]:
import datetime
date_cutoff = churn_data.last_trip_date.max() - datetime.timedelta(30,0,0)

In [14]:
date_cutoff

Timestamp('2014-06-01 00:00:00')

In [17]:
churn_data['churn'] = (churn_data.last_trip_date < date_cutoff).astype(int)

In [18]:
churn_data.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,churn
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2,0
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0,1
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0,1
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0,0
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4,1


In [19]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 13 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null datetime64[ns]
phone                     49604 non-null object
signup_date               50000 non-null datetime64[ns]
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
luxury_car_user           50000 non-null bool
weekday_pct               50000 non-null float64
churn                     50000 non-null int32
dtypes: bool(1), datetime64[ns](2), float64(6), int32(1), int64(1), object(2)
memory usage: 4.4+ MB


In [23]:
churn_data = churn_data.dropna()

In [24]:
from sklearn.preprocessing import LabelEncoder

In [65]:
mapper = DataFrameMapper([
    ('avg_dist',None),
    ('weekday_pct',None),
    ('city',LabelEncoder()),
    ('phone',LabelEncoder()),
    ('luxury_car_user',LabelEncoder())
])

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest

In [59]:
lr_pipe = Pipeline([
    ('featurize',mapper),
    ('selector',SelectKBest(k=4)),
    ('scale',StandardScaler()),
    ('clf', LogisticRegression())
])

In [60]:
X = churn_data.drop(['churn'],axis=1)

In [61]:
y = churn_data.churn

In [62]:
from sklearn.model_selection import train_test_split

In [63]:
trainX,testX,trainY,testY = train_test_split(X,y)

In [64]:
lr_pipe.fit(trainX,trainY)

Pipeline(memory=None,
     steps=[('featurize', DataFrameMapper(default=False, df_out=False,
        features=[('avg_dist', None), ('weekday_pct', None), ('city', LabelEncoder()), ('phone', LabelEncoder()), ('luxury_car_user', LabelEncoder())],
        input_df=False, sparse=False)), ('selector', SelectKBest(k=4, score_func=<...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [66]:
class Classifiers(object):
    
    def __init__(self,classifier_list):
        self.classifiers = classifier_list
        self.pipelines = []
        
    def create_pipelines(self,mapper):
        for classifier in self.classifiers:
            self.pipelines.append(Pipeline([
                ('featureize',mapper),
                ('scale',StandardScaler()),
                ('clf', classifier)
            ]))
    
    def train(self, X_train, y_train):
        for pipeline in self.pipelines:
            pipeline.fit(X_train,y_train)
            
    def find_accuracy_score(self, X_test, y_test):
        self.accuracies = []
        
        for pipeline in self.pipelines:
            self.accuracies.append(pipeline.score(X_test,y_test))

In [67]:

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [69]:
lr = LogisticRegression()
rf = RandomForestClassifier(n_estimators=5)
gb = GradientBoostingClassifier()

In [70]:
clf = Classifiers([lr,rf,gb])

In [72]:
clf.create_pipelines(mapper)

In [73]:
clf.train(trainX,trainY)

In [75]:
clf.find_accuracy_score(testX,testY)

In [76]:
clf.accuracies

[0.66483304381393549, 0.69899633275429451, 0.76066396448562057]