In [1]:
import pandas as pd

In [2]:
churn_data = pd.read_csv('churn.csv.txt', parse_dates=['last_trip_date','signup_date'])

In [3]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null datetime64[ns]
phone                     49604 non-null object
signup_date               50000 non-null datetime64[ns]
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
luxury_car_user           50000 non-null bool
weekday_pct               50000 non-null float64
dtypes: bool(1), datetime64[ns](2), float64(6), int64(1), object(2)
memory usage: 4.2+ MB


In [4]:
churn_data.last_trip_date.max()

Timestamp('2014-07-01 00:00:00')

In [5]:
import datetime

In [6]:
cutoff = churn_data.last_trip_date.max() - datetime.timedelta(30,0,0)

In [7]:
cutoff

Timestamp('2014-06-01 00:00:00')

In [8]:
churn_data['churn'] = (churn_data.last_trip_date < cutoff).astype(int)

In [9]:
churn_data.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,churn
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2,0
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0,1
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0,1
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0,0
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4,1


In [10]:
churn_data.dropna(inplace=True)

In [11]:
feature_data = churn_data.drop('churn',axis=1)

In [12]:
target_data = churn_data.churn

In [13]:
feature_data.columns

Index(['avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver', 'avg_surge',
       'city', 'last_trip_date', 'phone', 'signup_date', 'surge_pct',
       'trips_in_first_30_days', 'luxury_car_user', 'weekday_pct'],
      dtype='object')

In [14]:
feature_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41445 entries, 0 to 49998
Data columns (total 12 columns):
avg_dist                  41445 non-null float64
avg_rating_by_driver      41445 non-null float64
avg_rating_of_driver      41445 non-null float64
avg_surge                 41445 non-null float64
city                      41445 non-null object
last_trip_date            41445 non-null datetime64[ns]
phone                     41445 non-null object
signup_date               41445 non-null datetime64[ns]
surge_pct                 41445 non-null float64
trips_in_first_30_days    41445 non-null int64
luxury_car_user           41445 non-null bool
weekday_pct               41445 non-null float64
dtypes: bool(1), datetime64[ns](2), float64(6), int64(1), object(2)
memory usage: 3.8+ MB


In [15]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [16]:
mapper = DataFrameMapper([
    (['avg_dist','avg_rating_by_driver','avg_rating_of_driver',
      'avg_surge','surge_pct','trips_in_first_30_days','weekday_pct'], StandardScaler()),
    ('city',LabelEncoder()),
    ('luxury_car_user',LabelEncoder())
])

In [17]:
mapper

DataFrameMapper(default=False, df_out=False,
        features=[(['avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver', 'avg_surge', 'surge_pct', 'trips_in_first_30_days', 'weekday_pct'], StandardScaler(copy=True, with_mean=True, with_std=True)), ('city', LabelEncoder()), ('luxury_car_user', LabelEncoder())],
        input_df=False, sparse=False)

In [18]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41445 entries, 0 to 49998
Data columns (total 13 columns):
avg_dist                  41445 non-null float64
avg_rating_by_driver      41445 non-null float64
avg_rating_of_driver      41445 non-null float64
avg_surge                 41445 non-null float64
city                      41445 non-null object
last_trip_date            41445 non-null datetime64[ns]
phone                     41445 non-null object
signup_date               41445 non-null datetime64[ns]
surge_pct                 41445 non-null float64
trips_in_first_30_days    41445 non-null int64
luxury_car_user           41445 non-null bool
weekday_pct               41445 non-null float64
churn                     41445 non-null int32
dtypes: bool(1), datetime64[ns](2), float64(6), int32(1), int64(1), object(2)
memory usage: 4.0+ MB


In [19]:
mapper.fit_transform(churn_data).shape

(41445, 9)

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest

In [21]:
class ChurnClassifier(object):
    def __init__(self,clsf_list):
        self.clsf = clsf_list
        self.pipelines = []
        
    def create_pipelines(self,mapper,selector):
        for name,est in self.clsf:
            pipeline = Pipeline([
                ('mapper',mapper),
                ('selector',selector),
                ('est',est)
            ])
            self.pipelines.append((name,pipeline))
            
    def fit(self,trainX,trainY):
        for name,pipeline in self.pipelines:
            print ('Training ',name)
            pipeline.fit(trainX,trainY)
        
    def score(self,testX,testY):
        for name,pipeline in self.pipelines:
            s = pipeline.score(testX,testY)
            print ('Mode ',name,' -',s,'%')

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [23]:
clsf = ChurnClassifier([('lr',LogisticRegression()),('dt',DecisionTreeClassifier()),('rf',RandomForestClassifier(n_estimators=10))])

In [24]:
clsf.create_pipelines(mapper,SelectKBest(k=9))

In [25]:
clsf.pipelines[1]

('dt', Pipeline(memory=None,
      steps=[('mapper', DataFrameMapper(default=False, df_out=False,
         features=[(['avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver', 'avg_surge', 'surge_pct', 'trips_in_first_30_days', 'weekday_pct'], StandardScaler(copy=True, with_mean=True, with_std=True)), ('city', LabelEncoder()), ('lu...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'))]))

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
trainX,testX, trainY, testY = train_test_split(feature_data, target_data)

In [28]:
clsf.fit(trainX,trainY)

Training  lr
Training  dt
Training  rf


In [29]:
clsf.score(testX,testY)

Mode  lr  - 0.6455317506272921 %
Mode  dt  - 0.6762208067940552 %
Mode  rf  - 0.7199382358618027 %


In [30]:
from sklearn.externals import joblib

In [31]:
joblib.dump(clsf, 'filename.pkl')

['filename.pkl']

In [None]:
trainX.to