### Problem Statement
* About Data - Rental bike usage data
* We wanted to predict churn (customers leaving your product)

<hr>

In [1]:
import pandas as pd

In [6]:
churn_data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/churn.csv.txt', parse_dates=['last_trip_date','signup_date'])

In [7]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null datetime64[ns]
phone                     49604 non-null object
signup_date               50000 non-null datetime64[ns]
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
luxury_car_user           50000 non-null bool
weekday_pct               50000 non-null float64
dtypes: bool(1), datetime64[ns](2), float64(6), int64(1), object(2)
memory usage: 4.2+ MB


### Is there any churn column in data?
* Many times, target column is directly not available.
* It has to be derived from feature columns
* From the data, we need to identify the date on which the data was downloaded

In [9]:
churn_data.last_trip_date.max()

Timestamp('2014-07-01 00:00:00')

In [10]:
last_date = churn_data.last_trip_date.max()

In [11]:
import datetime

In [15]:
cutoff_date = last_date - datetime.timedelta(30)

* If an user didn't come after cutoff date he/she is considered as churn

In [16]:
churn_data['churn'] = churn_data.last_trip_date.map(lambda d: 'Not Churn' if d > cutoff_date else 'Churn')

In [17]:
churn_data.sample(5)

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,churn
34568,3.12,5.0,,1.0,Astapor,2014-05-18,iPhone,2014-01-26,0.0,1,False,20.0,Churn
35933,19.36,5.0,,1.0,King's Landing,2014-06-22,Android,2014-01-25,0.0,1,False,0.0,Not Churn
13357,1.91,4.6,5.0,1.05,Winterfell,2014-06-23,iPhone,2014-01-10,20.0,0,False,60.0,Not Churn
42151,3.47,5.0,5.0,1.0,Astapor,2014-02-01,iPhone,2014-01-30,0.0,2,True,50.0,Churn
1728,3.53,5.0,4.7,1.2,Astapor,2014-05-17,Android,2014-01-15,20.0,1,False,80.0,Churn


In [18]:
churn_data.dtypes

avg_dist                         float64
avg_rating_by_driver             float64
avg_rating_of_driver             float64
avg_surge                        float64
city                              object
last_trip_date            datetime64[ns]
phone                             object
signup_date               datetime64[ns]
surge_pct                        float64
trips_in_first_30_days             int64
luxury_car_user                     bool
weekday_pct                      float64
churn                             object
dtype: object

In [19]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 13 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null datetime64[ns]
phone                     49604 non-null object
signup_date               50000 non-null datetime64[ns]
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
luxury_car_user           50000 non-null bool
weekday_pct               50000 non-null float64
churn                     50000 non-null object
dtypes: bool(1), datetime64[ns](2), float64(6), int64(1), object(3)
memory usage: 4.6+ MB


In [58]:
float_churn_data = churn_data.select_dtypes(include=['float','bool']) 

In [59]:
float_churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
avg_dist                50000 non-null float64
avg_rating_by_driver    49799 non-null float64
avg_rating_of_driver    41878 non-null float64
avg_surge               50000 non-null float64
surge_pct               50000 non-null float64
luxury_car_user         50000 non-null bool
weekday_pct             50000 non-null float64
dtypes: bool(1), float64(6)
memory usage: 2.3 MB


In [24]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [27]:
float_pipeline = make_pipeline(SimpleImputer(strategy='median'),MinMaxScaler())

In [28]:
cat_churn_data = churn_data[['city','phone']]

In [29]:
cat_churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
city     50000 non-null object
phone    49604 non-null object
dtypes: object(2)
memory usage: 781.4+ KB


In [30]:
from sklearn.preprocessing import OrdinalEncoder

In [32]:
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OrdinalEncoder(), )

In [34]:
churn_data['subscription_days'] = churn_data.last_trip_date - churn_data.signup_date

* timedelta to days conversion

In [42]:
churn_data['subscription_days'] = churn_data.subscription_days.dt.days

In [43]:
int_churn_data = churn_data.select_dtypes(include=['int'])

In [46]:
int_churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
trips_in_first_30_days    50000 non-null int64
subscription_days         50000 non-null int64
dtypes: int64(2)
memory usage: 781.4 KB


In [47]:
int_pipeline = make_pipeline(MinMaxScaler())

In [48]:
from sklearn.compose import make_column_transformer

In [67]:
preprocessor = make_column_transformer(
    (int_pipeline, int_churn_data.columns),
    (cat_pipeline, cat_churn_data.columns),
    (float_pipeline, float_churn_data.columns),
)

In [68]:
churn_data.churn.value_counts()

Churn        31690
Not Churn    18310
Name: churn, dtype: int64

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [70]:
pipeline = make_pipeline(preprocessor,RandomForestClassifier())

In [71]:
from sklearn.model_selection import train_test_split

In [72]:
trainX, testX, trainY, testY = train_test_split(churn_data.drop(columns=['churn']), churn_data.churn)

In [73]:
pipeline.fit(trainX,trainY)



Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('minmaxscaler',
                                                                   MinMaxScaler(copy=True,
                                                                                feature_range=(0,
                                                                                               1)))],
                                                           verbose=False),
                                                  Index(['trips_in_first_30_days', 'subscription_days'], dtype='object')),
                                                

In [74]:
pipeline.score(testX, testY)

0.9544

In [80]:
from sklearn.metrics import f1_score,confusion_matrix

In [77]:
y_pred = pipeline.predict(testX)

In [79]:
y_pred

array(['Not Churn', 'Not Churn', 'Churn', ..., 'Not Churn', 'Not Churn',
       'Churn'], dtype=object)

In [81]:
confusion_matrix(y_pred=y_pred, y_true=testY)

array([[7611,  263],
       [ 307, 4319]])

In [83]:
from sklearn.model_selection import GridSearchCV

In [93]:
gs = GridSearchCV(pipeline, param_grid={'randomforestclassifier__n_estimators':[100,300]},cv=5, n_jobs=4)

In [94]:
gs.fit(trainX,trainY)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('minmaxscaler',
                                                                                          MinMaxScaler(copy=True,
                                                                                                       feature_range=(0,
        

In [95]:
gs.best_score_

0.9592266666666667

In [96]:
gs.best_params_

{'randomforestclassifier__n_estimators': 100}

In [97]:
pipeline = make_pipeline(preprocessor,GaussianNB())

In [98]:
pipeline.fit(trainX, trainY)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('minmaxscaler',
                                                                   MinMaxScaler(copy=True,
                                                                                feature_range=(0,
                                                                                               1)))],
                                                           verbose=False),
                                                  Index(['trips_in_first_30_days', 'subscription_days'], dtype='object')),
                                                

In [99]:
pipeline.score(testX,testY)

0.93552

In [100]:
for name,estimator in zip( ['RandomForest','GaussianNB', 'LogisticRegression'],[RandomForestClassifier(n_estimators=100), GaussianNB(), LogisticRegression()]):
    pipeline = make_pipeline(preprocessor,estimator)
    pipeline.fit(trainX,trainY)
    print(name, pipeline.score(testX,testY))

RandomForest 0.9592
GaussianNB 0.93552




LogisticRegression 0.96024


### Furthur Fine Tuning
* Feature Selection
* Balancing Data
* More Hyper-parameter Tuning
* Consider month column from date-time cols

In [101]:
help(SimpleImputer)

Help on class SimpleImputer in module sklearn.impute._base:

class SimpleImputer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  SimpleImputer(missing_values=nan, strategy='mean', fill_value=None, verbose=0, copy=True, add_indicator=False)
 |  
 |  Imputation transformer for completing missing values.
 |  
 |  Read more in the :ref:`User Guide <impute>`.
 |  
 |  Parameters
 |  ----------
 |  missing_values : number, string, np.nan (default) or None
 |      The placeholder for the missing values. All occurrences of
 |      `missing_values` will be imputed.
 |  
 |  strategy : string, optional (default="mean")
 |      The imputation strategy.
 |  
 |      - If "mean", then replace missing values using the mean along
 |        each column. Can only be used with numeric data.
 |      - If "median", then replace missing values using the median along
 |        each column. Can only be used with numeric data.
 |      - If "most_frequent", then replace missing using the most fr