In [46]:
import pandas as pd
df=pd.read_csv("churn_train.csv")
df.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct
0,6.94,5.0,5.0,1.0,Astapor,2014-05-03,Android,2014-01-12,0.0,0,False,100.0
1,8.06,5.0,5.0,1.0,Astapor,2014-01-26,Android,2014-01-25,0.0,2,True,0.0
2,21.5,4.0,,1.0,Winterfell,2014-05-21,iPhone,2014-01-02,0.0,1,True,100.0
3,9.46,5.0,,2.75,Winterfell,2014-01-10,Android,2014-01-09,100.0,1,False,100.0
4,13.77,5.0,,1.0,Winterfell,2014-05-13,iPhone,2014-01-31,0.0,0,False,100.0


In [47]:
#Let's store the June 1st in a variable churn_date. Dates in the column 'last_trip_date' that are greater than or equal to churn_date means
#customers were riding the service 30 days prior to July 1st when we actually pulled the data
churn_date=pd.to_datetime(({'year': [2014],'month': [6],'day': [1]}))

In [48]:
#So here 1 means customers have churned and 0 means they haven't
import numpy as np
df['churn'] = df['last_trip_date'].apply(lambda x: x >= churn_date)
df['churn'] = np.where(df['churn'] ==True, '0', '1')
df['churn']=df['churn'].astype(int)

In [49]:
#Let's impute the missing values in the column: avg_rating_of_driver by mode of the column

df['avg_rating_of_driver'].fillna(df['avg_rating_of_driver'].mode()[0], inplace=True)

In [50]:
#Let's impute the missing values in the column: avg_rating_of_driver by mode of the column. That way the values won't skew the
#distribution

df['avg_rating_by_driver'].fillna(df['avg_rating_by_driver'].mode()[0], inplace=True)

In [51]:
#Let's check the column: phone
df['phone'].value_counts(dropna=False)

iPhone     27628
Android    12053
NaN          319
Name: phone, dtype: int64

In [52]:
#Let's impute the null values using Android for now

df['phone'].fillna('Android', inplace=True)

In [53]:
#To convert the numerical data that are actually categorical nature
def labelencoding_categorical_ordinal(x):
    from sklearn.preprocessing import LabelEncoder
    for c in x:
        lbe = LabelEncoder() 
        lbe.fit(list(df[c].values)) 
        df[c] = lbe.transform(list(df[c].values))
labelencoding_categorical_ordinal(['last_trip_date', 'signup_date'])

In [54]:
#Converting the boolean values into 1 and 0

df['luxury_car_user'] = df['luxury_car_user'].apply(lambda x: 0 if x=='False' else 1)

In [55]:
df = pd.get_dummies(df)

In [56]:
from sklearn.externals import joblib
joblib.dump(df,'churn_train_dataset')

['churn_train_dataset']

In [103]:
from sklearn.externals import joblib
df=joblib.load('churn_train_dataset')

In [104]:
#Our target variable is imbalanced
df['churn'].value_counts()

1    24968
0    15032
Name: churn, dtype: int64

In [105]:
len(df)

40000

In [106]:
#Let's balance the imbalanced target variable using random undersampling 

# Class count
count_class_1, count_class_0 = df['churn'].value_counts()

# Divide by class
df_class_0 = df[df['churn'] == 0]
df_class_1 = df[df['churn'] == 1]

In [108]:
#Random under-sampling
df_class_1_under = df_class_1.sample(count_class_0)


df_under=pd.concat([df_class_0,df_class_1_under], ignore_index=True)



In [109]:
df_under.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,last_trip_date,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,churn,city_Astapor,city_King's Landing,city_Winterfell,phone_Android,phone_iPhone
0,5.42,4.7,5.0,1.0,178,4,0.0,2,1,33.3,0,1,0,0,0,1
1,4.28,5.0,4.0,1.15,178,13,20.0,2,1,40.0,0,0,0,1,1,0
2,11.44,4.6,4.7,1.0,159,24,0.0,6,1,31.4,0,1,0,0,0,1
3,2.49,4.9,4.7,1.11,178,8,14.3,2,1,78.6,0,0,1,0,1,0
4,1.45,4.8,3.6,1.2,164,24,28.6,2,1,57.1,0,1,0,0,0,1


In [110]:
len(df_under)

30064

In [111]:
#Splitting X and y values
y=df_under['churn'].values
df_under.drop('churn', inplace=True, axis=1)
X=df_under.values

In [112]:
def classifier_performance(model):
    from sklearn.model_selection import KFold, cross_val_score
    kf = KFold(10, shuffle=True, random_state=42)
#     roc_score=roc_auc_score(y_true, y_scores)
    log_loss_val=cross_val_score(model, X, y, cv=10, scoring='neg_log_loss')
    return(log_loss_val.mean(), model)

In [113]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

In [114]:
def make_pipeline(m):
    from sklearn.pipeline import make_pipeline
    res=[]
    for i in m:
        res.append(make_pipeline(StandardScaler(), i))
    return res

In [115]:
#GaussianNB(), RandomForestClassifier(), AdaBoostClassifier(), LogisticRegression()
for m in make_pipeline(m=[GaussianNB(), RandomForestClassifier(), AdaBoostClassifier(), LogisticRegression()]):
    print(classifier_performance(m))

(-0.2617090548491625, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('gaussiannb', GaussianNB(priors=None, var_smoothing=1e-09))]))




(-0.014341906578200638, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]))
(-9.992007221626413e-16, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))]))




(-0.026048526257822734, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]))


In [116]:
#KNeighborsClassifier(), MLPClassifier(), DecisionTreeClassifier()
for m1 in make_pipeline(m=[KNeighborsClassifier(), MLPClassifier(), DecisionTreeClassifier()]):
    print(classifier_performance(m1))

(-0.5066977449429948, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]))
(-0.007492608981366627, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlpclassifier', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
 ...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]))
(-9.992007221626413e-16, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight=None, criteri

In [117]:
#SVC
for m_svc in make_pipeline(m=[SVC(probability=True)]):
    print(classifier_performance(m_svc))



(-0.03138528554676363, Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]))
