In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import SMOTE

%matplotlib inline

In [2]:
df = pd.read_json('ultimate_data_challenge.json')
df.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,False,50.0,8.26,5.0
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9


# Feature Engineering

In [3]:
df.dtypes

city                       object
trips_in_first_30_days      int64
signup_date                object
avg_rating_of_driver      float64
avg_surge                 float64
last_trip_date             object
phone                      object
surge_pct                 float64
ultimate_black_user          bool
weekday_pct               float64
avg_dist                  float64
avg_rating_by_driver      float64
dtype: object

In [4]:
df['last_trip_date'] = pd.to_datetime(df['last_trip_date'])
df['signup_date']    = pd.to_datetime(df['signup_date'])

In [5]:
# Counting time difference
df['active_time_days'] = (df['last_trip_date'] - df['signup_date']).dt.days
df.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,active_time_days
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0,143
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,False,50.0,8.26,5.0,96
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0,1
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9,170
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9,47


In [6]:
# Checking NaNs
df.isna().any()

city                      False
trips_in_first_30_days    False
signup_date               False
avg_rating_of_driver       True
avg_surge                 False
last_trip_date            False
phone                      True
surge_pct                 False
ultimate_black_user       False
weekday_pct               False
avg_dist                  False
avg_rating_by_driver       True
active_time_days          False
dtype: bool

In [7]:
# Imputing NaNs
df.loc[df.isna().any(axis=1), 'avg_rating_of_driver'] = 0
df.loc[df.isna().any(axis=1), 'avg_rating_by_driver'] = 0

In [8]:
# One hot encoding
df['iPhone'] = pd.get_dummies(df['phone'])['iPhone']
df.loc[df.ultimate_black_user == True, 'ultimate_black_user'] = 1
df.loc[df.ultimate_black_user == False, 'ultimate_black_user'] = 0
df.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,active_time_days,iPhone
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,1,46.2,3.67,5.0,143,1
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,0,50.0,8.26,5.0,96,0
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,0,100.0,0.77,5.0,1,1
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,1,80.0,2.36,4.9,170,1
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,0,82.4,3.13,4.9,47,0


In [9]:
df.city.unique()

array(["King's Landing", 'Astapor', 'Winterfell'], dtype=object)

In [10]:
df_city = pd.get_dummies(df.city, prefix='city')
df = pd.concat([df, df_city[['city_Astapor','city_Winterfell']]], axis=1)
del df_city
df.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,active_time_days,iPhone,city_Astapor,city_Winterfell
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,1,46.2,3.67,5.0,143,1,0,0
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,0,50.0,8.26,5.0,96,0,1,0
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,0,100.0,0.77,5.0,1,1,1,0
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,1,80.0,2.36,4.9,170,1,0,0
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,0,82.4,3.13,4.9,47,0,0,1


In [11]:
df['retention_1month'] = 1
df.loc[df.trips_in_first_30_days == 0, 'retention_1month'] = 0
df.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,active_time_days,iPhone,city_Astapor,city_Winterfell,retention_1month
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,1,46.2,3.67,5.0,143,1,0,0,1
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,0,50.0,8.26,5.0,96,0,1,0,0
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,0,100.0,0.77,5.0,1,1,1,0,1
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,1,80.0,2.36,4.9,170,1,0,0,1
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,0,82.4,3.13,4.9,47,0,0,1,1


In [12]:
# Fraction of short-term retention
df['retention_1month'].sum()/len(df)

0.6922

In [13]:
df['retention_6month'] = 0
df.loc[df.active_time_days >= 180, 'retention_6month'] = 1
df.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,active_time_days,iPhone,city_Astapor,city_Winterfell,retention_1month,retention_6month
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,1,46.2,3.67,5.0,143,1,0,0,1,0
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,0,50.0,8.26,5.0,96,0,1,0,0,0
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,0,100.0,0.77,5.0,1,1,1,0,1,0
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,1,80.0,2.36,4.9,170,1,0,0,1,0
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,0,82.4,3.13,4.9,47,0,0,1,1,0


In [14]:
# Fraction of long-term retention
df['retention_6month'].sum()/len(df)

0.0017

In [15]:
# Drop un-needed columns
df = df.drop(['city', # already in one-hot-encoding
              'phone', # already in one-hot-encoding
              'signup_date', # shouldn't matter
              'last_trip_date', # shouldn't matter
              'retention_1month', # not needed
              'active_time_days'], # not needed
              axis=1)
df.head()

Unnamed: 0,trips_in_first_30_days,avg_rating_of_driver,avg_surge,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,iPhone,city_Astapor,city_Winterfell,retention_6month
0,4,4.7,1.1,15.4,1,46.2,3.67,5.0,1,0,0,0
1,0,5.0,1.0,0.0,0,50.0,8.26,5.0,0,1,0,0
2,3,4.3,1.0,0.0,0,100.0,0.77,5.0,1,1,0,0
3,9,4.6,1.14,20.0,1,80.0,2.36,4.9,1,0,0,0
4,14,4.4,1.19,11.8,0,82.4,3.13,4.9,0,0,1,0


# Sampling for rebalancing and training

In [16]:
Y = df['retention_6month']
X = df.drop(['retention_6month'], axis=1)
X, Y = SMOTE().fit_resample(X, Y)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=1)

In [18]:
len(y_train[y_train == 1])/len(y_train), len(y_train[y_train == 0])/len(y_train)

(0.5, 0.5)

In [19]:
len(y_test[y_test == 1])/len(y_test), len(y_test[y_test == 0])/len(y_test)

(0.5, 0.5)

# Modeling

In [20]:
rfc = RandomForestClassifier(random_state=1)
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

In [21]:
y_pred = rfc.predict(X_test)
accuracy_score(y_test, y_pred)

0.9977461684864269

In [22]:
importance = rfc.feature_importances_

In [23]:
for i in np.argsort(importance)[::-1]:
    print(X.columns[i], importance[i])

avg_rating_by_driver 0.15375384975966122
surge_pct 0.1480391995404454
avg_surge 0.1409666525314987
avg_rating_of_driver 0.12912646970417532
weekday_pct 0.08906336929711028
city_Astapor 0.08780215518885039
ultimate_black_user 0.0840732892704991
avg_dist 0.05843318933155278
trips_in_first_30_days 0.05083498547706006
city_Winterfell 0.046817616388890364
iPhone 0.011089223510256395


There are two important factors based on feature importance above:

1) Driver happiness (reflected from rating by and for drivers).

2) Finance (reflected from surge average and percentage).

I suggest to do the following to increase driver retention:

1) Giving bonus to drivers (increase financial benefit).

2) Improve algorithm to match drivers and passengers to minimize waiting time (hopefully better rating for driver).

3) Improve GPS system to minimize travel time and confusion on road (hopefully better rating for driver).

4) Supplies support for car cleaning (hopefully better rating for driver).

# Reason for using that model:

1) It is a classification problem, not linear regression.

2) The decision boundary is hardly smooth, hence not using logistic regression.

3) Need to be interpreted for business solution (e.g. using feature importance), so deep learning is not suitable.

4) Tree-based algorithm is best used for this situation.

5) Random-forest is most common tree-based algorithm and simple to implement.

6) The model is valid, given high accuracy above.

Alternative approach is LGBM model.

My concern is highly imbalance data, eventhough I did oversampling above, it would be better if we change criteria for adopted users or adding more data, so that the training would be more balanced.