In [151]:
import pandas as pd
import matplotlib as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [152]:
users = pd.read_json('../ultimate_data_challenge.json', orient='values')

In [153]:
users.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,False,50.0,8.26,5.0
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9


In [154]:
users.describe()

Unnamed: 0,trips_in_first_30_days,avg_rating_of_driver,avg_surge,surge_pct,weekday_pct,avg_dist,avg_rating_by_driver
count,50000.0,41878.0,50000.0,50000.0,50000.0,50000.0,49799.0
mean,2.2782,4.601559,1.074764,8.849536,60.926084,5.796827,4.778158
std,3.792684,0.617338,0.222336,19.958811,37.081503,5.707357,0.446652
min,0.0,1.0,1.0,0.0,0.0,0.0,1.0
25%,0.0,4.3,1.0,0.0,33.3,2.42,4.7
50%,1.0,4.9,1.0,0.0,66.7,3.88,5.0
75%,3.0,5.0,1.05,8.6,100.0,6.94,5.0
max,125.0,5.0,8.0,100.0,100.0,160.96,5.0


In [155]:
users.signup_date.value_counts()

2014-01-18    2948
2014-01-25    2885
2014-01-11    2402
2014-01-24    2284
2014-01-17    2149
2014-01-31    2100
2014-01-19    2028
2014-01-10    2021
2014-01-06    1763
2014-01-01    1737
2014-01-26    1708
2014-01-23    1606
2014-01-07    1486
2014-01-04    1485
2014-01-30    1471
2014-01-09    1433
2014-01-16    1431
2014-01-22    1369
2014-01-05    1343
2014-01-12    1334
2014-01-20    1295
2014-01-28    1284
2014-01-08    1275
2014-01-27    1236
2014-01-21    1234
2014-01-03    1213
2014-01-29    1197
2014-01-14    1120
2014-01-15    1110
2014-01-13    1049
2014-01-02    1004
Name: signup_date, dtype: int64

In [156]:
users.last_trip_date.value_counts()

2014-06-29    2036
2014-06-28    1679
2014-06-30    1408
2014-06-27    1120
2014-06-22    1024
              ... 
2014-02-26      79
2014-02-19      79
2014-02-20      74
2014-03-04      69
2014-03-10      69
Name: last_trip_date, Length: 182, dtype: int64

In [157]:
users.loc[pd.to_datetime(users['last_trip_date']).dt.month == 6, 'active_user'] = True
users.loc[pd.to_datetime(users['last_trip_date']).dt.month != 6, 'active_user'] = False

In [158]:
users.active_user.value_counts()

False    31744
True     18256
Name: active_user, dtype: int64

In [159]:
users.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,active_user
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0,True
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,False,50.0,8.26,5.0,False
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0,False
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9,True
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9,False


In [160]:
features = users

In [161]:
labels = features['active_user']

In [162]:
features.drop('active_user', axis=1, inplace=True)

In [163]:
city_one_hot = pd.get_dummies(features.city, prefix='city')

In [164]:
for city in city_one_hot:
    features[city] = city_one_hot[city]

features.drop('city', axis=1, inplace=True)

In [165]:
features.columns

Index(['trips_in_first_30_days', 'signup_date', 'avg_rating_of_driver',
       'avg_surge', 'last_trip_date', 'phone', 'surge_pct',
       'ultimate_black_user', 'weekday_pct', 'avg_dist',
       'avg_rating_by_driver', 'city_Astapor', 'city_King's Landing',
       'city_Winterfell'],
      dtype='object')

In [166]:
phone_one_hot = pd.get_dummies(features.phone, prefix='phone')

In [167]:
for phone in phone_one_hot:
    features[phone] = phone_one_hot[phone]

features.drop('phone', axis=1, inplace=True)

In [168]:
features.head()

Unnamed: 0,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,city_Astapor,city_King's Landing,city_Winterfell,phone_Android,phone_iPhone
0,4,2014-01-25,4.7,1.1,2014-06-17,15.4,True,46.2,3.67,5.0,0,1,0,0,1
1,0,2014-01-29,5.0,1.0,2014-05-05,0.0,False,50.0,8.26,5.0,1,0,0,1,0
2,3,2014-01-06,4.3,1.0,2014-01-07,0.0,False,100.0,0.77,5.0,1,0,0,0,1
3,9,2014-01-10,4.6,1.14,2014-06-29,20.0,True,80.0,2.36,4.9,0,1,0,0,1
4,14,2014-01-27,4.4,1.19,2014-03-15,11.8,False,82.4,3.13,4.9,0,0,1,1,0


In [169]:
features['signup_year'] = pd.to_datetime(features['signup_date']).dt.year
features['signup_month'] = pd.to_datetime(features['signup_date']).dt.month
features['signup_day'] = pd.to_datetime(features['signup_date']).dt.day

features['last_trip_year'] = pd.to_datetime(features['last_trip_date']).dt.year
features['last_trip_month'] = pd.to_datetime(features['last_trip_date']).dt.month
features['last_trip_day'] = pd.to_datetime(features['last_trip_date']).dt.day

In [170]:
features.drop('signup_date', axis=1, inplace=True)
features.drop('last_trip_date', axis=1, inplace=True)

In [171]:
features.loc[features['avg_rating_of_driver'].isna() == True, 'avg_rating_of_driver'] = features['avg_rating_of_driver'].mean()
features.loc[features['avg_rating_by_driver'].isna() == True, 'avg_rating_by_driver'] = features['avg_rating_by_driver'].mean()

In [172]:
for column in features.columns:
    print(features[column].isna().value_counts())

False    50000
Name: trips_in_first_30_days, dtype: int64
False    50000
Name: avg_rating_of_driver, dtype: int64
False    50000
Name: avg_surge, dtype: int64
False    50000
Name: surge_pct, dtype: int64
False    50000
Name: ultimate_black_user, dtype: int64
False    50000
Name: weekday_pct, dtype: int64
False    50000
Name: avg_dist, dtype: int64
False    50000
Name: avg_rating_by_driver, dtype: int64
False    50000
Name: city_Astapor, dtype: int64
False    50000
Name: city_King's Landing, dtype: int64
False    50000
Name: city_Winterfell, dtype: int64
False    50000
Name: phone_Android, dtype: int64
False    50000
Name: phone_iPhone, dtype: int64
False    50000
Name: signup_year, dtype: int64
False    50000
Name: signup_month, dtype: int64
False    50000
Name: signup_day, dtype: int64
False    50000
Name: last_trip_year, dtype: int64
False    50000
Name: last_trip_month, dtype: int64
False    50000
Name: last_trip_day, dtype: int64


In [173]:
labels=labels.astype('int')

In [174]:
features.drop('last_trip_year', axis=1, inplace=True)
features.drop('last_trip_month', axis=1, inplace=True)
features.drop('last_trip_day', axis=1, inplace=True)

In [175]:
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.33, random_state=42)

In [176]:
# Create the model with 100 trees
bin_model = RandomForestClassifier(n_estimators=100,
                                   oob_score=True,
                                   n_jobs=-1,
                                   random_state=50,
                                   max_features="auto",
                                   verbose=True)

In [177]:
# Fit on training data
bin_model.fit(X_train, y_train)
y_pred = bin_model.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


In [178]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[8790 1676]
 [2255 3779]]
              precision    recall  f1-score   support

           0       0.80      0.84      0.82     10466
           1       0.69      0.63      0.66      6034

    accuracy                           0.76     16500
   macro avg       0.74      0.73      0.74     16500
weighted avg       0.76      0.76      0.76     16500

0.7617575757575757


In [179]:
features = features.columns
importances = bin_model.feature_importances_

In [180]:
for i in range(0, len(features)):
    print(features[i] + ': ' + str(importances[i]))

trips_in_first_30_days: 0.07360690168998715
avg_rating_of_driver: 0.07513542629115895
avg_surge: 0.055339202966817595
surge_pct: 0.07966985586129775
ultimate_black_user: 0.031001387906955856
weekday_pct: 0.12050511049590405
avg_dist: 0.21002695474913585
avg_rating_by_driver: 0.10538092141653654
city_Astapor: 0.01468418250420218
city_King's Landing: 0.047764232620745466
city_Winterfell: 0.009647884301374837
phone_Android: 0.019885894604191434
phone_iPhone: 0.01825876434965492
signup_year: 0.0
signup_month: 0.0
signup_day: 0.1390932802420375
