In [1]:
import pandas as pd
import numpy as np

In [2]:
tu = pd.read_csv('takehome_users.csv', encoding='latin')
tue = pd.read_csv('takehome_user_engagement.csv')

In [3]:
tu.info()
display(tu.shape)
display(tu.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


(12000, 10)

Unnamed: 0,object_id,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
count,12000.0,8823.0,12000.0,12000.0,12000.0,6417.0
mean,6000.5,1379279000.0,0.2495,0.149333,141.884583,5962.957145
std,3464.24595,19531160.0,0.432742,0.356432,124.056723,3383.761968
min,1.0,1338452000.0,0.0,0.0,0.0,3.0
25%,3000.75,1363195000.0,0.0,0.0,29.0,3058.0
50%,6000.5,1382888000.0,0.0,0.0,108.0,5954.0
75%,9000.25,1398443000.0,0.0,0.0,238.25,8817.0
max,12000.0,1402067000.0,1.0,1.0,416.0,11999.0


We see that all columns are non-null except for last_session_creation_time and invited_by_user_id. We won't fill in those missing values yet.


In [4]:
tu.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [5]:
tu['creation_time'] = pd.to_datetime(tu['creation_time'])

In [6]:
tu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(4), object(3)
memory usage: 937.6+ KB


In [7]:
tue.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [8]:
tue.shape

(207917, 3)

In [9]:
tue.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


No nulls.

In [10]:
tue.describe()

Unnamed: 0,user_id,visited
count,207917.0,207917.0
mean,5913.314197,1.0
std,3394.941674,0.0
min,1.0,1.0
25%,3087.0,1.0
50%,5682.0,1.0
75%,8944.0,1.0
max,12000.0,1.0


Every value in the visited column is 1. user_id seems to cover the range of users.

So the array already is sorted by user_id. 

Since we are looked for users who have logged into the product on three separate days in at least one seven-day period, we will assume that we want the users login times spread out over at least 1 week. That is, the first login time and last login time must be minimum 1 week apart.

In [11]:
tue['time_stamp'] = pd.to_datetime(tue['time_stamp'])
tue.sort_values(by=['user_id', 'time_stamp'], inplace=True)

In [12]:
visit_counts = tue.groupby(by=['user_id']).count()[['visited']]

user_id = tu[['object_id']].copy()
user_id.set_index('object_id', inplace=True)

visit_counts = user_id.join(visit_counts, how='left').fillna(0)

In [13]:
visit_counts['adopted'] = np.nan
visit_counts.loc[visit_counts.visited < 3, 'adopted'] = 0

In [14]:
visit_counts

Unnamed: 0_level_0,visited,adopted
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,0.0
2,14.0,
3,1.0,0.0
4,1.0,0.0
5,1.0,0.0
6,1.0,0.0
7,1.0,0.0
8,0.0,0.0
9,0.0,0.0
10,284.0,


In [15]:
three_or_more_ids = visit_counts[visit_counts.adopted.isnull()].index.values.tolist()
three_or_more_rows = tue[tue.user_id.isin(three_or_more_ids)].copy()

def get_date_range(series):
    return (max(series) - min(series))

In [16]:
#three_or_more_rows.groupby('user_id')['time_stamp'].transform(get_date_range) #does it for every row
time_span = three_or_more_rows.groupby('user_id')['time_stamp'].apply(get_date_range) #aggregates for us

In [17]:
time_span.describe()

count                        2248
mean     226 days 04:09:10.889679
std      209 days 01:43:24.737490
min               3 days 00:00:00
25%              51 days 00:00:00
50%             139 days 00:00:00
75%             363 days 00:00:00
max             727 days 00:00:00
Name: time_stamp, dtype: object

We see that the min is 3 days. So some of our users who used the product 3 or more times did so only over 3 days. These users must be excluded from our "adopted users".

In [18]:
users_over_7_days = time_span[time_span >= '7 days'].index.tolist()

In [19]:
import datetime
three_or_more_rows['time_stamp'] = three_or_more_rows['time_stamp'].apply(lambda dt: datetime.datetime(dt.year, dt.month, dt.day))
three_or_more_rows

Unnamed: 0,time_stamp,user_id,visited
1,2013-11-15,2,1
2,2013-11-29,2,1
3,2013-12-09,2,1
4,2013-12-25,2,1
5,2013-12-31,2,1
6,2014-01-08,2,1
7,2014-02-03,2,1
8,2014-02-08,2,1
9,2014-02-09,2,1
10,2014-02-13,2,1


In [20]:
min(three_or_more_rows.groupby('user_id')['time_stamp'].nunique().sort_values())

3

So every user with three or more rows logged in on at least 3 separate days.

So we now know all users who have logged into the product on three separate days in at least one seven-day period. They are the users with ids in users_over_seven_days.

In [21]:
visit_counts.loc[visit_counts.index.isin(users_over_7_days), 'adopted'] = 1

In [22]:
visit_counts[visit_counts.adopted.isnull()]

Unnamed: 0_level_0,visited,adopted
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3059,3.0,
3222,3.0,
3638,3.0,
4093,3.0,
4396,3.0,
5970,3.0,
6981,3.0,
10277,3.0,


In [23]:
time_span[time_span < '7 days'].index.tolist()

[3059, 3222, 3638, 4093, 4396, 5970, 6981, 10277]

Same people. We can fill their 'adopted' column with 0.

In [24]:
visit_counts.loc[visit_counts.index.isin(time_span[time_span < '7 days'].index.tolist()), 'adopted'] = 0

In [25]:
visit_counts.adopted.value_counts()

0.0    9760
1.0    2240
Name: adopted, dtype: int64

In [26]:
visit_counts.head()

Unnamed: 0_level_0,visited,adopted
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,0.0
2,14.0,1.0
3,1.0,0.0
4,1.0,0.0
5,1.0,0.0


In [27]:
tu.set_index('object_id', inplace=True)
tu.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [28]:
df = tu.join(visit_counts)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 1 to 12000
Data columns (total 11 columns):
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
visited                       12000 non-null float64
adopted                       12000 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(3), object(3)
memory usage: 1.4+ MB


As mentioned earlier, we see last_session_creating_time and invited_by_user_id contain nulls. last_session_creating time is a unix timestamp of last login, while invited_by_user_id is the id of the user who invited them to join. For this latter column, it doesn't make sense to fill in values with the mean or median of the column. Instead, it's best to just create a category 'missing' and fill the nulls in with it.



In [30]:
df.last_session_creation_time.describe()

count    8.823000e+03
mean     1.379279e+09
std      1.953116e+07
min      1.338452e+09
25%      1.363195e+09
50%      1.382888e+09
75%      1.398443e+09
max      1.402067e+09
Name: last_session_creation_time, dtype: float64

For last_session_creation_time, the meaning for the nulls is that these users never created a session or never logged on. For these users, we will replace the null values with 0 to indicate that the user logged in a very long time ago.

In [31]:
df.loc[df.last_session_creation_time.isnull(), 'last_session_creation_time'] = 0

In [32]:
df.describe()['adopted']

count    12000.000000
mean         0.186667
std          0.389660
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: adopted, dtype: float64

We see that about 19% of the users became adopted users. So our dataset has a roughly 80-20 imbalance.

Let's build a model. We'll use a Random Forest because it generally performs well in classification tasks, and provides good interpretability of how it makes decisions, i.e. it tells us what features are important. This is what we want. 

We'll drop the visited column, because the adopted column, our target column, was partially created from it, so using it would be cheating. We can also drop name and email since these should be unique to every user and even if they aren't, it doesn't make sense to say that your email or name should be a factor in whether or not you are an adopted customer. 

In [33]:
df.drop(labels=['name', 'email', 'visited'], axis=1, inplace=True)

In [34]:
df.head()

Unnamed: 0_level_0,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2014-04-22 03:53:30,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0.0
2,2013-11-15 03:45:04,ORG_INVITE,1396238000.0,0,0,1,316.0,1.0
3,2013-03-19 23:14:52,ORG_INVITE,1363735000.0,0,0,94,1525.0,0.0
4,2013-05-21 08:09:28,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0.0
5,2013-01-17 10:14:20,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0.0


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 1 to 12000
Data columns (total 8 columns):
creation_time                 12000 non-null datetime64[ns]
creation_source               12000 non-null object
last_session_creation_time    12000 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
adopted                       12000 non-null float64
dtypes: datetime64[ns](1), float64(3), int64(3), object(1)
memory usage: 1.1+ MB


In [36]:
df.creation_time.describe()

count                   12000
unique                  11996
top       2014-02-11 17:57:53
freq                        2
first     2012-05-31 00:43:27
last      2014-05-30 23:59:19
Name: creation_time, dtype: object

In [37]:
first_date = min(df.creation_time)

In [38]:
new_creating_time = df.creation_time - first_date

In [39]:
df.creation_time = new_creating_time.dt.total_seconds() #change datetime to seconds since the first time

In [40]:
df.head()

Unnamed: 0_level_0,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,59713803.0,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0.0
2,46062097.0,ORG_INVITE,1396238000.0,0,0,1,316.0,1.0
3,25309885.0,ORG_INVITE,1363735000.0,0,0,94,1525.0,0.0
4,30698761.0,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0.0
5,19992653.0,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0.0


In [41]:
df.creation_source.value_counts()

ORG_INVITE            4254
GUEST_INVITE          2163
PERSONAL_PROJECTS     2111
SIGNUP                2087
SIGNUP_GOOGLE_AUTH    1385
Name: creation_source, dtype: int64

Since creation_source is a categorical feature, we need to one-hot-encode it.

In [42]:
df.creation_source[1]

'GUEST_INVITE'

In [43]:
df = pd.get_dummies(df, columns=['creation_source'])

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 1 to 12000
Data columns (total 12 columns):
creation_time                         12000 non-null float64
last_session_creation_time            12000 non-null float64
opted_in_to_mailing_list              12000 non-null int64
enabled_for_marketing_drip            12000 non-null int64
org_id                                12000 non-null int64
invited_by_user_id                    6417 non-null float64
adopted                               12000 non-null float64
creation_source_GUEST_INVITE          12000 non-null uint8
creation_source_ORG_INVITE            12000 non-null uint8
creation_source_PERSONAL_PROJECTS     12000 non-null uint8
creation_source_SIGNUP                12000 non-null uint8
creation_source_SIGNUP_GOOGLE_AUTH    12000 non-null uint8
dtypes: float64(4), int64(3), uint8(5)
memory usage: 1.1 MB


In [45]:
df.head()

Unnamed: 0_level_0,creation_time,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,59713803.0,1398139000.0,1,0,11,10803.0,0.0,1,0,0,0,0
2,46062097.0,1396238000.0,0,0,1,316.0,1.0,0,1,0,0,0
3,25309885.0,1363735000.0,0,0,94,1525.0,0.0,0,1,0,0,0
4,30698761.0,1369210000.0,0,0,1,5151.0,0.0,1,0,0,0,0
5,19992653.0,1358850000.0,0,0,193,5240.0,0.0,1,0,0,0,0


The next column is last_session_creation_time, but we have already dealt with it so we may continue.

In [46]:
df.opted_in_to_mailing_list.describe()

count    12000.000000
mean         0.249500
std          0.432742
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: opted_in_to_mailing_list, dtype: float64

In [47]:
df.enabled_for_marketing_drip.describe()

count    12000.000000
mean         0.149333
std          0.356432
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: enabled_for_marketing_drip, dtype: float64

Both opted_in_to_mailing_list and enabled_for_marketing_drip are binary columns with no NaN values, so we may continue.

In [48]:
df.org_id.describe()

count    12000.000000
mean       141.884583
std        124.056723
min          0.000000
25%         29.000000
50%        108.000000
75%        238.250000
max        416.000000
Name: org_id, dtype: float64

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 1 to 12000
Data columns (total 12 columns):
creation_time                         12000 non-null float64
last_session_creation_time            12000 non-null float64
opted_in_to_mailing_list              12000 non-null int64
enabled_for_marketing_drip            12000 non-null int64
org_id                                12000 non-null int64
invited_by_user_id                    6417 non-null float64
adopted                               12000 non-null float64
creation_source_GUEST_INVITE          12000 non-null uint8
creation_source_ORG_INVITE            12000 non-null uint8
creation_source_PERSONAL_PROJECTS     12000 non-null uint8
creation_source_SIGNUP                12000 non-null uint8
creation_source_SIGNUP_GOOGLE_AUTH    12000 non-null uint8
dtypes: float64(4), int64(3), uint8(5)
memory usage: 1.1 MB


Our next column is org_id. This column has no nulls, but is categorical. Normally, we would one-hot-encode it, but it has high-cardinality, so we won't do that here. Our other choices are to drop it, to leave it as is, to to encode it with other information. We can't do the latter because we are limited to this dataset. So we can drop it or leave as is. We choose to leave as is for now.

In [50]:
df.head()

Unnamed: 0_level_0,creation_time,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,59713803.0,1398139000.0,1,0,11,10803.0,0.0,1,0,0,0,0
2,46062097.0,1396238000.0,0,0,1,316.0,1.0,0,1,0,0,0
3,25309885.0,1363735000.0,0,0,94,1525.0,0.0,0,1,0,0,0
4,30698761.0,1369210000.0,0,0,1,5151.0,0.0,1,0,0,0,0
5,19992653.0,1358850000.0,0,0,193,5240.0,0.0,1,0,0,0,0


Our next columns is invited_by_user_id

In [51]:
df.invited_by_user_id.describe()

count     6417.000000
mean      5962.957145
std       3383.761968
min          3.000000
25%       3058.000000
50%       5954.000000
75%       8817.000000
max      11999.000000
Name: invited_by_user_id, dtype: float64

We note again that we have null values here. We'll encode these values are 0, which makes sense since this column is categorical. After doing so, we have the same choice as we did for org_id: dropping the column or leaving it as is. We'll leave it as is as well.

In [52]:
df.loc[df.invited_by_user_id.isnull(), 'invited_by_user_id'] = 0

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 1 to 12000
Data columns (total 12 columns):
creation_time                         12000 non-null float64
last_session_creation_time            12000 non-null float64
opted_in_to_mailing_list              12000 non-null int64
enabled_for_marketing_drip            12000 non-null int64
org_id                                12000 non-null int64
invited_by_user_id                    12000 non-null float64
adopted                               12000 non-null float64
creation_source_GUEST_INVITE          12000 non-null uint8
creation_source_ORG_INVITE            12000 non-null uint8
creation_source_PERSONAL_PROJECTS     12000 non-null uint8
creation_source_SIGNUP                12000 non-null uint8
creation_source_SIGNUP_GOOGLE_AUTH    12000 non-null uint8
dtypes: float64(4), int64(3), uint8(5)
memory usage: 1.1 MB


Great, so we're done preprocessing our columns. Let's fit the model now.

In [54]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, precision_recall_curve
import time

##### Steps Involved in Model Fitting in sklearn

1. Preprocess Data
2. Create Train and Test Sets
3. Instantiate the model/estimator  
(Steps 1 and 3 can be combined in a Pipeline object)
4. Specify Hyperparameter Space
5. Instantiate GridSearchCV or RandomizedSearchCV objects
6. Fit CV object to the Training Set
7. Predict on the Test Set
8. Compute Scores for the Model

We've done step 1, so let's do the others.

In [55]:
#### Step 2) Split Data into Training and Test Sets
X = df.drop(['adopted'], axis=1)
y = df['adopted']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

# Step 3: Instantiate the model/estimator
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
rf = RandomForestClassifier(random_state=0)

# Step 4: Specify a Hyperparameter space
param_grid_rf = {
                 'n_estimators': [50, 100],
                 'criterion': ["gini", "entropy"],
                 'max_depth': [3, 10, 20, None],
                 'min_samples_leaf': randint(1, 9),
                 'min_samples_split': randint(2, 9),
                 'max_features': randint(1, 9)
                }

# Step 5: Instantiate CV Object
rf_cv = RandomizedSearchCV(rf, param_grid_rf, cv=10, iid=False)

# Step 6: Fit CV Object to the training set
t0 = time.time()
rf_cv.fit(X_train, y_train)
ttt = time.time() - t0 # ttt stands for "time to train"
print("It takes %.3f" % ttt, " seconds for rf fitting")

# Step 7: Predict on the test set
y_pred = rf_cv.predict(X_test)

# Step 8: Score the model
print("Accuracy with RF on the test set is: %.3f" % rf_cv.score(X_test, y_test))

It takes 52.443  seconds for rf fitting
Accuracy with RF on the test set is: 0.969


Wow, great accuracy on the test set. Let's see what it is on the train set.

In [56]:
rf_cv.score(X_train, y_train)

0.9913095238095239

Even higher. So our features allow us to predict with high accuracy.

In [57]:
rf_cv.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'max_features': 8,
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 50}

Let's check out the feature importances now.

In [58]:
pd.DataFrame(rf_cv.best_estimator_.feature_importances_, 
             index = X_train.columns, columns=['importance']).sort_values('importance', ascending=False)

Unnamed: 0,importance
last_session_creation_time,0.629184
creation_time,0.312746
org_id,0.030109
invited_by_user_id,0.017718
creation_source_PERSONAL_PROJECTS,0.002418
creation_source_GUEST_INVITE,0.001688
creation_source_SIGNUP_GOOGLE_AUTH,0.001463
opted_in_to_mailing_list,0.001332
enabled_for_marketing_drip,0.001257
creation_source_SIGNUP,0.001095


Wow, so last_session creation time is the most important feature by far. Remember that this feature represents the unix timestamp of last login. Also remember that we had nulls in this column that we replaced with 0. It makes sense that this column would be an important one because all the null values represented users that had never logged in, and obviously such users could never become adopted users. So we were right not to drop it.


Let's check out the correlation with our feature column.

In [59]:
df['adopted'].corr(df['last_session_creation_time'])

0.29693514312379066

We have a fairly high positive correlation, which makes sense.

In [60]:
df['adopted'].corr(df['last_session_creation_time'])

0.29693514312379066

In [61]:
df.corr()['adopted'].sort_values()

creation_source_PERSONAL_PROJECTS    -0.092150
creation_time                        -0.057155
creation_source_ORG_INVITE           -0.005401
enabled_for_marketing_drip            0.007497
opted_in_to_mailing_list              0.008956
creation_source_SIGNUP                0.016040
invited_by_user_id                    0.030837
creation_source_SIGNUP_GOOGLE_AUTH    0.034449
creation_source_GUEST_INVITE          0.053546
org_id                                0.076437
last_session_creation_time            0.296935
adopted                               1.000000
Name: adopted, dtype: float64

Small correlations. Also, correlations only capture linear relationships, which is why we use a random forest to capture more complex relationships.

Let's see what our accuracy is if we drop last_session_creation_time

In [62]:
#### Step 2) Split Data into Training and Test Sets
X = df.drop(['adopted', 'last_session_creation_time'], axis=1)
y = df['adopted']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

# Step 3: Instantiate the model/estimator
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
rf = RandomForestClassifier(random_state=0)

# Step 4: Specify a Hyperparameter space
param_grid_rf = {
                 'n_estimators': [50, 100],
                 'criterion': ["gini", "entropy"],
                 'max_depth': [3, 10, 20, None],
                 'min_samples_leaf': randint(1, 9),
                 'min_samples_split': randint(2, 9),
                 'max_features': randint(1, 9)
                }

# Step 5: Instantiate CV Object
rf_cv = RandomizedSearchCV(rf, param_grid_rf, cv=10, iid=False)

# Step 6: Fit CV Object to the training set
t0 = time.time()
rf_cv.fit(X_train, y_train)
ttt = time.time() - t0 # ttt stands for "time to train"
print("It takes %.3f" % ttt, " seconds for rf fitting")

# Step 7: Predict on the test set
y_pred = rf_cv.predict(X_test)

# Step 8: Score the model
print("Accuracy with RF on the test set is: %.3f" % rf_cv.score(X_test, y_test))

It takes 58.233  seconds for rf fitting
Accuracy with RF on the test set is: 0.813


Far lower.