In [1]:
import pandas as pd
import numpy as np
user_engage=pd.read_csv('takehome_user_engagement.csv', encoding='latin-1')
users=pd.read_csv('takehome_users.csv',encoding='latin-1')

In [2]:
user_engage.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [3]:
user_engage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


No null values, so that's good. Now for analysis purposes focusing on the date, it is always good for time_stamp to be in a date time format. This can be done using panda.

In [4]:
user_engage['time_stamp']=pd.to_datetime(user_engage['time_stamp'])
user_engage.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [5]:
user_engage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null datetime64[ns]
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.8 MB


In [6]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [7]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


Two elements, last_session_creation_time and invited_by_user_id from the columns of dataset above, exhibit null values.
- last_session_creation_time was the time the user last login, there is some NaN values because some users could have turned off tracking
- invited_by_user_id can be zero for user that received no invites, but rather the user signed up for the product by themselves.

# Problem Statement:

Defining an "adopted user" as a user who has logged into the product on three separate days in at least one seven­day period, identify which factors predict future user adoption. Please send us a brief writeup of your findings (the more concise, the better ­­ no more than one page), along with any summary tables, graphs, code, or queries that can help us understand your approach. Please note any factors you considered or investigation you did, even if they did not pan out. Feel free to identify any further research or data you think would be valuable.

In [8]:
# from user_engage, calculate the number of visit from a specific user
visits = pd.DataFrame(user_engage['user_id'].value_counts())
visits.reset_index(inplace=True)
visits.columns = ['user_id','times_visit']
visits.head()

Unnamed: 0,user_id,times_visit
0,3623,606
1,906,600
2,1811,593
3,7590,590
4,8068,585


As it was defined above, the adopted users are those who visited for more than 3 days. Hence we can now filter the user_engage dataset to those with more than 3 visits. 

In [9]:
regular_user = user_engage[user_engage['user_id'].isin(visits[visits['times_visit'] >= 3]['user_id'])] 
regular_user.head()

Unnamed: 0,time_stamp,user_id,visited
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
5,2013-12-31 03:45:04,2,1


With the filtered list, we still need to factor ' in at least one sevenday period' to be able to consider them as adopted user

In [10]:
#initializer for new element
adopted_users = []
for user in regular_user.user_id.unique(): 
    time = regular_user[regular_user.user_id == user]['time_stamp'].reset_index()['time_stamp']
    for x in range(len(time) - 2):
        if time[x + 2] - time[x] < pd.Timedelta('7 days'):
            adopted_users.append(user)
            break

The output of the above code should give a list of the adopted users. Their full info is in the users dataset where users_id =object_id, but it is mixed with the non-adopted users. What we can do is to create a binary value to differentiate the two types of user

In [11]:
users['adopted'] = users.apply(lambda x: x['object_id'] in adopted_users,axis=1)
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,True
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,False
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False


Before applying machine learning techniques to this dataset, we have to ensure the null values are filled.

In [12]:
users['last_session_creation_time'].fillna(0, inplace=True)
users['invited_by_user_id'].fillna(0, inplace=True)

In [13]:
users['creation_source']= users['creation_source'].apply(lambda x: x=='GUEST_INVITE')

In [14]:
users['creation_time'] = pd.to_datetime(users['creation_time'])
users['creation_hour']=pd.to_datetime(users.creation_time).dt.hour
users['creation_day']=pd.to_datetime(users.creation_time).dt.day
users['creation_month']=pd.to_datetime(users.creation_time).dt.month
users['creation_year']=pd.to_datetime(users.creation_time).dt.day

In [15]:
from datetime import datetime, timedelta
users['last_session_creation_time'] = users['last_session_creation_time'].map(lambda x:datetime.fromtimestamp(int(x)).strftime( '%Y-%m-%d %H:%M:%S'))

In [16]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,creation_hour,creation_day,creation_month,creation_year
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,True,2014-04-21 20:53:30,1,0,11,10803.0,False,3,22,4,22
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,False,2014-03-30 20:45:04,0,0,1,316.0,True,3,15,11,15
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,False,2013-03-19 16:14:52,0,0,94,1525.0,False,23,19,3,19
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,True,2013-05-22 01:09:28,0,0,1,5151.0,False,8,21,5,21
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,True,2013-01-22 02:14:20,0,0,193,5240.0,False,10,17,1,17


In [17]:
target_variable = users.drop(['adopted'], axis=1)
feature_variable = users['adopted']

In [19]:
from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(users, test_size=0.2, random_state=100)
train, test = train_test_split(users, test_size=0.2, random_state=100)

In [20]:
#train dataset with the target and feature variables 
import lightgbm as lgb
lgb_train_df = lgb.Dataset(train[['creation_hour','creation_year','creation_month','creation_day','creation_source','last_session_creation_time','opted_in_to_mailing_list',
            'enabled_for_marketing_drip','org_id','invited_by_user_id'] ],
                      label=train['adopted'],
                      categorical_feature=['creation_source','org_id','invited_by_user_id','creation_hour','creation_year','creation_month','creation_day'])

In [None]:
#mdl =lgb.LGBMClassifier(boosting_type ='gbdt',objective ='binary', num_leaves =100)
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'AUC'},
    'num_leaves': 100,
    'max_depth': 10,
}
mdl = lgb.train(params, lgb_train_df, 100)

could not continue due to error:
'ValueError: DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields last_session_creation_time'

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

In [21]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,creation_hour,creation_day,creation_month,creation_year
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,True,2014-04-21 20:53:30,1,0,11,10803.0,False,3,22,4,22
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,False,2014-03-30 20:45:04,0,0,1,316.0,True,3,15,11,15
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,False,2013-03-19 16:14:52,0,0,94,1525.0,False,23,19,3,19
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,True,2013-05-22 01:09:28,0,0,1,5151.0,False,8,21,5,21
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,True,2013-01-22 02:14:20,0,0,193,5240.0,False,10,17,1,17


Conclusion: for creation_time and last_session_creation_time, it can be put into comparison the time difference between them. For the time difference more than 3 dats, chances are the creation source is the feature importance leading to a user leaning towards becoming an adopted user. However this is not always true. As seen on the second rowm Matthew Poole is an adopted user with time between creation_time and last_session_creation_time greater than 3 months and the source of his access to the product was by invite. Thus features importance of the users dataset is related to the time: most importantly the last_session_creation_time and creation time as well as the creation source.