In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read Data

In [31]:
# Load the user table; the file is decoded as latin-1 rather than the default UTF-8.
users = pd.read_csv(
    'data/takehome_users.csv',
    encoding='latin-1',
)

In [83]:
# Load the engagement log (one row per recorded visit).
engagement = pd.read_csv(
    'data/takehome_user_engagement.csv',
)

In [32]:
# Preview the first rows of the raw users table before any cleaning.
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [33]:
# Preview the first rows of the engagement log (time_stamp, user_id, visited).
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


# Data Cleaning

Convert all time variables to the datetime data type. NaN values in 'last_session_creation_time' are replaced with the corresponding 'creation_time' values. All NaN values in 'invited_by_user_id' are replaced with zeros.

In [34]:
# Parse both time columns first: 'last_session_creation_time' is read as
# epoch seconds, 'creation_time' as date-time strings.
last_session = pd.to_datetime(users['last_session_creation_time'], unit='s')
signup_time = pd.to_datetime(users['creation_time'])

users['creation_time'] = signup_time
# Rows with a missing last session fall back to the signup time.
users['last_session_creation_time'] = last_session.fillna(signup_time)
# A missing inviter is encoded as 0 so the column can be stored as integers.
users['invited_by_user_id'] = users['invited_by_user_id'].fillna(0).astype(int)

In [161]:
# Re-inspect the users table after cleaning (time columns parsed, inviter id as int).
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240


In [85]:
# Parse the visit timestamps so date arithmetic can be done on them.
engagement['time_stamp'] = pd.to_datetime(engagement['time_stamp'])

# Create target variable

In [217]:
def adopted_label(timestamp):
    """Label one user's logins as adopted (1) or not (0).

    A user is labelled 1 when three *distinct* calendar days of activity
    fall within a span of at most 6 days, i.e. inside one seven-day period.

    Parameters
    ----------
    timestamp : pandas.Series of pandas.Timestamp
        Login timestamps for a single user. They may be in any order and
        may contain several logins on the same calendar day.

    Returns
    -------
    int
        1 if a qualifying three-day window exists, otherwise 0.
    """
    # Reduce to distinct calendar days in chronological order. Without this,
    # repeated logins on one day would count as separate days, and unsorted
    # input would produce negative gaps that trivially pass the threshold.
    login_days = sorted(set(timestamp.apply(pd.Timestamp.date)))

    # If any three distinct days fit in the window, so do three consecutive
    # entries of the sorted list; compare each day with the one two places on.
    for first_day, third_day in zip(login_days, login_days[2:]):
        if (third_day - first_day).days <= 6:
            return 1
    return 0

In [143]:
# One row per user: apply adopted_label to that user's timestamps, then
# rename the columns so the result lines up with the users table.
users_adoption = (
    engagement
    .groupby('user_id')[['time_stamp']]
    .agg(adopted_label)
    .reset_index()
    .rename(columns={"user_id": "object_id", "time_stamp": "adopted"})
)

In [156]:
# Left-join the labels onto users so y follows the row order of `users`;
# users without a matching label row come out as NaN here.
y = (
    users
    .join(users_adoption.set_index('object_id'), on='object_id', how='left')
    .adopted
    .values
)

In [157]:
# NaN labels become 0, then the labels are stored as 64-bit integers.
y = np.nan_to_num(y).astype(np.int64)

# Create features

In [200]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [170]:
# Columns used directly as features, extracted as NumPy arrays.
f_mailing_list = users['opted_in_to_mailing_list'].values
f_marketing_drip = users['enabled_for_marketing_drip'].values
f_org_id = users['org_id'].values
f_invited_by_user_id = users['invited_by_user_id'].values

In [171]:
# Encode the creation_source strings as integer codes.
le = LabelEncoder()
f_creation_source = le.fit_transform(users['creation_source'].values)

## Create a new feature 

We calculate the number of days from the user's signup date to their latest login date, and use it as a new feature called 'login_days_from_signup'.

In [198]:
# Whole days elapsed between signup and the most recent session.
# NOTE(review): the target label is derived from login timestamps, and this
# feature is derived from the last login time, so it may leak target
# information into the model — confirm this is acceptable for the analysis.
signup_to_last_session = users['last_session_creation_time'] - users['creation_time']
# Use the vectorized timedelta accessor rather than a per-element Python lambda.
f_login_days_from_signup = signup_to_last_session.dt.days.values

## Create X 

In [203]:
# Human-readable names for the feature columns, in the order they are stacked.
feature_names = [
    'mailing_list',
    'marketing_drip',
    'org_id',
    'invited_by_user_id',
    'creation_source',
    'login_days_from_signup',
]

In [199]:
# Assemble the feature matrix, one column per feature array.
feature_columns = [
    f_mailing_list,
    f_marketing_drip,
    f_org_id,
    f_invited_by_user_id,
    f_creation_source,
    f_login_days_from_signup,
]
X = np.column_stack(feature_columns)

# Create training and test data

In [202]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=98,
)

# Build first classifier model with all possible features

In [204]:
from sklearn.ensemble import RandomForestClassifier

In [205]:
# Baseline forest trained on every feature column.
clf = RandomForestClassifier(
    n_estimators=10000,
    random_state=0,
    n_jobs=-1,
)
clf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

## Importance weights of all features

In [206]:
# Show each feature name next to its importance in the fitted forest.
for name, importance in zip(feature_names, clf.feature_importances_):
    print((name, importance))

('mailing_list', 0.004280877835779826)
('marketing_drip', 0.0031433968661101755)
('org_id', 0.061808569072747434)
('invited_by_user_id', 0.03365764832960511)
('creation_source', 0.012008912140092719)
('login_days_from_signup', 0.8851005957556641)


## Choose important features with SelectFromModel

In [207]:
from sklearn.feature_selection import SelectFromModel

In [208]:
# Select features by importance using a 0.1 threshold.
# prefit is left at its default (False), so fit() trains the estimator again
# instead of reusing the already-fitted clf.
selector = SelectFromModel(
    estimator=clf,
    threshold=0.1,
)
selector.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=0.1)

In [209]:
# Print the name of every feature the selector kept.
for selected_idx in selector.get_support(indices=True):
    print(feature_names[selected_idx])

login_days_from_signup


Only 'login_days_from_signup' has an importance above the 0.1 threshold, so it is the single feature selected!

# Build a new model with the most important feature

In [210]:
# Reduce both splits to the selected feature column(s).
X_sel_train, X_sel_test = (
    selector.transform(split) for split in (X_train, X_test)
)

In [211]:
# Same forest configuration as the baseline, trained on the selected feature(s) only.
clf_sel = RandomForestClassifier(
    n_estimators=10000,
    random_state=0,
    n_jobs=-1,
)
clf_sel.fit(X_sel_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

# Comparing the performances of two models 

In [212]:
# Test-set predictions from the full model and from the reduced model.
y_pred, y_sel_pred = clf.predict(X_test), clf_sel.predict(X_sel_test)

In [213]:
from sklearn.metrics import classification_report

In [214]:
# Precision / recall / F1 for the model trained on all features.
baseline_report = classification_report(y_test, y_pred)
print("clf Classification report: \n", baseline_report)

clf Classification report: 
              precision    recall  f1-score   support

          0       0.98      0.99      0.99      3131
          1       0.94      0.87      0.90       469

avg / total       0.97      0.97      0.97      3600



In [215]:
# Precision / recall / F1 for the reduced model. The label names clf_sel so
# this report is not mistaken for the full model's report.
print("clf_sel Classification report: \n", classification_report(y_test, y_sel_pred))

clf Classification report: 
              precision    recall  f1-score   support

          0       0.98      0.99      0.98      3131
          1       0.92      0.86      0.89       469

avg / total       0.97      0.97      0.97      3600



The two models have nearly the same performance!

# Conclusion

The time between 'creation_time' and 'last_session_creation_time' (the derived 'login_days_from_signup' feature) is the dominant predictor of user adoption!