In [1]:
import pandas as pd

In [2]:
file_path = './'
users_df = pd.read_csv(file_path + 'takehome_users.csv', encoding='iso-8859-1')

In [4]:
engagement_df = pd.read_csv(file_path + 'takehome_user_engagement.csv',
                            parse_dates=True,
                            infer_datetime_format=True)

In [5]:
users_df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [6]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [7]:
engagement_df.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [8]:
engagement_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [9]:
engagement_df['time_stamp'] = pd.to_datetime(engagement_df['time_stamp'])

In [10]:
engagement_df = engagement_df.set_index('time_stamp')

In [11]:
# Compute target variable
max_visits_7day = (engagement_df.groupby('user_id')['visited'].rolling(pd.Timedelta('7 days')).sum()
 ).groupby('user_id').max()
adopted_user = (max_visits_7day >= 3)

In [12]:
max_visits_7day.head()

user_id
1    1.0
2    3.0
3    1.0
4    1.0
5    1.0
Name: visited, dtype: float64

In [13]:
adopted_user.head()

user_id
1    False
2     True
3    False
4    False
5    False
Name: visited, dtype: bool

In [14]:
adopted_user = adopted_user.reindex(users_df.index).fillna(False)

In [15]:
users_df = users_df.set_index('object_id')

In [16]:
design = users_df.drop(['name', 'email', 'last_session_creation_time', 'creation_time'], axis=1)

In [17]:
design = design.join(
    pd.get_dummies(design['creation_source'])
).drop('creation_source', axis=1)

In [18]:
design['invited_by_user_id'] = design['invited_by_user_id'].fillna(0)

In [19]:
design.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 1 to 12000
Data columns (total 9 columns):
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            12000 non-null float64
GUEST_INVITE                  12000 non-null uint8
ORG_INVITE                    12000 non-null uint8
PERSONAL_PROJECTS             12000 non-null uint8
SIGNUP                        12000 non-null uint8
SIGNUP_GOOGLE_AUTH            12000 non-null uint8
dtypes: float64(1), int64(3), uint8(5)
memory usage: 847.3 KB


In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(design, adopted_user)

In [22]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression()
log.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
y_pred = log.predict(X_test)

In [24]:
log.score(X_test, y_test)

0.871

In [25]:
log.score(X_train, y_train)

0.865

In [27]:
pd.DataFrame(data=log.coef_.T, index=design.columns)

Unnamed: 0,0
opted_in_to_mailing_list,-0.085607
enabled_for_marketing_drip,0.039585
org_id,-0.000878
invited_by_user_id,-2.5e-05
GUEST_INVITE,-0.099803
ORG_INVITE,-0.251811
PERSONAL_PROJECTS,-0.39133
SIGNUP,-0.292693
SIGNUP_GOOGLE_AUTH,-0.262539


Those whose creation source is PERSONAL_PROJECTS are much less likely to remain active. ORG_INVITE and SIGNUP are negatively correlated as well. The other account creation reasons are not so predictive. In addition, those who opted into the mailing list are somewhat more likely to remain active.