In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pylab as pl
import statsmodels.api as sm

#%matplotlib inline

In [3]:
#Put all csvs into Data Frames

users_df = pd.read_csv('takehome_users.csv')
user_engagement_df = pd.read_csv('takehome_user_engagement.csv')
adopted_users_df = pd.read_csv('adopted_users.csv')
size_org_joined_df = pd.read_csv('size_org_joined.csv')

In [4]:
# Make dummies out of the creation_source category variable
creation_source_dummies = pd.get_dummies(users_df['creation_source'], prefix='creation_source')
creation_source_dummies.head()

Unnamed: 0,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH
0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0


In [5]:
# Make another column in users that segments weekday or not (0 = weekend, 1 = weekeday)
temp = users_df['creation_time'].apply(lambda x: pd.to_datetime(x).weekday())
users_df['weekday'] = temp.apply(lambda x: 1 if x < 5 else 0)
users_df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,weekday
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,1
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,1
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,1
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,1


In [10]:
# Join 'cols to keep' on everything but the first series of dummies for creation_source
data = users_df.join(creation_source_dummies)
# Join the size of org when joined by the object_id
data = pd.merge(data, size_org_joined_df, on='object_id')
# Finally  add the dependent variable, adopted_users
data = pd.merge(adopted_users_df, data, left_on='all_users', right_on='object_id')

data.describe()

Unnamed: 0,all_users,adopted,object_id,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,weekday,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH,org_size
count,11583.0,11583.0,11583.0,8503.0,11583.0,11583.0,11583.0,6201.0,11583.0,11583.0,11583.0,11583.0,11583.0,11583.0,11583.0
mean,5999.542433,0.109816,5999.542433,1380255000.0,0.24795,0.148752,139.50436,5967.517336,0.723388,0.178883,0.356471,0.175257,0.174307,0.115082,28.028231
std,3465.728531,0.312674,3465.728531,18731770.0,0.431841,0.35586,123.52896,3384.678441,0.447342,0.383271,0.478977,0.380203,0.37939,0.319135,40.62481
min,1.0,0.0,1.0,1338801000.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,3005.5,0.0,3005.5,1364862000.0,0.0,0.0,28.0,3037.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
50%,5991.0,0.0,5991.0,1383763000.0,0.0,0.0,105.0,5952.0,1.0,0.0,0.0,0.0,0.0,0.0,15.0
75%,9005.5,0.0,9005.5,1398606000.0,0.0,0.0,234.0,8830.0,1.0,0.0,1.0,0.0,0.0,0.0,29.0
max,12000.0,1.0,12000.0,1402067000.0,1.0,1.0,416.0,11999.0,1.0,1.0,1.0,1.0,1.0,1.0,318.0


In [11]:
data['intercept'] = 1

ind_var_cols = [
    'opted_in_to_mailing_list',
    'enabled_for_marketing_drip',
    'creation_source_GUEST_INVITE', 
    'creation_source_ORG_INVITE', 
    'creation_source_PERSONAL_PROJECTS',
    #'creation_source_SIGNUP', Removed first category dummy for basecase
    'creation_source_SIGNUP_GOOGLE_AUTH', 
    'org_size',
    'weekday',
    'intercept'
    ]
dep_var_col = 'adopted'

In [12]:
# Now to actually run the regession
logit = sm.Logit(data[dep_var_col], data[ind_var_cols])

# fit the model
result = logit.fit()

result.summary()

Optimization terminated successfully.
         Current function value: 0.337745
         Iterations 7


0,1,2,3
Dep. Variable:,adopted,No. Observations:,11583.0
Model:,Logit,Df Residuals:,11574.0
Method:,MLE,Df Model:,8.0
Date:,"Tue, 31 May 2016",Pseudo R-squ.:,0.02423
Time:,21:12:44,Log-Likelihood:,-3912.1
converged:,True,LL-Null:,-4009.2
,,LLR p-value:,1.0319999999999999e-37

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
opted_in_to_mailing_list,0.0451,0.079,0.571,0.568,-0.110 0.200
enabled_for_marketing_drip,-0.0488,0.097,-0.506,0.613,-0.238 0.140
creation_source_GUEST_INVITE,0.1388,0.094,1.474,0.140,-0.046 0.323
creation_source_ORG_INVITE,-0.1297,0.085,-1.522,0.128,-0.297 0.037
creation_source_PERSONAL_PROJECTS,-0.7129,0.114,-6.248,0.000,-0.937 -0.489
creation_source_SIGNUP_GOOGLE_AUTH,0.1118,0.106,1.051,0.293,-0.097 0.320
org_size,-0.0119,0.001,-8.887,0.000,-0.014 -0.009
weekday,-0.0749,0.066,-1.131,0.258,-0.205 0.055
intercept,-1.6711,0.088,-18.889,0.000,-1.845 -1.498


In [13]:
odds = 1/(1+np.exp(-result.params))
print odds

opted_in_to_mailing_list              0.511274
enabled_for_marketing_drip            0.487794
creation_source_GUEST_INVITE          0.534639
creation_source_ORG_INVITE            0.467617
creation_source_PERSONAL_PROJECTS     0.328957
creation_source_SIGNUP_GOOGLE_AUTH    0.527925
org_size                              0.497031
weekday                               0.481293
intercept                             0.158272
dtype: float64


In [44]:
result.df_resid

11575.0

From the above, the only significant variables (at conf. 95%) are:

* ~~opted_in_to_mailing_list	**0.568**~~	
* ~~enabled_for_marketing_drip	**0.613**~~
* ~~creation_source_GUEST_INVITE **0.140**~~	
* ~~creation_source_ORG_INVITE	**0.128**~~
* creation_source_PERSONAL_PROJECTS	**0.000**
* ~~creation_source_SIGNUP_GOOGLE_AUTH	**0.293**~~	
* org_size	**0.000**	
* ~~weekday		**0.258**~~	

In rough analysis, the only things we can say are that if a user signs up for personal project (rather than the base of just sign up at asana.com) or the size of the org is increased when they join, the log-odds are negatively correlated with if they will become an adopted user (negative COEF column)

The odds are 
* If a user joins through personal projects, the base odds will be multiplied by **0.328957** (A LOT less than 50%)
* For every unit bigger an org is when a user joins, the base odds will be multiplied by **0.497031** (VERY close to 50% so not too bad, but a 100 unit increase is **0.497031^100 = 4.348582041E-31** so definitely drops odds)