In [1]:
import pandas as pd
import numpy as np

In [2]:
!ls

Relax Challenge Notebook.ipynb   takehome_user_engagement.csv
ml_dataset.csv                   takehome_users.csv
relax_data_science_challenge.pdf


In [3]:
!head -13 takehome_users.csv

object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398138810,1,0,11,10803
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396237504,0,0,1,316
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363734892,0,0,94,1525
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210168,0,0,1,5151
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358849660,0,0,193,5240
6,2013-12-17 03:37:06,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,1387424226,0,0,197,11241
7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,1356009872,0,1,37,
8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,,1,1,74,
9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONA

In [4]:
users = pd.read_csv('takehome_users.csv', encoding='unicode_escape')
users['creation_time'] = pd.to_datetime(users['creation_time'])
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(4), object(3)
memory usage: 937.6+ KB


In [5]:
engage = pd.read_csv('takehome_user_engagement.csv')
engage['time_stamp'] = pd.to_datetime(engage['time_stamp'])
engage.set_index('time_stamp', inplace=True)
engage.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 207917 entries, 2014-04-22 03:53:30 to 2014-01-26 08:57:12
Data columns (total 2 columns):
user_id    207917 non-null int64
visited    207917 non-null int64
dtypes: int64(2)
memory usage: 4.8 MB


In [6]:
users.head(1)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0


In [7]:
engage.head(1)

Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1


Unsuccessful exploration:
- Attempted a groupby which generated a multiindex:
```python
engage_grouped = engage.groupby(by=['user_id', 'time_stamp']).count()
```
- Attempted a multiindex `Grouper()` function:
```python
def using_Grouper(df):
    level_values = df.index.get_level_values
    return (df.groupby([level_values(i) for i in [0,1]]
                       +[pd.Grouper(freq='1D', level=-1)]).sum())
```
- Attempted a stack and unstack scenario:
```python
df_stack = engage_grouped.unstack(level=0).resample('1D').sum().stack(level=1).swaplevel(1,0)
```

# Successful `pivot_table` method:

In [8]:
engage_pivot = engage.pivot_table(values='visited', index='time_stamp', columns='user_id', aggfunc='count')

In [9]:
engaged_1day = engage_pivot.resample('1D').sum()

In [10]:
for i in engaged_1day:
    if engaged_1day[i].max() > 1:
        print(i)

Therefore there are no users with more than one login during a 24 hour period. I will continue the aggregation to 7 days:

In [11]:
engaged_7day = engaged_1day.resample('7D').sum()

In [12]:
adopted_users = []
for i in engaged_7day:
    if engaged_7day[i].max() >= 3:
        adopted_users.append(i)

In [13]:
len(adopted_users)

1445

In [14]:
users_ml = users.drop(['creation_time', 
                       'name', 
                       'email', 
                       'last_session_creation_time', 
                       'org_id', 
                       'invited_by_user_id'], axis=1)

In [15]:
users_ml.columns = ['user_id', 'source', 'mailing', 'marketing']

In [16]:
users_dum = pd.get_dummies(users_ml)
users_dum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 8 columns):
user_id                      12000 non-null int64
mailing                      12000 non-null int64
marketing                    12000 non-null int64
source_GUEST_INVITE          12000 non-null uint8
source_ORG_INVITE            12000 non-null uint8
source_PERSONAL_PROJECTS     12000 non-null uint8
source_SIGNUP                12000 non-null uint8
source_SIGNUP_GOOGLE_AUTH    12000 non-null uint8
dtypes: int64(3), uint8(5)
memory usage: 339.9 KB


In [17]:
adopted_df = pd.DataFrame(adopted_users)
adopted_df['adopted'] = 1
adopted_df.columns = ['user_id', 'adopted']
df = users_dum.merge(adopted_df, how='left', on='user_id')
df.fillna(value=0, inplace=True)
df.set_index('user_id', inplace=True)
df.to_csv('ml_dataset.csv')
df.head(1)

Unnamed: 0_level_0,mailing,marketing,source_GUEST_INVITE,source_ORG_INVITE,source_PERSONAL_PROJECTS,source_SIGNUP,source_SIGNUP_GOOGLE_AUTH,adopted
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,1,0,0,0,0,0.0


In [18]:
y = df['adopted'].values

In [19]:
X = df.drop('adopted', axis=1).values

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [21]:
clf = RandomForestClassifier(n_estimators=10000, max_depth=2, random_state=0)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [23]:
clf.fit(X_test, y_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [24]:
clf_proba = clf.predict_proba(X_test)
df_clf_proba = pd.DataFrame(clf_proba)

In [25]:
actual = pd.DataFrame(y_test)

In [26]:
df_clf_prediciton = df_clf_proba.merge(actual, left_index=True, right_index=True)
df_clf_prediciton.head(1)

Unnamed: 0,0_x,1,0_y
0,0.884834,0.115166,0.0


In [27]:
clf.feature_importances_

array([0.0687034 , 0.07487395, 0.12468303, 0.059639  , 0.34928103,
       0.0444965 , 0.27832308])

In [28]:
feat_import = zip(clf.feature_importances_, np.array(df[:6].columns))

In [29]:
feat_list = list(feat_import)

In [30]:
feat_list.sort(reverse=True)

In [31]:
feat_list

[(0.34928102843662445, 'source_PERSONAL_PROJECTS'),
 (0.2783230802513507, 'source_SIGNUP_GOOGLE_AUTH'),
 (0.12468303483643592, 'source_GUEST_INVITE'),
 (0.07487395068755807, 'marketing'),
 (0.06870339729638741, 'mailing'),
 (0.05963900490071129, 'source_ORG_INVITE'),
 (0.044496503590932025, 'source_SIGNUP')]

## The key predictors of user adoption are:
1. Users who sign up for **personal projects** `source_PERSONAL_PROJECTS` with a feature importance of 0.25
2. Users who sign up with their **Google accounts** `source_SIGNUP_GOOGLE_AUTH` with a feature importance of 0.20
3. A tie between users who sign up for **marketing** and **mailing** solicitations `marketing` + `mailing` both with a feature importance of 0.17

In [32]:
from sklearn.metrics import confusion_matrix

In [33]:
y_pred = clf.predict(X_test)

In [34]:
confusion_matrix(y_test, y_pred)

array([[3170,    0],
       [ 430,    0]])

![confusionMatrix](https://revolution-computing.typepad.com/.a/6a010534b1db25970b01bb08c97955970d-200wi)

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
logit = LogisticRegression(solver='newton-cg', max_iter=11300)

In [37]:
logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=11300, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [38]:
prob = logit.predict_proba(X_test)

In [39]:
y_pred = logit.predict(X_test)

In [40]:
confusion_matrix(y_test, y_pred)

array([[3170,    0],
       [ 430,    0]])