In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import data from csv files.
users_df = pd.read_csv('takehome_users.csv', encoding="ISO-8859-1")
engage_df = pd.read_csv('takehome_user_engagement.csv', parse_dates=["time_stamp"])

users_df = users_df.rename({"object_id":"user_id"}, axis=1)

In [3]:
users_df.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
# Define 'adopted user'
adopted_df = engage_df.set_index("time_stamp")

users = adopted_df["user_id"].unique()
adoption = []

for i in users:
    filter_id = adopted_df["user_id"] == i
    df_filter = adopted_df[filter_id].resample("1D").count()
    df_filter = df_filter.rolling(window=7).sum()
    df_filter = df_filter.dropna()
    adoption.append(any(df_filter["visited"].values >= 7))

In [5]:
users_df.isnull().sum()

user_id                          0
creation_time                    0
name                             0
email                            0
creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
invited_by_user_id            5583
dtype: int64

In [6]:
user_adoption = list(zip(users, adoption))

adopt1_df = pd.DataFrame(user_adoption)
adopt1_df.columns = ["user_id", "adopted_user"]

df = users_df.merge(adopt1_df, on="user_id", how="left")

In [7]:
df.loc[:, "adopted_user"] = df["adopted_user"].map({False:0, True:1, np.nan:0})
df.dropna(subset=["adopted_user"], inplace=True)
df["adopted_user"] = df["adopted_user"].astype(int)

In [8]:
# Filling null values.
invite = lambda row: 0 if np.isnan(row) else 1
df["invited_by_user"] = df["invited_by_user_id"].apply(invite)

In [9]:
# Final dataframe for machine learning.
df = df[["adopted_user", "invited_by_user", "creation_source", "opted_in_to_mailing_list", "enabled_for_marketing_drip"]]
df.head()

Unnamed: 0,adopted_user,invited_by_user,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip
0,0,1,GUEST_INVITE,1,0
1,0,1,ORG_INVITE,0,0
2,0,1,ORG_INVITE,0,0
3,0,1,GUEST_INVITE,0,0
4,0,1,GUEST_INVITE,0,0


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

X = df[df.columns[1:]]
y = df[df.columns[0]]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.6, random_state=42)

pipeline = Pipeline(steps=[("encoder", OneHotEncoder()), \
                           ("rf", RandomForestClassifier(random_state = 42))])

params = {"rf__n_estimators" : [100, 200, 500],
          "rf__max_depth" : [10, 20, 50]}

cv = GridSearchCV(pipeline, param_grid=params, cv=3)
cv.fit(X_train, y_train)

print(f"Best parameters: {cv.best_params_}")
print(f"Training accuracy score from tuned model: \
       {cv.best_score_*100:.1f}%")

Best parameters: {'rf__max_depth': 10, 'rf__n_estimators': 100}
Training accuracy score from tuned model:        94.8%


In [11]:
# Test dataset score.
y_pred = cv.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {test_accuracy*100:.2f}%")

Model accuracy: 94.82%


In [12]:
# Re-running to get "labeled" feature importance

X_ohe = pd.get_dummies(X_test)
pipeline.fit(X_ohe, y_test)

fe = pipeline.named_steps["rf"].feature_importances_

feature_importance = zip(X_ohe.columns, fe)
feature_importance = sorted(feature_importance, key=lambda x:x[1], reverse=True)

for i, j in feature_importance:
    print(f"Weight: {j:.3f} | Feature: {i}")

Weight: 0.127 | Feature: creation_source_PERSONAL_PROJECTS
Weight: 0.110 | Feature: creation_source_GUEST_INVITE
Weight: 0.084 | Feature: enabled_for_marketing_drip
Weight: 0.074 | Feature: creation_source_ORG_INVITE
Weight: 0.055 | Feature: invited_by_user
Weight: 0.031 | Feature: creation_source_SIGNUP
Weight: 0.005 | Feature: creation_source_SIGNUP_GOOGLE_AUTH
Weight: 0.000 | Feature: opted_in_to_mailing_list


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


## **Recommendation**

Our model produced excellent results having a final accuracy metric comparable to the cross-validation training score (94.8% and 94.82% respectively). This means that our pipeline's feature ranking is likely a reliable predictor for user adoption. I used One-Hot encoding to examine specifically what the business could do to potentially boost the likelihood of user engagement:

  1. Because PERSONAL_PROJECTS and GUEST_INVITE rank highest on predicting future user adoption, the business could adjust marketing activities to focus more on highly-collaborative user groups.
  2. The marketing drip feature also ranked quite high so I recommend continuing this campaign to solidify and strengthen the user base.
  3. The other features may not have ranked as high, but that just means currently they are not the best predictors. This may mean either that the activities should be discontinued altogehter, or they may and so it doesn't quite matter when there is any emphasis on the newsletter call-to-action. At least it'll help the UI team to keep the app less commercial and enable a good vibe for the user.

