In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Load the user table from disk; the file is decoded as Latin-1.
users_csv_path = r'takehome_users.csv'
users = pd.read_csv(users_csv_path, encoding='latin-1')
users.head(5)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [3]:
# Load the per-visit engagement log (one row per recorded visit).
engagement_csv_path = r'takehome_user_engagement.csv'
engagement = pd.read_csv(engagement_csv_path)
engagement.head(5)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


Looking at the columns, let's drop the ones we won't need. We care about adopted users (people who log in on 3 or more days in a week) and about how they came to use the product or what they signed up for. At first glance, a person's name and email are not relevant to that question, so let's remove those two columns from the users df. For each of the remaining columns, an argument can be made that it matters for user engagement or for finding adopted users. The engagement df doesn't need any columns dropped, since all of its columns are useful.

In [4]:
# Drop the identifying columns that are not used as features.
# Reassigning with errors='ignore' (instead of dropping in place) keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
users = users.drop(columns=['name', 'email'], errors='ignore')

Now for our engagement df: it consists mostly of time stamps. The users df also has times, but only one per person. The engagement df records a time stamp for every visit, which we can use to measure engagement and identify adopted users. So let's convert that column to a datetime and use it as the index.

In [5]:
# Parse the raw time stamps into datetimes and move them into the index so the
# frame can later be bucketed by calendar period with pd.Grouper.
# The guard makes the cell safe to re-run: once 'time_stamp' lives in the
# index there is no column left to convert, and the original in-place version
# raised a KeyError on a second execution.
if 'time_stamp' in engagement.columns:
    engagement = (
        engagement
        .assign(time_stamp=lambda frame: pd.to_datetime(frame['time_stamp']))
        .set_index('time_stamp')
    )
engagement

Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2,1
...,...,...
2013-09-06 06:14:15,11996,1
2013-01-15 18:28:37,11997,1
2014-04-27 12:45:16,11998,1
2012-06-02 11:55:59,11999,1


Now that we have a datetime index, we can use it to look for adopted users. We're going to group the engagement data by week and user_id, and then sum the number of times each user visited the product during that week. From there we can write some code to find the people who visited 3 or more times in a week and grab their user IDs.

In [6]:
# Bucket the visits into calendar weeks (pandas 'W' frequency) and, within
# each week, total the remaining columns per user. Note these are fixed
# calendar-week bins, not rolling seven-day windows.
weekly_bins = pd.Grouper(freq='W')
weekly_eng = engagement.groupby(by=[weekly_bins, 'user_id']).sum()
weekly_eng

Unnamed: 0_level_0,Unnamed: 1_level_0,visited
time_stamp,user_id,Unnamed: 2_level_1
2012-06-03,563,1
2012-06-03,1693,1
2012-06-03,1995,1
2012-06-03,2120,1
2012-06-03,2136,1
...,...,...
2014-06-08,11869,1
2014-06-08,11885,1
2014-06-08,11895,3
2014-06-08,11906,1


In [7]:
# A user counts as adopted when at least one weekly bucket reaches this many
# visits.
MIN_WEEKLY_VISITS = 3

# Keep the (week, user) rows that reach the threshold and read the user ids
# straight off the index. This replaces the previous unstack().melt() round
# trip, which built a wide week-by-user table only to flatten it again and
# called unique() twice.
# Ids appear in order of first occurrence in the week-sorted index; only the
# set of ids matters for the merge that follows.
active_weeks = weekly_eng[weekly_eng['visited'] >= MIN_WEEKLY_VISITS]
adopted_ids = active_weeks.index.get_level_values('user_id').unique()
adopted_users = pd.DataFrame({'user_id': adopted_ids.to_numpy()})
adopted_users.head(10)

Unnamed: 0,user_id
0,1693
1,728
2,11764
3,5297
4,6171
5,69
6,2078
7,3623
8,6978
9,7590


We now have the user IDs of the adopted users of the product. For the next part, we are going to merge them with the users df and keep only those users.

Next we will build our feature set and apply principal component analysis (PCA) to see which of the features are the best ones to use.

In [8]:
# Inner-join the user table against the adopted ids, so only users whose
# object_id appears in `adopted_users` survive (non-adopted users are dropped
# here and are not available to later cells). Every missing value in the
# result is then replaced with 0.
df = (
    users
    .merge(adopted_users, left_on='object_id', right_on='user_id', how='inner')
    .fillna(value=0)
)
df.head()

Unnamed: 0,object_id,creation_time,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_id
0,2,2013-11-15 03:45:04,ORG_INVITE,1396238000.0,0,0,1,316.0,2
1,10,2013-01-16 22:08:03,ORG_INVITE,1401833000.0,1,1,318,4143.0,10
2,20,2014-03-06 11:46:38,SIGNUP,1401364000.0,0,0,58,0.0,20
3,33,2014-03-11 06:29:09,GUEST_INVITE,1401518000.0,0,0,401,79.0,33
4,42,2012-11-11 19:05:07,SIGNUP,1401045000.0,1,0,235,0.0,42


In [9]:
# One-hot encode the categorical sign-up channel.
# dtype=int pins the indicator columns to 0/1 integers: pandas >= 2.0 returns
# booleans by default, which would no longer match the 0/1 table shown below
# and would mix bool with numeric columns in the feature matrix.
dummy = pd.get_dummies(df['creation_source'], dtype=int)

# Append the indicator columns, then remove the original categorical column
# together with the id and creation-time columns that are not used as
# features. The drop is chained (not in place) so `features` is always rebuilt
# from `df` in one expression.
features = (
    pd.concat([df, dummy], axis=1)
    .drop(columns=['creation_source', 'object_id', 'creation_time', 'user_id'])
)
features

Unnamed: 0,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,GUEST_INVITE,ORG_INVITE,PERSONAL_PROJECTS,SIGNUP,SIGNUP_GOOGLE_AUTH
0,1.396238e+09,0,0,1,316.0,0,1,0,0,0
1,1.401833e+09,1,1,318,4143.0,0,1,0,0,0
2,1.401364e+09,0,0,58,0.0,0,0,0,1,0
3,1.401518e+09,0,0,401,79.0,1,0,0,0,0
4,1.401045e+09,1,0,235,0.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
1440,1.400657e+09,0,0,65,11251.0,1,0,0,0,0
1441,1.401524e+09,0,0,15,5688.0,1,0,0,0,0
1442,1.401411e+09,1,1,52,6647.0,1,0,0,0,0
1443,1.400757e+09,1,0,31,6410.0,1,0,0,0,0


In [24]:
# Standardise every feature column before the decomposition.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Fit a PCA that keeps all components and project the scaled data onto them.
pca = PCA()
components = pca.fit_transform(scaled_features)

# Fraction of the total variance captured by the first eight components.
leading_ratios = pca.explained_variance_ratio_[:8]
leading_ratios.sum()

0.9735922673875433

In [25]:
# Loadings table: rows come from pca.components_, columns are labelled with
# the feature names.
df_pca = pd.DataFrame(pca.components_, columns=features.columns)

# Keep only the absolute loadings larger than 0.1; everything else becomes NaN.
abs_loadings = df_pca.abs()
pca_features = abs_loadings.where(abs_loadings > 0.1)

In [23]:
# Requires `pca_features` from the cell above; run the notebook top to bottom.
# Add up each feature's retained absolute loadings over the first eight
# components (NaN entries are skipped by sum) and rank from largest to smallest.
leading_loadings = pca_features.iloc[:8]
leading_loadings.sum().sort_values(ascending=False)

org_id                        1.792770
SIGNUP_GOOGLE_AUTH            1.705164
last_session_creation_time    1.700941
SIGNUP                        1.524752
PERSONAL_PROJECTS             1.438816
enabled_for_marketing_drip    1.405251
opted_in_to_mailing_list      1.401398
ORG_INVITE                    1.076612
GUEST_INVITE                  1.016688
invited_by_user_id            0.637653
dtype: float64

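Based on our PCA analysis, we can see that org_id accounts for the most variance out of all the features and provides the strongest indication, depending on a user's org_id, that the user will be an adopted user in the future. However, there are some other features that are useful, such as last_session_creation_time and enabled_for_marketing_drip, which also account for a lot of the variance. Keep in mind that PCA is unsupervised, so these loadings describe how the features vary among the users analysed rather than directly measuring how well each feature predicts adoption.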