In [1]:
# imports (that I may not even use!)

import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, roc_curve, auc, confusion_matrix, accuracy_score, precision_score, f1_score, fbeta_score, classification_report
from sklearn import ensemble
from sklearn import tree, metrics
from sklearn import preprocessing

In [2]:
# bring in data sets
users = pd.read_csv('takehome_users.csv')
data = pd.read_csv('takehome_user_engagement.csv')

In [3]:
# format timestamps properly
data['time_stamp'] = pd.to_datetime(data['time_stamp'], format='%Y-%m-%d %H:%M:%S')
data

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
...,...,...,...
207912,2013-09-06 06:14:15,11996,1
207913,2013-01-15 18:28:37,11997,1
207914,2014-04-27 12:45:16,11998,1
207915,2012-06-02 11:55:59,11999,1


In [4]:
# Create df that only contains users with 3+ logins
adopteds = data['user_id'].value_counts()
adopteds = data[data.user_id.isin(adopteds.index[adopteds.gt(3)])]
adopteds['time_stamp'] = pd.to_datetime(adopteds['time_stamp'], format='%Y-%m-%d %H:%M:%S')
adopteds = pd.DataFrame(adopteds)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adopteds['time_stamp'] = pd.to_datetime(adopteds['time_stamp'], format='%Y-%m-%d %H:%M:%S')


In [5]:
# Create columns needed to calculate the number of login in a 7-day period

adopteds['year'] = adopteds['time_stamp'].dt.year
adopteds['month'] = adopteds['time_stamp'].dt.month
adopteds['day'] = adopteds['time_stamp'].dt.day
adopteds['dup_date'] = adopteds['year'].map(str) + adopteds['month'].map(str) + adopteds['day'].map(str)

adopteds['user_date_dup'] = adopteds['user_id'].map(str) + 'u' + adopteds['dup_date'].map(str)
adopteds = adopteds.drop(adopteds[['year','month','day','dup_date']], axis=1)
adopteds['user_id'].value_counts()

3623     606
906      600
1811     593
7590     590
8068     585
        ... 
9807       4
10520      4
6818       4
3917       4
9187       4
Name: user_id, Length: 2007, dtype: int64

In [6]:
# List of IDs of users with multiple logins

ids = list(adopteds.user_id.unique())
adopted_dict = {}

In [7]:
# Loop through users, identifying the adopted ones

for i in ids:
    data_user = data[data['user_id'] == i]
    size = data_user.shape[0]-2
    indexes = range(0,size)
    index_ctr = 0
    
    for x in list(indexes):
    
        index_ctr = index_ctr + 2

        if index_ctr != size:
            d1 = data_user['time_stamp'].iloc[x+1] - data_user['time_stamp'].iloc[x]
            d1 = d1.days
            d2 = data_user['time_stamp'].iloc[x+2] - data_user['time_stamp'].iloc[x+1]
            d2 = d2.days
            dsum = d1 + d2

            if dsum <= 7:
                adopted_dict[i] = 1
        

In [8]:
users.columns

Index(['object_id', 'creation_time', 'name', 'email', 'creation_source',
       'last_session_creation_time', 'opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id'],
      dtype='object')

In [9]:
users['adopted'] = users['object_id'].map(adopted_dict)

In [10]:
users['adopted'] = users['adopted'].fillna(0)

In [11]:
users_analysis = users.drop(['creation_time','last_session_creation_time','object_id','name', 'email'], axis=1)
users_analysis = pd.get_dummies(users_analysis, prefix=['source','org','invited_by'], columns=['creation_source','org_id','invited_by_user_id'])
#users_analysis = users_analysis.drop(['source_ORG_INVITE', 'org_1','invited_by_10741.0'], axis=1)
users_analysis

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,adopted,source_GUEST_INVITE,source_ORG_INVITE,source_PERSONAL_PROJECTS,source_SIGNUP,source_SIGNUP_GOOGLE_AUTH,org_0,org_1,...,invited_by_11966.0,invited_by_11972.0,invited_by_11973.0,invited_by_11974.0,invited_by_11978.0,invited_by_11981.0,invited_by_11986.0,invited_by_11994.0,invited_by_11997.0,invited_by_11999.0
0,1,0,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1.0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0.0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0,0,0.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11996,0,0,0.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
11997,1,1,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11998,0,0,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
users_features = users_analysis.drop('adopted', axis=1)
users_adopted = users_analysis.adopted
X_train, X_test, y_train, y_test = train_test_split(users_features, users_adopted,test_size=.25, random_state=8)

In [13]:
rf=ensemble.RandomForestClassifier(max_depth=3)

rf.fit(X_train, y_train)
y_pred = rf.predict_proba(X_test)[:,1]
y_predict = rf.predict(X_test)
y_predict

array([0., 0., 0., ..., 0., 0., 0.])

In [14]:

rf_fpr, rf_tpr, threshold = roc_curve(y_test, y_pred)
auc_rf = auc(rf_fpr, rf_tpr)
print(auc_rf)

0.5701231948360503


In [15]:
fi_rf = pd.DataFrame(rf.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance', ascending=False)
fi_rf.head(5)

Unnamed: 0,importance
source_ORG_INVITE,0.022706
source_GUEST_INVITE,0.022593
source_PERSONAL_PROJECTS,0.019971
invited_by_5855.0,0.019789
invited_by_6419.0,0.015527
