In [1]:
#Package Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from datetime import timedelta

In [2]:
#Importing CSV to datafram. 
engagement = pd.read_csv("takehome_user_engagement.csv", encoding = "utf-8")
users = pd.read_csv("takehome_users.csv", encoding = "latin")

In [3]:
#set the time_stamp to datetime and the set it as the index
engagement.time_stamp = pd.to_datetime(engagement.time_stamp)
engagement = engagement.set_index('time_stamp', drop= True)

In [4]:
#Custom function to determing what users have been active for 3 of the past 7 days.
def label_adopted(x):    
    df_temp = engagement.loc[engagement['user_id'] == x] 
    df_temp = df_temp.resample('D').mean().dropna() 
    adopted = 0
    for i in range(len(df_temp)-2): 
        if df_temp.index[i + 2] - df_temp.index[i] <= timedelta(days=7):
            adopted = 1
            break
        else:
            adopted = 0
    return adopted

In [5]:
#Creating a column of 'adopted users'
users['adopted_user'] = users['object_id'].apply(label_adopted)

In [6]:
print("Number of Adopted Users: ", sum(users['adopted_user']))
print("Percent of Adopted Users: ", round((sum(users.adopted_user)/len(users.adopted_user)*100),2),"%")

Number of Adopted Users:  1656
Percent of Adopted Users:  13.8 %


In [7]:
#Checking data types and null counts
display(engagement.info())
users.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 207917 entries, 2014-04-22 03:53:30 to 2014-01-26 08:57:12
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   user_id  207917 non-null  int64
 1   visited  207917 non-null  int64
dtypes: int64(2)
memory usage: 4.8 MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
 10  adopted_user                12000 non-null  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 1.0+ MB


In [8]:
#Checking first few rows of each
display(engagement.head())
users.head()

Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2,1


Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0


In [9]:
#Merging the two dataframes
merged = users.merge(engagement, how='outer', left_on='object_id', right_on='user_id')
merged

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user,user_id,visited
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1.398139e+09,1,0,11,10803.0,0,1.0,1.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1.396238e+09,0,0,1,316.0,1,2.0,1.0
2,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1.396238e+09,0,0,1,316.0,1,2.0,1.0
3,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1.396238e+09,0,0,1,316.0,1,2.0,1.0
4,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1.396238e+09,0,0,1,316.0,1,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
211089,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,1.378448e+09,0,0,89,8263.0,0,11996.0,1.0
211090,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,1.358275e+09,0,0,200,,0,11997.0,1.0
211091,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,1.398603e+09,1,1,83,8074.0,0,11998.0,1.0
211092,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,1.338638e+09,0,0,6,,0,11999.0,1.0


In [10]:
#Rechecking column datatypes
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211094 entries, 0 to 211093
Data columns (total 13 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   object_id                   211094 non-null  int64  
 1   creation_time               211094 non-null  object 
 2   name                        211094 non-null  object 
 3   email                       211094 non-null  object 
 4   creation_source             211094 non-null  object 
 5   last_session_creation_time  207917 non-null  float64
 6   opted_in_to_mailing_list    211094 non-null  int64  
 7   enabled_for_marketing_drip  211094 non-null  int64  
 8   org_id                      211094 non-null  int64  
 9   invited_by_user_id          118528 non-null  float64
 10  adopted_user                211094 non-null  int64  
 11  user_id                     207917 non-null  float64
 12  visited                     207917 non-null  float64
dtypes: float64(4),

In [12]:
#Handling missing values
merged['last_session_creation_time'].dropna(inplace=True)
merged['invited_by_user_id'].fillna(0, inplace=True)
merged['user_id'].fillna(0, inplace = True)
merged['visited'].fillna(0, inplace = True)

In [14]:
#Converting column data types
merged.creation_time=pd.to_datetime(merged.creation_time)
merged.last_session_creation_time = pd.to_datetime(merged.last_session_creation_time)

merged['invited_by_user_id'] = merged['invited_by_user_id'].astype('int')
merged['user_id'] = merged['user_id'].astype('int')
merged['visited'] = merged['visited'].astype('int')

In [15]:
from sklearn.preprocessing import LabelEncoder

gle = LabelEncoder()
creation_labels = gle.fit_transform(users['creation_source'])
feature_df.creation_source = creation_labels

org_id_labels = gle.fit_transform(users['org_id'])
feature_df.org_id = org_id_labels

invited_labels = gle.fit_transform(users['invited_by_user_id'])
feature_df.org_id = invited_labels

email_labels = gle.fit_transform(users['email_provider'])
feature_df.email_provider = email_labels

NameError: name 'feature_df' is not defined