In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
user_engagement_data_path = 'https://raw.githubusercontent.com/dqminhv/Springboard_RelaxInc/main/Data/takehome_user_engagement.csv'
users_data_path = 'https://raw.githubusercontent.com/dqminhv/Springboard_RelaxInc/main/Data/takehome_users.csv'

In [42]:
#Load users data to a dataframe
users = pd.read_csv(users_data_path, encoding='ISO-8859-1')

In [43]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [11]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [12]:
#Load user engagement to a dataframe
user_engagement = pd.read_csv(user_engagement_data_path)

In [13]:
user_engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [14]:
#Convert time_stamp column to datetime objects
user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'])

In [29]:
from datetime import timedelta

# Sort the DataFrame by user_id and timestamp
user_engagement.sort_values(by=['user_id', 'time_stamp'], inplace=True)

# Create a new column to represent the date (without time)
user_engagement['date'] = user_engagement['time_stamp'].dt.date

# Define a function to check if a user has logged in on three separate days within a 7-day period
def is_adopted(user_logins):
    login_dates = set(user_logins['date'])
    
    for i, login_date in enumerate(login_dates):
        # Check if there are at least two more login dates within the next 6 days
        next_dates = login_dates.intersection(user_logins['date'][i+1:i+3])
        if any(date - login_date <= timedelta(days=6) for date in next_dates):
            return True
    
    return False

In [36]:
adopted_users = user_engagement.groupby('user_id').filter(lambda x: is_adopted(x)).groupby('user_id').agg({'user_id':'first'})

In [38]:
adopted_users

Unnamed: 0_level_0,user_id
user_id,Unnamed: 1_level_1
2,2
10,10
19,19
20,20
25,25
...,...
11975,11975
11980,11980
11981,11981
11988,11988


In [44]:
#Merge the users dataframe with the adopted_users dataframe
users = pd.merge(users, adopted_users,
                left_on = 'object_id',
                right_on = adopted_users.index,
                how = 'left')

In [47]:
#Rename the user_id column to is_adopted
users.rename(columns={'user_id':'is_adopted'}, inplace=True)

In [49]:
users['is_adopted'] = users['is_adopted'].notna().astype(int)

In [50]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,is_adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0
