In [43]:
# regular imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Stats test
from  scipy.stats import chisquare
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind

In [2]:
# import data 
takehome_user_engagement = pd.read_csv("takehome_user_engagement_2.csv")

In [3]:
# importing as a CSV had issues so I saved the doc as an Excel file and imported that
takehome_users = pd.read_excel("takehome_users_.xls")

### Take a look at the data 

In [4]:
# the engagement table has 207,917 rows (engagement records, not yet de-duplicated by day) and 3 columns
takehome_user_engagement.shape

(207917, 3)

In [5]:
takehome_users.shape

(12000, 10)

In [6]:
# we have 12k users total in the takehome_users dataframe
takehome_users.object_id.nunique()

12000

In [7]:
takehome_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:00,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:00,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:00,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:00,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:00,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [8]:
takehome_user_engagement.head(2)

Unnamed: 0,time_stamp,user_id,visited
0,4/22/2014 3:53,1,1
1,11/15/2013 3:45,2,1


In [9]:
# We have 8823 users who engaged at all 
takehome_user_engagement.user_id.nunique()

8823

In [10]:
takehome_user_engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [11]:
# Turn the time stamp to a date column 
takehome_user_engagement["date"] = pd.to_datetime(takehome_user_engagement["time_stamp"])

In [12]:
takehome_user_engagement.head(4)

Unnamed: 0,time_stamp,user_id,visited,date
0,4/22/2014 3:53,1,1,2014-04-22 03:53:00
1,11/15/2013 3:45,2,1,2013-11-15 03:45:00
2,11/29/2013 3:45,2,1,2013-11-29 03:45:00
3,12/9/2013 3:45,2,1,2013-12-09 03:45:00


In [13]:
takehome_user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited,date
0,4/22/2014 3:53,1,1,2014-04-22 03:53:00
1,11/15/2013 3:45,2,1,2013-11-15 03:45:00
2,11/29/2013 3:45,2,1,2013-11-29 03:45:00
3,12/9/2013 3:45,2,1,2013-12-09 03:45:00
4,12/25/2013 3:45,2,1,2013-12-25 03:45:00


#### We only care about the date not the time so we remove that

In [14]:
# Keep only the calendar day. `.dt.normalize()` zeroes the time of day but keeps
# the datetime64 dtype (calling `.date()` per row would produce Python objects),
# so later date subtraction stays vectorised. Wrapping the column in
# `pd.to_datetime` makes this cell safe to run more than once.
takehome_user_engagement["date"] = pd.to_datetime(takehome_user_engagement["date"]).dt.normalize()

In [15]:
# from this we see that the visited column is not needed
takehome_user_engagement.visited.value_counts()

1    207917
Name: visited, dtype: int64

In [16]:
takehome_user_engagement = takehome_user_engagement[["user_id", "date"]]

In [17]:
takehome_user_engagement.head()

Unnamed: 0,user_id,date
0,1,2014-04-22
1,2,2013-11-15
2,2,2013-11-29
3,2,2013-12-09
4,2,2013-12-25


In [18]:
# We only care whether a user visited at least once on a given day, so drop
# repeated (user_id, date) rows. Reassigning instead of using `inplace=True`
# avoids mutating a column slice of the earlier frame, which can raise
# SettingWithCopyWarning.
takehome_user_engagement = takehome_user_engagement.drop_duplicates()

In [19]:
takehome_user_engagement.head()

Unnamed: 0,user_id,date
0,1,2014-04-22
1,2,2013-11-15
2,2,2013-11-29
3,2,2013-12-09
4,2,2013-12-25


In [20]:
# Adopted users need at least three distinct visit days, so users with fewer
# than three rows can be ruled out up front.
# The counts column is named explicitly so that `v_c["user_id"]` is valid on
# pandas >= 2.0 as well, where `value_counts()` names its result "count"
# rather than after the source column.
v_c = takehome_user_engagement["user_id"].value_counts().rename("user_id").to_frame()
potential_users = v_c[v_c["user_id"] > 2].index

In [21]:
# Keep only rows belonging to users with three or more visit days.
# The expression is wrapped in brackets so it can span lines; a bare line
# break right after `=` is a SyntaxError.
takehome_user_engagement = takehome_user_engagement[
    takehome_user_engagement["user_id"].isin(potential_users)
]

In [22]:
takehome_user_engagement.user_id.nunique()

2248

### From the original 8,823 with user engagement only 2,248 could be adopted users

###  Next create a method to figure out which of these users are adopted. 

In [30]:
# we take one sample user
sample = takehome_user_engagement[takehome_user_engagement["user_id"] == 2]

sample = sample.reset_index()

# by subtracting from each visit date the date two visits earlier, we can
# see whether the user had three sessions within a one-week period.
sample["time_diff"]= (sample['date']-sample['date'].shift(periods=2))
sample

Unnamed: 0,index,user_id,date,time_diff
0,1,2,2013-11-15,NaT
1,2,2,2013-11-29,NaT
2,3,2,2013-12-09,24 days
3,4,2,2013-12-25,26 days
4,5,2,2013-12-31,22 days
5,6,2,2014-01-08,14 days
6,7,2,2014-02-03,34 days
7,8,2,2014-02-08,31 days
8,9,2,2014-02-09,6 days
9,10,2,2014-02-13,5 days


In [29]:
# This will create a list of True or False for the users visits. 
# a true will only appear if they visited at least three times in 7 days, 
#thus indicating that the user is an adopted user. 
t_list= list(sample['time_diff']<"8 days")
t_list

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 False]

In [31]:
def find_adopted_users(df, window_days=7):
    '''Return the ids of users who have three distinct visit days close together.

    A user qualifies when some visit day and that user's visit day two visits
    earlier are at most ``window_days`` days apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``user_id`` column and a ``date`` column. ``date`` may hold
        anything ``pd.to_datetime`` accepts (timestamps, ``datetime.date``
        objects, date strings). Rows may be in any order and may repeat.
    window_days : int, default 7
        Largest allowed gap, in days, between the first and third visit day.
        The default reproduces the previously hard-coded ``< "8 days"`` check
        on whole-day data.
        NOTE(review): a 7-day gap spans 8 calendar days; pass 6 if the intended
        definition is "three visits inside 7 calendar days" -- confirm.

    Returns
    -------
    list
        Qualifying user ids, in order of first appearance in ``df``.
    '''
    visits = pd.DataFrame({
        "user_id": df["user_id"].to_numpy(),
        # Truncate to the calendar day so that only distinct days count.
        "date": pd.to_datetime(df["date"]).dt.normalize().to_numpy(),
    })
    # Sorting matters: on unsorted input the two-back difference can be
    # negative, and a negative gap would wrongly pass the "small gap" test.
    visits = (
        visits.drop_duplicates()
        .sort_values(["user_id", "date"])
        .reset_index(drop=True)
    )
    # Gap between each visit day and the same user's visit day two rows earlier.
    # It is NaT for a user's first two visits, which never compares as True.
    two_back = visits.groupby("user_id")["date"].shift(2)
    span = visits["date"] - two_back
    adopted = set(visits.loc[span <= pd.Timedelta(days=window_days), "user_id"])
    # Same output order as before: first appearance in the input frame.
    return [user for user in df["user_id"].unique() if user in adopted]

In [32]:
adopted_users = find_adopted_users(takehome_user_engagement)

In [33]:
len(adopted_users)

1656

#### Only 1,656 users are adopted users. 

In [35]:
# Create a new column which indicates if a user is adopted
takehome_users["adopted_users"] = takehome_users["object_id"].isin(adopted_users) 

In [36]:
# take a look at the new column
takehome_users[["name",'object_id','adopted_users']].head()

Unnamed: 0,name,object_id,adopted_users
0,Clausen August,1,False
1,Poole Matthew,2,True
2,Bottrill Mitchell,3,False
3,Clausen Nicklas,4,False
4,Raw Grace,5,False


In [37]:
# Year in which each user's account was created, via the vectorised `.dt`
# accessor rather than a per-row `apply`.
takehome_users["creation_year"] = takehome_users["creation_time"].dt.year

In [38]:
# Calendar month (1-12) in which each user's account was created, via the
# vectorised `.dt` accessor rather than a per-row `apply`.
takehome_users["creation_month"] = takehome_users["creation_time"].dt.month

In [39]:
takehome_users.adopted_users.value_counts()

False    10344
True      1656
Name: adopted_users, dtype: int64

In [40]:
# Share of all users who are adopted, in percent (13.8% on this data).
# Derived from the data rather than typed-in totals, so it stays correct on re-runs.
len(adopted_users) / len(takehome_users) * 100

13.8

In [55]:
# 18.8% of all users who logged on are adopted users  
1656/8823*100

18.769126147568855

In [56]:
# Share of users with three or more visits who are adopted, in percent
# (73.7% on this data). At this point `takehome_user_engagement` only holds
# users with three or more visit days, so its unique user count is the denominator.
len(adopted_users) / takehome_user_engagement["user_id"].nunique() * 100

73.66548042704626

### 13.8% of Users are Adopted Users

### 18.8% of Users who ever logged on are Adopted Users 
### 73.7% of Users who had three or more visits are Adopted Users 

In [41]:
takehome_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 13 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
adopted_users                 12000 non-null bool
creation_year                 12000 non-null int64
creation_month                12000 non-null int64
dtypes: bool(1), datetime64[ns](1), float64(2), int64(6), object(3)
memory usage: 1.1+ MB


### Compare the adopted users to the non-adopted users

In [49]:
def t_test_function(df,var1,var2):
    '''Independent two-sample t-test on ``var2`` between the two ``var1`` groups.

    Rows where ``var1`` equals True form the first sample and rows where it
    equals False form the second. Each sample is handed to ``ttest_ind`` as a
    one-column DataFrame, so the statistic and p-value come back as length-1
    arrays.
    '''
    flag = df[var1]
    target_cols = [var2]
    group_true = df.loc[flag == True, target_cols]
    group_false = df.loc[flag == False, target_cols]
    result = ttest_ind(group_true, group_false)
    return result

In [54]:
def chi_cont_func(df, col1,col2):
    '''Chi-squared test of independence between a 0/1 column and a categorical column.

    Builds a 2 x k contingency table and passes it to
    ``scipy.stats.chi2_contingency``. Row 0 counts the rows where ``col1 == 0``
    and row 1 the rows where ``col1 == 1``; there is one column per distinct
    ``col2`` value, in sorted order.

    Both groups are counted in a single cross-tabulation so the columns always
    line up: a category seen in only one group gets an explicit 0 for the other,
    instead of shifting or dropping a column.

    Parameters
    ----------
    df : pandas.DataFrame
    col1 : str
        Name of the 0/1 (or boolean) column. Rows with any other value are ignored.
    col2 : str
        Name of the categorical column.

    Returns
    -------
    The result of ``chi2_contingency``: statistic, p-value, degrees of freedom
    and the array of expected frequencies.
    '''
    is_zero = df[col1] == 0
    is_one = df[col1] == 1
    subset = df[is_zero | is_one]
    table = pd.crosstab(subset[col1] == 1, subset[col2])
    # Guarantee both rows exist, in (col1 == 0, col1 == 1) order.
    table = table.reindex([False, True], fill_value=0)
    return chi2_contingency(table.to_numpy())

#### Examine percent of takehome_users who opted in to the mailing list

In [50]:
takehome_users.groupby("adopted_users")[["opted_in_to_mailing_list"]].mean()* 100

Unnamed: 0_level_0,opted_in_to_mailing_list
adopted_users,Unnamed: 1_level_1
False,24.796984
True,25.905797


In [51]:
t_test_function(takehome_users,'adopted_users','opted_in_to_mailing_list')

Ttest_indResult(statistic=array([0.96808254]), pvalue=array([0.33302266]))

#### The difference here is only 1.1 percentage points, and with a p-value of 0.33 it is clearly not significant. 

### Examine percent of takehome_users who enabled for marketing drip

In [68]:
takehome_users.groupby("adopted_users")[["enabled_for_marketing_drip"]].mean()* 100

Unnamed: 0_level_0,enabled_for_marketing_drip
adopted_users,Unnamed: 1_level_1
False,14.83952
True,15.519324


In [52]:
t_test_function(takehome_users,'adopted_users','enabled_for_marketing_drip')

Ttest_indResult(statistic=array([0.72057983]), pvalue=array([0.4711821]))

#### The difference here is only 0.7 percentage points, and with a p-value of 0.47 it is clearly not significant. 

### Examine adoption rate by creation year

In [128]:
takehome_users.groupby("creation_year")[["adopted_users"]].mean()* 100

Unnamed: 0_level_0,adopted_users
creation_year,Unnamed: 1_level_1
2012,16.732213
2013,15.415786
2014,8.874398


In [56]:
chi_cont_func(df=takehome_users, col1='adopted_users',col2='creation_year')

(104.60798001682231,
 1.9260420201466773e-23,
 2,
 array([[2411.014, 4892.712, 3040.274],
        [ 385.986,  783.288,  486.726]]))

#### Creation year is statistically significant (p-value < 0.001); adoption rates differ by up to 7.9 percentage points (2012 vs. 2014)

## Creation Source Evaluated

In [115]:
takehome_users.groupby("creation_source")[["adopted_users"]].mean()* 100

Unnamed: 0_level_0,adopted_users
creation_source,Unnamed: 1_level_1
GUEST_INVITE,17.059639
ORG_INVITE,13.493183
PERSONAL_PROJECTS,8.147797
SIGNUP,14.470532
SIGNUP_GOOGLE_AUTH,17.256318


In [62]:
chi_cont_func(df=takehome_users, col1='adopted_users',col2='creation_source')

(91.04842041636412,
 7.884346873316259e-19,
 4,
 array([[1864.506, 3666.948, 1819.682, 1798.994, 1193.87 ],
        [ 298.494,  587.052,  291.318,  288.006,  191.13 ]]))

#### Creation Source is statistically significant (p-value < 0.001); adoption rates differ by up to 9.1 percentage points (SIGNUP_GOOGLE_AUTH vs. PERSONAL_PROJECTS)

## Examine creation month's effect on a user's adoption

In [130]:
cray_mo = takehome_users.groupby("creation_month")[["adopted_users"]].mean()* 100
cray_mo.reset_index().sort_values('adopted_users', ascending=False )

Unnamed: 0,creation_month,adopted_users
5,6,18.192628
9,10,17.532468
7,8,17.018779
10,11,16.132479
8,9,15.760266
1,2,14.668094
11,12,14.623656
0,1,14.577259
6,7,14.43662
2,3,13.93373


In [63]:
chi_cont_func(df=takehome_users, col1='adopted_users',col2='creation_month')

(143.56913401741423,
 3.057306456267819e-25,
 11,
 array([[ 886.998,  805.108, 1014.574, 1007.678, 1254.21 ,  724.942,
          734.424,  734.424,  776.662,  796.488,  806.832,  801.66 ],
        [ 142.002,  128.892,  162.426,  161.322,  200.79 ,  116.058,
          117.576,  117.576,  124.338,  127.512,  129.168,  128.34 ]]))

#### Creation month is statistically significant with p-value < 0.001 and has max difference of 12.4%

## Examine org_id  

In [61]:
org_id_df = takehome_users.groupby("org_id")[["adopted_users"]].mean()* 100
org_id_df.reset_index().sort_values('adopted_users', ascending=False ).head(3)

Unnamed: 0,org_id,adopted_users
387,387,58.333333
235,235,46.153846
270,270,42.857143


In [60]:
org_id_df.reset_index().sort_values('adopted_users', ascending=False ).tail(3)

Unnamed: 0,org_id,adopted_users
355,355,0.0
365,365,0.0
416,416,0.0


In [66]:
# Distribution of organisation sizes: the inner value_counts gives users per
# org, the outer one counts how many orgs have each size. Chaining the two
# Series avoids the to_frame()/reset_index() round trip, whose column names
# changed in pandas 2.0 and would make `.org_id` select the ids rather than the counts.
takehome_users["org_id"].value_counts().value_counts()

17     35
23     23
24     22
19     22
15     21
16     20
18     20
20     18
22     15
14     13
25     12
28     12
12     12
13     11
21     11
29     11
27     10
26      9
40      7
30      7
35      6
31      5
10      5
39      5
38      5
48      4
9       4
45      3
34      3
32      3
       ..
319     1
41      1
68      1
201     1
168     1
159     1
138     1
128     1
124     1
119     1
104     1
97      1
87      1
74      1
73      1
64      1
233     1
63      1
62      1
61      1
60      1
59      1
58      1
56      1
55      1
54      1
52      1
49      1
47      1
2       1
Name: org_id, Length: 75, dtype: int64

## invited_by_user_id  

In [69]:
org_id_df = takehome_users.groupby("invited_by_user_id")[["adopted_users"]].mean()* 100
org_id_df.reset_index().sort_values('adopted_users', ascending=False ).head(3)

Unnamed: 0,invited_by_user_id,adopted_users
1080,5022.0,100.0
180,907.0,100.0
1292,5925.0,100.0


In [71]:
org_id_df.reset_index().sort_values('adopted_users', ascending=False ).tail(3)

Unnamed: 0,invited_by_user_id,adopted_users
993,4615.0,0.0
991,4604.0,0.0
2563,11999.0,0.0


In [135]:
# Distribution of invites per inviter: the inner value_counts gives invited
# users per inviter, the outer one counts how many inviters have each total.
# Chaining the two Series avoids the to_frame()/reset_index() round trip, whose
# column names changed in pandas 2.0 and would select the ids rather than the counts.
takehome_users["invited_by_user_id"].value_counts().value_counts()

1     1097
2      527
3      347
4      205
5      165
6       98
7       60
8       36
9       14
10      10
11       3
13       1
12       1
Name: invited_by_user_id, dtype: int64

#### Given that any single user invited only between 1 and 13 others, each inviter's adoption rate rests on very few observations, so this feature is unlikely to be helpful as a predictive feature. 