## AB Testing for ShoeFly.com

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the ad-view log; each row of the table is one view of an ad.
ad_clicks = pd.read_csv(filepath_or_buffer="ad_clicks.csv")
# Preview the first five rows to check the available columns.
ad_clicks.head(n=5)

Unnamed: 0,user_id,utm_source,day,ad_click_timestamp,experimental_group
0,008b7c6c-7272-471e-b90e-930d548bd8d7,google,6 - Saturday,7:18,A
1,009abb94-5e14-4b6c-bb1c-4f4df7aa7557,facebook,7 - Sunday,,B
2,00f5d532-ed58-4570-b6d2-768df5f41aed,twitter,2 - Tuesday,,A
3,011adc64-0f44-4fd9-a0bb-f1506d2ad439,google,2 - Tuesday,,B
4,012137e6-7ae7-4649-af68-205b4702169c,facebook,7 - Sunday,,B


#### Exercise 1
Your manager wants to know which ad platform is getting you the most views.

How many views (i.e., rows of the table) came from each utm_source?

In [10]:
# Views per ad platform: count the user_id entries within each utm_source
# and flatten the result back into an ordinary two-column table.
(
    ad_clicks
    .groupby("utm_source")
    .user_id
    .count()
    .reset_index()
)

Unnamed: 0,utm_source,user_id
0,email,255
1,facebook,504
2,google,680
3,twitter,215


#### Exercise 2

If the column ad_click_timestamp is not null, then someone actually clicked on the ad that was displayed.

Create a new column called is_click, which is True if ad_click_timestamp is not null and False otherwise.

In [15]:
# A view counts as a click when a click timestamp was recorded for it.
# Series.notna() is vectorized and treats every missing-value marker
# (NaN, None, NaT, pd.NA) as missing. An identity test such as
# `x is not np.nan` only recognizes the single np.nan object, so any other
# missing marker would be reported as a click.
ad_clicks["is_click"] = ad_clicks["ad_click_timestamp"].notna()
ad_clicks.head()

Unnamed: 0,user_id,utm_source,day,ad_click_timestamp,experimental_group,is_click
0,008b7c6c-7272-471e-b90e-930d548bd8d7,google,6 - Saturday,7:18,A,True
1,009abb94-5e14-4b6c-bb1c-4f4df7aa7557,facebook,7 - Sunday,,B,False
2,00f5d532-ed58-4570-b6d2-768df5f41aed,twitter,2 - Tuesday,,A,False
3,011adc64-0f44-4fd9-a0bb-f1506d2ad439,google,2 - Tuesday,,B,False
4,012137e6-7ae7-4649-af68-205b4702169c,facebook,7 - Sunday,,B,False


#### Exercise 3
We want to know the percent of people who clicked on ads from each utm_source.

Start by grouping by utm_source and is_click and counting the number of user_ids in each of those groups. Save your answer to the variable clicks_by_source.

In [18]:
# Number of views for every (utm_source, is_click) combination,
# returned as a flat table with one row per combination.
clicks_by_source = (
    ad_clicks
    .groupby(["utm_source", "is_click"])
    .user_id
    .count()
    .reset_index()
)
clicks_by_source

Unnamed: 0,utm_source,is_click,user_id
0,email,False,175
1,email,True,80
2,facebook,False,324
3,facebook,True,180
4,google,False,441
5,google,True,239
6,twitter,False,149
7,twitter,True,66


#### Exercise 4
Now let’s pivot the data so that the columns are is_click (either True or False), the index is utm_source, and the values are user_id.

Save your results to the variable clicks_pivot.

In [24]:
# Reshape to one row per utm_source with the False / True view counts
# as separate columns; reset_index turns utm_source back into a column.
clicks_pivot = (
    clicks_by_source
    .pivot(index="utm_source", columns="is_click", values="user_id")
    .reset_index()
)
clicks_pivot

is_click,utm_source,False,True
0,email,175,80
1,facebook,324,180
2,google,441,239
3,twitter,149,66


#### Exercise 5
Create a new column in clicks_pivot called percent_clicked which is equal to the percent of users who clicked on the ad from each utm_source.

Was there a difference in click rates for each source?

In [54]:
# Percent of views that became clicks, per utm_source.
# The denominator is built from the two count columns only. Summing every
# numeric column of the row would also include percent_clicked once it
# exists, so each re-run of this cell would give a different, wrong result.
# Columns are addressed by their boolean labels (True / False) rather than
# by the integer 1, which only matches because 1 == True.
# fill_value=0 treats a missing False count as zero views.
clicks_pivot["percent_clicked"] = round(
    clicks_pivot[True]
    / clicks_pivot[True].add(clicks_pivot[False], fill_value=0)
    * 100,
    3,
)

In [55]:
clicks_pivot

is_click,utm_source,False,True,percent_clicked
0,email,175,80,31.373
1,facebook,324,180,35.714
2,google,441,239,35.147
3,twitter,149,66,30.698


#### Exercise 6
The column experimental_group tells us whether the user was shown Ad A or Ad B.

Were approximately the same number of people shown both ads?


In [60]:
# Number of rows (views) in each experimental group, as a small table.
(
    ad_clicks
    .experimental_group
    .value_counts()
    .reset_index()
)

Unnamed: 0,index,experimental_group
0,A,827
1,B,827


#### Exercise 7
Using the column is_click that we defined earlier, check to see if a greater percentage of users clicked on Ad A or Ad B

In [75]:
# Count views per (experimental_group, is_click) pair, then reshape so each
# ad variant is one row with its False / True counts as columns.
pivot_AB = (
    ad_clicks
    .groupby(["experimental_group", "is_click"])
    .user_id
    .count()
    .reset_index()
    .pivot(index="experimental_group", columns="is_click", values="user_id")
    .reset_index()
)
pivot_AB

is_click,experimental_group,False,True
0,A,517,310
1,B,572,255


In [85]:
# Percent of views that became clicks, per ad variant.
# Only the False / True count columns go into the denominator. A row-wise
# sum over all numeric columns would also include click_percentage once it
# exists, making the result change on every re-run of this cell.
# fill_value=0 treats a missing False count as zero views.
pivot_AB["click_percentage"] = round(
    pivot_AB[True]
    / pivot_AB[True].add(pivot_AB[False], fill_value=0)
    * 100,
    3,
)
pivot_AB

is_click,experimental_group,False,True,click_percentage
0,A,517,310,37.485
1,B,572,255,30.834


#### Exercise 8

The Product Manager for the A/B test thinks that the clicks might have changed by day of the week.

Start by creating two DataFrames: a_clicks and b_clicks, which contain only the results for A group and B group, respectively.

In [89]:
# Split the views by which ad variant the user was shown.
a_clicks = ad_clicks.loc[ad_clicks.experimental_group == "A"]
b_clicks = ad_clicks.loc[ad_clicks.experimental_group == "B"]

#### Exercise 9
For each group (a_clicks and b_clicks), calculate the percent of users who clicked on the ad by day.

In [166]:
# Group A: views per (day, is_click) pair, reshaped to one row per day
# with the False / True counts as columns.
a_clicks_pivot = (
    a_clicks
    .groupby(["day", "is_click"])
    .user_id
    .count()
    .reset_index()
    .pivot(index="day", columns="is_click", values="user_id")
    .reset_index()
)
a_clicks_pivot

is_click,day,False,True
0,1 - Monday,70,43
1,2 - Tuesday,76,43
2,3 - Wednesday,86,38
3,4 - Thursday,69,47
4,5 - Friday,77,51
5,6 - Saturday,73,45
6,7 - Sunday,66,43


In [173]:
# Inspect the per-day click counts for group A. These live in the column
# labelled True; no "clik_perc" column is ever created in this notebook, so
# looking that name up would raise a KeyError on a fresh kernel.
a_clicks_pivot[True]

0    43
1    43
2    38
3    47
4    51
5    45
6    43
Name: True, dtype: int64

In [203]:
# Group A: percent of views that became clicks, per day.
# The denominator uses the False / True count columns only. Summing all
# numeric columns of the row would also include click_perc (and any other
# added column) once it exists, so re-running the cell would change the result.
# fill_value=0 treats a missing False count as zero views.
a_clicks_pivot["click_perc"] = round(
    a_clicks_pivot[True]
    / a_clicks_pivot[True].add(a_clicks_pivot[False], fill_value=0)
    * 100,
    3,
)
a_clicks_pivot

is_click,day,False,True,click_perc
0,1 - Monday,70,43,38.053
1,2 - Tuesday,76,43,36.134
2,3 - Wednesday,86,38,30.645
3,4 - Thursday,69,47,40.517
4,5 - Friday,77,51,39.844
5,6 - Saturday,73,45,38.136
6,7 - Sunday,66,43,39.45


In [200]:
# Group B: views per (day, is_click) pair, reshaped to one row per day
# with the False / True counts as columns.
b_clicks_pivot = (
    b_clicks
    .groupby(["day", "is_click"])
    .user_id
    .count()
    .reset_index()
    .pivot(index="day", columns="is_click", values="user_id")
    .reset_index()
)
b_clicks_pivot

is_click,day,False,True
0,1 - Monday,81,32
1,2 - Tuesday,74,45
2,3 - Wednesday,89,35
3,4 - Thursday,87,29
4,5 - Friday,90,38
5,6 - Saturday,76,42
6,7 - Sunday,75,34


In [205]:
# Group B: percent of views that became clicks, per day.
# The denominator uses the False / True count columns only. Summing all
# numeric columns of the row would also include click_perc once it exists,
# so re-running the cell would change the result.
# fill_value=0 treats a missing False count as zero views.
b_clicks_pivot["click_perc"] = round(
    b_clicks_pivot[True]
    / b_clicks_pivot[True].add(b_clicks_pivot[False], fill_value=0)
    * 100,
    3,
)
b_clicks_pivot

is_click,day,False,True,click_perc
0,1 - Monday,81,32,28.319
1,2 - Tuesday,74,45,37.815
2,3 - Wednesday,89,35,28.226
3,4 - Thursday,87,29,25.0
4,5 - Friday,90,38,29.688
5,6 - Saturday,76,42,35.593
6,7 - Sunday,75,34,31.193


In [208]:
# Daily click rates of both ads side by side: join the two per-day tables on
# "day" and keep only the suffixed click_perc columns.
(
    a_clicks_pivot
    .merge(b_clicks_pivot, on="day", suffixes=("_A", "_B"))
    [["day", "click_perc_A", "click_perc_B"]]
)

is_click,day,click_perc_A,click_perc_B
0,1 - Monday,38.053,28.319
1,2 - Tuesday,36.134,37.815
2,3 - Wednesday,30.645,28.226
3,4 - Thursday,40.517,25.0
4,5 - Friday,39.844,29.688
5,6 - Saturday,38.136,35.593
6,7 - Sunday,39.45,31.193


#### Exercise 10
Compare the results for A and B. What happened over the course of the week?

Do you recommend that your company use Ad A or Ad B?

In [210]:
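# Using clicks / (clicks + non-clicks) from the per-day counts above, Ad A's click rate
# is higher than Ad B's on 6 of the 7 days; B is ahead only on Tuesday (37.8% vs. 36.1%).
# Over the whole week, 37.5% of users clicked Ad A versus 30.8% for Ad B
# (310 vs. 255 clicks out of 827 views each). That's why I recommend Ad A.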