In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import researchpy as rp
from scipy import stats

In [2]:
# Load the two campaign exports: dfc holds the CurrentAd rows, dfa holds the group A rows
CURRENT_AD_CSV = 'CurrentAd_new - CurrentAd_new.csv'
GROUP_A_CSV = 'GroupA - GroupA.csv'

dfc = pd.read_csv(CURRENT_AD_CSV)
dfa = pd.read_csv(GROUP_A_CSV)

In [3]:
# Preview the first rows of the CurrentAd table (dfc) to check its columns and values
dfc.head()

Unnamed: 0,index,uuid,num_achievements,num_exercises,num_points,Click,Group,Button,Banner
0,0,981943945,37,166,254,1,CurrentAd,No,Yes
1,2,981944745,93,308,381,0,CurrentAd,No,Yes
2,3,981944801,91,260,306,0,CurrentAd,No,Yes
3,4,981944857,15,424,435,0,CurrentAd,No,Yes
4,6,981960025,41,293,374,0,CurrentAd,No,Yes


In [4]:
# Preview the first rows of the group A table (dfa); it has the same columns as dfc
dfa.head()

Unnamed: 0,index,uuid,num_achievements,num_exercises,num_points,Click,Group,Button,Banner
0,8,981949649,0,0,0,0,A,No,No
1,15,981972297,0,0,0,0,A,No,No
2,22,981992089,0,0,0,0,A,No,No
3,32,981974489,0,0,0,0,A,No,No
4,38,981983905,0,0,0,0,A,No,No


In [5]:
# Stack the two tables (they contain all the same columns), renumber the rows so
# every row label is unique, and discard the old non-sequential 'index' column in
# favor of that freshly generated row index.
df = pd.concat([dfc, dfa], ignore_index=True).drop(columns='index')

In [6]:
# Display the combined table to confirm the 'index' column is gone and the
# row labels now run sequentially from 0
df

Unnamed: 0,uuid,num_achievements,num_exercises,num_points,Click,Group,Button,Banner
0,981943945,37,166,254,1,CurrentAd,No,Yes
1,981944745,93,308,381,0,CurrentAd,No,Yes
2,981944801,91,260,306,0,CurrentAd,No,Yes
3,981944857,15,424,435,0,CurrentAd,No,Yes
4,981960025,41,293,374,0,CurrentAd,No,Yes
5,981954313,71,341,140,1,CurrentAd,No,Yes
6,981949961,60,547,18,0,CurrentAd,No,Yes
7,981952689,9,411,12,0,CurrentAd,No,Yes
8,982000809,65,62,247,0,CurrentAd,No,Yes
9,981990857,72,380,496,1,CurrentAd,No,Yes


In [7]:
# Check the data type and non-null count of each column in the combined table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27268 entries, 0 to 27267
Data columns (total 8 columns):
uuid                27268 non-null int64
num_achievements    27268 non-null int64
num_exercises       27268 non-null int64
num_points          27268 non-null int64
Click               27268 non-null int64
Group               27268 non-null object
Button              27268 non-null object
Banner              27268 non-null object
dtypes: int64(5), object(3)
memory usage: 1.7+ MB


In [8]:
# Columns that hold a small set of labels and should be stored as category dtype
category_columns = ['Group', 'Button', 'Banner']

# Convert each of the columns listed above to the category data type
for column_name in category_columns:
    df[column_name] = df[column_name].astype('category')

In [9]:
# Counts and percentages for Group and for Click, each summarized on its own:
# how many users were in each ad group, and how many clicked overall.
# (Clicks broken down per group come from the crosstab that follows.)
rp.summary_cat(df[['Group', 'Click']])

Unnamed: 0,Variable,Outcome,Count,Percent
0,Group,CurrentAd,14125,51.8
1,,A,13143,48.2
2,Click,0,26337,96.59
3,,1,931,3.41


In [10]:
# Contingency table of raw counts in prep for the Chi-squared test:
# one row per ad group, one column per Click value (0 = no click, 1 = click).
# This variable is only displayed for inspection; the Chi-squared cell rebuilds
# its own table from the same two columns rather than reusing it.
crosstab = pd.crosstab(df['Group'], df['Click'])
crosstab

Click,0,1
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
CurrentAd,13727,398
A,12610,533


In [11]:
# Chi-squared test of Click against Group. The call returns two objects:
# `table` (column percentages, because prop = 'col') and `results` (test statistics).
table, results = rp.crosstab(
    df['Click'],
    df['Group'],
    prop='col',
    test='chi-square',
)

# Show no click (0) / click (1) as a percentage of each group and of all users
table

Unnamed: 0_level_0,Group,Group,Group
Unnamed: 0_level_1,A,CurrentAd,All
Click,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,95.94,97.18,96.59
1,4.06,2.82,3.41
All,100.0,100.0,100.0


In [12]:
# Return results of the Chi-squared test
results

# The results we care about in the table below are the p-value and the effect size
# (reported in this table as Cramer's phi).
# The p-value prints as 0.0 only because of rounding: with a Chi-square statistic of about
# 31.6 on 1 degree of freedom it is far below 0.05, so there is a statistically significant
# relationship between ad groups and clicks.
# Cramer's phi is only about 0.03, which tells us that it is not a strong relationship.

Unnamed: 0,Chi-square test,results
0,Pearson Chi-square ( 1.0) =,31.6261
1,p-value =,0.0
2,Cramer's phi =,0.0341


In [13]:
# Next I am taking a look at the two different user groups that were introduced to ad A and CurrentAd
# and what we know about them

# Descriptive statistics for ad A
df[df['Group'] == 'A'].describe()

Unnamed: 0,uuid,num_achievements,num_exercises,num_points,Click
count,13143.0,13143.0,13143.0,13143.0,13143.0
mean,1034984000.0,1.029217,4.082553,4.082553,0.040554
std,66579320.0,0.894903,4.025102,4.025102,0.197262
min,961626700.0,0.0,0.0,0.0,0.0
25%,982169300.0,0.0,0.0,0.0,0.0
50%,983924000.0,1.0,2.0,2.0,0.0
75%,1099113000.0,2.0,8.0,8.0,0.0
max,1118685000.0,2.0,18.0,18.0,1.0


In [14]:
# Descriptive statistics for CurrentAd
df[df['Group'] == 'CurrentAd'].describe()

# It is worth noting that the two user groups are completely different by num_achievements,
# num_exercises, and num_points (e.g. mean num_achievements is about 49.5 for CurrentAd
# versus about 1.0 for A).
# NOTE(review): with groups this different, the gap in click rate may reflect the users
# rather than the ads themselves; confirm how users were assigned to each group.

Unnamed: 0,uuid,num_achievements,num_exercises,num_points,Click
count,14125.0,14125.0,14125.0,14125.0,14125.0
mean,1044499000.0,49.536,346.300956,352.137204,0.028177
std,67714630.0,28.74235,201.306499,199.49987,0.165484
min,961626500.0,0.0,0.0,0.0,0.0
25%,982027800.0,25.0,174.0,181.0,0.0
50%,1097832000.0,50.0,346.0,353.0,0.0
75%,1099048000.0,74.0,519.0,525.0,0.0
max,1118159000.0,99.0,699.0,699.0,1.0
