In [9]:
import numpy as np
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest as ztest

from authentication.authenticator import Authenticator
from sheets.sheetmanager import SheetManager

In [10]:
# --- Configuration -----------------------------------------------------------
# Path of the Google API key file and the OAuth scope(s) requested with it.
keys = 'credentials.json'
SCOPES = ['https://www.googleapis.com/auth/drive']

# The response data and its codebook use the same spreadsheet id, so the id is
# written once and bound to both names.
data_spreadsheetId = codebook_spreadsheetId = '1ZSZGAtYyGasHIbeCZYBvnN-Xv46hUs6-u5Larbqb0ag'

# Ranges to read: the 'Teads Data' sheet, and cells A3:J1000 of 'Teads Codebook'.
data_data_range, codebook_data_range = 'Teads Data', 'Teads Codebook!A3:J1000'

In [11]:
# Build an Authenticator from the key file, ask it for credentials covering
# SCOPES, and hand those credentials to the SheetManager used for the reads below.
# NOTE(review): Authenticator / SheetManager are project-local helpers; whether
# get_creds() opens an interactive OAuth flow or caches a token is not visible
# here — confirm before running unattended.
authenticator = Authenticator(keys)
creds = authenticator.get_creds(SCOPES)
manager = SheetManager(creds)

In [12]:
# Read the survey responses and the codebook through the sheet manager.
# NOTE(review): get_values() presumably returns a pandas DataFrame (both results
# are later used with DataFrame methods such as .copy()) — confirm in SheetManager.
data_df = manager.get_values(spreadsheetId=data_spreadsheetId,
                        data_range=data_data_range)

codebook_df = manager.get_values(spreadsheetId=codebook_spreadsheetId,
                                data_range=codebook_data_range)

In [13]:
# Preview only the first rows instead of rendering the whole response table.
data_df.head(10)

Unnamed: 0,SAMPLEID,ANSWERDATE,SEX,AGE,AGEID,PREFECTURE,AREA,MARRIED,CHILD,HINCOME,...,Q6_6,Q6_7,Q7,Q8_1,Q8_2,Q8_3,Q8_4,Q8_5,Q8_6,Q9
0,9804,2019/12/28 10:41,1,56,10,14,3,2,1,3,...,0,1,5,0,0,0,0,1,0,6
1,55020,2019/12/27 20:42,1,50,9,14,3,1,1,3,...,0,1,6,,,,,,,5
2,56237,2019/12/28 22:59,1,63,11,27,5,1,1,2,...,0,1,3,,,,,,,6
3,74972,2019/12/27 20:39,1,64,11,13,3,2,2,7,...,0,0,5,0,0,0,0,1,0,6
4,199003,2019/12/27 21:32,1,60,11,23,4,2,2,3,...,0,0,3,,,,,,,6
5,213703,2019/12/28 13:16,2,48,8,13,3,1,1,4,...,1,0,5,0,0,0,0,0,1,5
6,219383,2019/12/29 0:36,2,47,8,14,3,2,2,,...,1,0,3,,,,,,,5
7,221364,2019/12/28 21:34,2,34,5,23,4,1,1,2,...,0,1,5,1,1,0,1,0,0,3
8,254848,2019/12/29 9:03,2,45,8,28,5,1,1,10,...,0,1,5,0,0,0,0,1,0,5
9,286991,2019/12/28 21:57,2,39,6,14,3,1,1,,...,0,1,5,0,0,0,0,0,1,4


In [14]:
# Build a tidy codebook: one row per question ('Q') or answer choice ('A'),
# every row carrying the item name and label of the question it belongs to.
# `.replace` returns a new frame, so `codebook_df` itself is left untouched
# and this cell can be re-run safely.
cb_df = codebook_df.replace('', np.nan)

# create new column to indicate question or answer:
# rows that carry an 'Item name' are questions, all remaining rows are answers.
cb_df['Q/A'] = np.where(cb_df['Item name'].notna(), 'Q', 'A')

# we really only need Item name, label, Choice number, Question / choices, Q/A
cols = ['Item name', 'label', 'Choice number', 'Question / choices', 'Q/A']
# .copy() so the assignments below write to an independent frame rather than a
# slice of the wider one (avoids SettingWithCopyWarning).
cb_df = cb_df[cols].copy()

# Propagate each question's item name down onto the answer rows beneath it.
# (`.ffill()` replaces the deprecated `fillna(method='ffill')`.)
cb_df['Item name'] = cb_df['Item name'].ffill()

# Where no explicit label was given, fall back to the item name.
cb_df['label'] = cb_df['label'].fillna(cb_df['Item name'])

cb_df.head(10)

Unnamed: 0,Item name,label,Choice number,Question / choices,Q/A
0,SAMPLEID,SAMPLEID,,回答者ID,Q
1,ANSWERDATE,ANSWERDATE,,回答日時,Q
2,SEX,SEX,,性別,Q
3,SEX,SEX,1.0,男性,A
4,SEX,SEX,2.0,女性,A
5,AGE,AGE,,年齢(才),Q
6,AGEID,AGEID,,年齢,Q
7,AGEID,AGEID,1.0,12才未満,A
8,AGEID,AGEID,2.0,12才～19才,A
9,AGEID,AGEID,3.0,20才～24才,A


In [15]:
# Two-column view of the codebook: the label of each row next to its text
# (question wording on question rows, choice wording on answer rows).
label_and_text = ['label', 'Question / choices']
questions = cb_df.loc[:, label_and_text]
questions

Unnamed: 0,label,Question / choices
0,SAMPLEID,回答者ID
1,ANSWERDATE,回答日時
2,SEX,性別
3,SEX,男性
4,SEX,女性
5,AGE,年齢(才)
6,AGEID,年齢
7,AGEID,12才未満
8,AGEID,12才～19才
9,AGEID,20才～24才


In [17]:
def recode_SA_A(question_list, df=None, top_codes=(1, 2)):
    """Add a binary ``<question>_r`` column for each question in ``question_list``.

    A row gets 1 when its answer to the question is one of ``top_codes`` and
    0 otherwise (missing answers are recoded to 0 as well).

    Parameters
    ----------
    question_list : iterable of str
        Names of the answer columns to recode.
    df : pandas.DataFrame, optional
        Frame to recode; it is modified in place. When omitted, the
        notebook-level variable ``df`` is used, which keeps existing
        ``recode_SA_A(questions)`` calls working. Prefer passing the frame
        explicitly so the cell does not depend on hidden kernel state.
    top_codes : iterable, default (1, 2)
        Answer codes that count as a hit.

    Returns
    -------
    pandas.DataFrame
        The same frame that was recoded, to allow chaining.

    Raises
    ------
    NameError
        If ``df`` is omitted and no notebook-level ``df`` exists.
    KeyError
        If a name in ``question_list`` is not a column of the frame.
    """
    if df is None:
        # Legacy behaviour: fall back to the module-level `df`.
        try:
            df = globals()['df']
        except KeyError:
            raise NameError(
                "recode_SA_A: pass a DataFrame via `df=` or define a global `df`"
            ) from None

    codes = list(top_codes)
    for q in question_list:
        # int64 matches the dtype produced by the former `df[q + '_r'] = 0` setup.
        df[q + '_r'] = df[q].isin(codes).astype('int64')
    return df

In [16]:
# Start from a fresh copy so re-running this cell never compounds earlier
# edits and the raw responses in `data_df` stay untouched.
overall_df = data_df.copy()

questions_to_recode = ['Q2', 'Q3', 'Q4']

# recode_SA_A operates on the notebook-level `df` when called with only the
# question list, so bind that name to the working frame before calling it.
df = overall_df
recode_SA_A(questions_to_recode)

# Assign each respondent to a test group from CELL: <= 8 -> 'EXP', >= 9 -> 'CON'.
# NOTE(review): presumably EXP = exposed and CON = control — confirm the CELL
# coding against the codebook.
# Starting from None keeps the column object-typed; rows whose CELL matches
# neither rule stay unassigned and drop out of the counts below.
overall_df['group'] = None
overall_df.loc[overall_df['CELL'] <= 8, 'group'] = 'EXP'
overall_df.loc[overall_df['CELL'] >= 9, 'group'] = 'CON'

recoded_cols = [q + '_r' for q in questions_to_recode]
recoded_df = overall_df[recoded_cols + ['group']]

# TODO: a per-cut (subgroup) breakdown was started here but never implemented;
# only the overall comparison is computed.

# Number of respondents per group (denominators for the proportions).
con_base = int((recoded_df['group'] == 'CON').sum())
exp_base = int((recoded_df['group'] == 'EXP').sum())
assert con_base > 0 and exp_base > 0, "both CON and EXP groups need respondents"

# One row per recoded question, one column per group, holding the hit counts.
df = recoded_df.groupby('group').sum().transpose()
df['CON_base'] = con_base
df['EXP_base'] = exp_base

# Stored as fractions (0-1), not percentages, despite the '%' in the name.
df['CON_desired_%'] = df['CON'] / df['CON_base']
df['EXP_desired_%'] = df['EXP'] / df['EXP_base']

df['abs_lift_%'] = df['EXP_desired_%'] - df['CON_desired_%']

# Two-proportion z-test per question; ztest returns (statistic, p-value).
df['p-value'] = df.apply(
    lambda row: ztest([row['EXP'], row['CON']],
                      [row['EXP_base'], row['CON_base']])[1],
    axis=1,
)

df

group,CON,EXP,CON_base,EXP_base,CON_desired_%,EXP_desired_%,abs_lift_%,p-value
Q2_r,73,75,362,362,0.201657,0.207182,0.005525,0.853768
Q3_r,58,64,362,362,0.160221,0.176796,0.016575,0.551362
Q4_r,94,100,362,362,0.259669,0.276243,0.016575,0.614627


In [None]:
# Persist the current `df` (the lift / p-value table built in the cell above)
# as result.csv; the relative path resolves against the working directory.
df.to_csv("result.csv")