In [1]:
import numpy as np
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest as ztest

from authentication.authenticator import Authenticator
from sheets.sheetmanager import SheetManager

In [2]:
# Path to the Google API credentials file and the OAuth scopes requested.
# NOTE(review): the full Drive scope is broad; a Sheets-only scope may be
# enough for this notebook — confirm against what Authenticator expects.
keys = 'credentials.json'
SCOPES = ['https://www.googleapis.com/auth/drive']

# The raw data, the codebook and the results tab all live in one spreadsheet.
TEADS_SPREADSHEET_ID = '1ZSZGAtYyGasHIbeCZYBvnN-Xv46hUs6-u5Larbqb0ag'

# Raw survey responses.
data_spreadsheetId = TEADS_SPREADSHEET_ID
data_data_range = 'Teads Data'

# Codebook (question / answer-choice definitions).
codebook_spreadsheetId = TEADS_SPREADSHEET_ID
codebook_data_range = 'Teads Codebook!A3:J1000'

# Destination tab for the computed results.
output_spreadsheetId = TEADS_SPREADSHEET_ID
output_data_range = 'Teads Results'

In [3]:
# Obtain credentials for the requested scopes from the key file, then build
# the sheet client that the read cells below call.
# NOTE(review): presumably get_creds may trigger an interactive OAuth flow or
# read a cached token — verify in authentication.authenticator.
authenticator = Authenticator(keys)
creds = authenticator.get_creds(SCOPES)
manager = SheetManager(creds)

In [4]:
# Pull the raw survey responses and the codebook from the spreadsheet.
# Both results are used as pandas DataFrames further down (.columns, .copy()).
data_df = manager.get_values(spreadsheetId=data_spreadsheetId,
                        data_range=data_data_range)

# The codebook range starts at row 3 of its tab ('Teads Codebook!A3:J1000').
codebook_df = manager.get_values(spreadsheetId=codebook_spreadsheetId,
                                data_range=codebook_data_range)

In [5]:
# Inspect which columns the raw survey data contains.
data_df.columns

Index(['SAMPLEID', 'ANSWERDATE', 'SEX', 'AGE', 'AGEID', 'PREFECTURE', 'AREA',
       'MARRIED', 'CHILD', 'HINCOME', 'PINCOME', 'JOB', 'CELL', 'CELLNAME',
       'Q1', 'Q2', 'Q3', 'Q4', 'Q5_1', 'Q5_2', 'Q5_3', 'Q5_4', 'Q6_1', 'Q6_2',
       'Q6_3', 'Q6_4', 'Q6_5', 'Q6_6', 'Q6_7', 'Q7', 'Q8_1', 'Q8_2', 'Q8_3',
       'Q8_4', 'Q8_5', 'Q8_6', 'Q9'],
      dtype='object')

In [6]:
# Clean the codebook into a tidy lookup table.
# `replace` returns a new frame, so the raw `codebook_df` is left untouched;
# empty cells from the sheet become NaN so the isna/notna tests below work.
cb_df = codebook_df.replace('', np.nan)

# Flag each row as a question ('Q') or an answer choice ('A').
# Rows that carry an item name are questions; the rows without one that
# follow are that question's choices. This must be computed BEFORE the
# forward-fill below, which gives every row an item name.
cb_df['Q/A'] = np.where(cb_df['Item name'].notna(), 'Q', 'A')

# we really only need Item name, label, Choice number, Question / choices, Q/A
cols = ['Item name', 'label', 'Choice number', 'Question / choices', 'Q/A']
# Explicit copy so the assignments below write to an independent frame
# instead of a slice (avoids SettingWithCopyWarning).
cb_df = cb_df[cols].copy()

# Propagate each question's item name down onto its answer-choice rows.
cb_df['Item name'] = cb_df['Item name'].ffill()

# Where no label was given, fall back to the item name.
missing_label = cb_df['Item name'].notna() & cb_df['label'].isna()
cb_df.loc[missing_label, 'label'] = cb_df.loc[missing_label, 'Item name']

cb_df

Unnamed: 0,Item name,label,Choice number,Question / choices,Q/A
0,SAMPLEID,SAMPLEID,,回答者ID,Q
1,ANSWERDATE,ANSWERDATE,,回答日時,Q
2,SEX,SEX,,性別,Q
3,SEX,SEX,1.0,男性,A
4,SEX,SEX,2.0,女性,A
5,AGE,AGE,,年齢(才),Q
6,AGEID,AGEID,,年齢,Q
7,AGEID,AGEID,1.0,12才未満,A
8,AGEID,AGEID,2.0,12才～19才,A
9,AGEID,AGEID,3.0,20才～24才,A


In [7]:
# Two-column lookup: codebook label -> question / answer-choice text.
questions = cb_df.loc[:, ['label', 'Question / choices']]
questions

Unnamed: 0,label,Question / choices
0,SAMPLEID,回答者ID
1,ANSWERDATE,回答日時
2,SEX,性別
3,SEX,男性
4,SEX,女性
5,AGE,年齢(才)
6,AGEID,年齢
7,AGEID,12才未満
8,AGEID,12才～19才
9,AGEID,20才～24才


Create an "Overall" awareness flag (Q5): 1 if the respondent answered 1 to any of Q5_1, Q5_2 or Q5_3, otherwise 0.

In [8]:
# Overall awareness: 1 when any of Q5_1..Q5_3 was answered with code 1, else 0.
# (Q5_4 is deliberately not part of the flag, as in the original logic.)
data_df['Q5'] = (
    data_df[['Q5_1', 'Q5_2', 'Q5_3']]
    .eq(1)
    .any(axis=1)
    .astype('int64')
)

In [9]:
# For each question, the answer codes that are counted as a hit when the
# question is recoded to a 0/1 indicator (reported downstream as "desired").
# NOTE(review): the meaning of the individual codes (e.g. 1/2 being the top
# two boxes) comes from the codebook — confirm there.
recode_dic = {
    'Q2': [1, 2],
    'Q3': [1, 2],
    'Q4': [1, 2],
    'Q5': [1],
    'Q5_1': [1],
    'Q5_2': [1],
    'Q5_3': [1],
    'Q6_1': [1],
    'Q7': [1, 2],
    'Q8_1': [1],
    'Q8_2': [1],
    'Q8_3': [1],
    'Q8_4': [1],
    'Q8_5': [1],
}


def recode(df, mapping=None):
    """Add a 0/1 ``<question>_r`` indicator column for each recoded question.

    The frame is modified in place: for every question ``q`` in ``mapping``
    a column ``q + '_r'`` is created (or overwritten) holding 1 where
    ``df[q]`` is one of the listed codes and 0 everywhere else, including
    missing answers.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey data; must contain a column for every key of ``mapping``.
    mapping : dict[str, list], optional
        Question name -> answer codes that count as 1. Defaults to the
        module-level ``recode_dic``.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a question in ``mapping`` is not a column of ``df``.
    """
    if mapping is None:
        mapping = recode_dic
    for q, hit_codes in mapping.items():
        # isin() is False for NaN, so unanswered rows are recoded to 0.
        df[q + '_r'] = df[q].isin(hit_codes).astype('int64')

In [10]:
# Work on a copy so the recoded columns are not added to the raw data frame.
overall_df = data_df.copy()

# Questions that get a 0/1 '<question>_r' column; reused by the results loop.
questions_to_recode = recode_dic.keys()

recode(overall_df)

# Assign each respondent to the exposed (EXP) or control (CON) group by CELL.
# The column is created with object dtype up front: starting from a float NaN
# column and then writing strings into it is an incompatible-dtype upcast that
# newer pandas versions warn about / reject.
# Rows whose CELL is missing (or strictly between 8 and 9) stay NaN and are
# therefore left out of both groups downstream.
# NOTE(review): the 8/9 cut-off presumably follows the study's cell design —
# confirm against the codebook.
overall_df['group'] = pd.Series(np.nan, index=overall_df.index, dtype=object)
overall_df.loc[overall_df['CELL'] <= 8, 'group'] = 'EXP'
overall_df.loc[overall_df['CELL'] >= 9, 'group'] = 'CON'

# Registry of analysis cuts: cut name -> subset of overall_df.
cuts_dic = {
    'Overall': overall_df
}

In [11]:
# --- Sex cuts (codebook: 1 = male, 2 = female) ---
sex_codes = overall_df['SEX']
male_df = overall_df[sex_codes.eq(1)]
female_df = overall_df[sex_codes.eq(2)]

# --- Age bands, both ends inclusive ---
age_years = overall_df['AGE']
age25_34_df = overall_df[age_years.between(25, 34)]
age35_44_df = overall_df[age_years.between(35, 44)]
age45_54_df = overall_df[age_years.between(45, 54)]
age55_64_df = overall_df[age_years.between(55, 64)]

# --- Region: prefecture codes 13 and 27 versus everything else ---
is_tokyo_osaka = overall_df['PREFECTURE'].isin([13, 27])
tokyo_osaka_df = overall_df[is_tokyo_osaka]
excl_tokyo_osaka_df = overall_df[~is_tokyo_osaka]

# --- Respondents who answered 1 on Q8_6 ---
nobarriers_df = overall_df[overall_df['Q8_6'].eq(1)]

# --- Respondents who answered 1 or 2 on Q7 ---
usageintent_df = overall_df[overall_df['Q7'].isin([1, 2])]

# Register every cut under its display name (insertion order is the report order).
cuts_dic.update({
    'Male': male_df,
    'Female': female_df,
    '25-34': age25_34_df,
    '35-44': age35_44_df,
    '45-54': age45_54_df,
    '55-64': age55_64_df,
    'Tokyo + Osaka': tokyo_osaka_df,
    'Other Prefectures': excl_tokyo_osaka_df,
    'No Barriers': nobarriers_df,
    'Usage Intent': usageintent_df,
})

In [12]:
def _two_proportion_pvalue(row):
    """Return the two-proportion z-test p-value (EXP vs CON) for one question row.

    ``row`` must provide 'EXP' / 'CON' (number of desired answers) and
    'EXP_base' / 'CON_base' (group sizes).

    Degenerate comparisons return NaN without running the test: an empty
    group, or a pooled proportion of exactly 0 or 1 (nobody / everybody gave
    the desired answer in both groups). The z-test has a zero standard error
    there and would also yield NaN, but only after a 0/0 division that emits
    RuntimeWarnings.
    """
    counts = [row['EXP'], row['CON']]
    bases = [row['EXP_base'], row['CON_base']]
    if min(bases) == 0:
        return np.nan
    pooled = sum(counts) / sum(bases)
    if pooled in (0, 1):
        return np.nan
    return ztest(counts, bases)[1]


# Build one results table per cut, then stack them once at the end
# (concatenating inside the loop re-copies the accumulated frame every pass).
recoded_cols = [q + '_r' for q in questions_to_recode]
cut_tables = []
for cut, cut_df in cuts_dic.items():
    df = cut_df[[*recoded_cols, 'group']]

    # Group sizes (denominators) for this cut.
    con_base = len(df[df['group'] == 'CON'])
    exp_base = len(df[df['group'] == 'EXP'])

    # Rows = recoded questions, columns = CON / EXP counts of desired answers.
    df = df.groupby('group').sum().transpose()
    df['CON_base'] = con_base
    df['EXP_base'] = exp_base

    df['CON_desired_%'] = df['CON']/df['CON_base']
    df['EXP_desired_%'] = df['EXP']/df['EXP_base']

    # Absolute lift: exposed share minus control share.
    df['abs_lift_%'] = df['EXP_desired_%'] - df['CON_desired_%']

    df['p-value'] = df.apply(_two_proportion_pvalue, axis=1)
    df['Cut'] = cut

    cut_tables.append(df)

# pd.concat rejects an empty list, so keep the empty-frame result in that case.
output = pd.concat(cut_tables) if cut_tables else pd.DataFrame()
print(output)

group   CON  EXP  CON_base  EXP_base  CON_desired_%  EXP_desired_%  \
Q2_r     73   75       362       362       0.201657       0.207182   
Q3_r     58   64       362       362       0.160221       0.176796   
Q4_r     94  100       362       362       0.259669       0.276243   
Q5_r    114  117       362       362       0.314917       0.323204   
Q5_1_r   43   42       362       362       0.118785       0.116022   
Q5_2_r   53   54       362       362       0.146409       0.149171   
Q5_3_r   54   57       362       362       0.149171       0.157459   
Q6_1_r  130  134       362       362       0.359116       0.370166   
Q7_r     61   55       362       362       0.168508       0.151934   
Q8_1_r   19   19       362       362       0.052486       0.052486   
Q8_2_r   13   14       362       362       0.035912       0.038674   
Q8_3_r   26   22       362       362       0.071823       0.060773   
Q8_4_r   21   16       362       362       0.058011       0.044199   
Q8_5_r   36   50    

  zstat = value / std_diff
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [13]:
#output = overall_df.reset_index()
#print(output)
#values = overall_df.columns.to_list()
#values.extend(output.values)

#values


#manager.update_values(spreadsheetId=output_spreadsheetId,
#                     update_range=output_data_range,
#                     values=values)

In [14]:
# Save the combined results table to output.csv (relative path). The index,
# which holds the recoded question names such as Q2_r, is written as the
# first column.
output.to_csv('output.csv')

In [15]:
# Distribution of Q7 answers within the "No Barriers" cut (Q8_6 == 1).
nobarriers_df['Q7'].value_counts()

5    153
4     41
Name: Q7, dtype: int64