# Goal
1. Implement the data described in this [google doc](https://docs.google.com/document/d/1plhoDbQryYQ32vZMXu8YmlLSp30QTdup43k6uTePOT4/edit#heading=h.b2ilq31no707).
1. One thankee per row, everyone who was ever inputted into the thanker app. Items that were experiment id -3
    1. We need all the actions connected to those (where they are the object, maybe in metadatajson)
        1. Error status of those
1. Survey results from julia
1. Then need replica data on behaviour (with caching probably).

## Note
1. the thanker final data collector was written in the edit-sync repo as part of onboard_thankers. I'm switching it up because this is closer to the goal of making a dataCollector module for civilservant2.0

### connections
1. connecting to aws studies mysql on `3311`
    1. `ssh -N studies.cs -L 3311:localhost:3306`
2. connecting to wmf replicas on `3310`
    1. `ssh -N maximilianklein@tools-login.wmflabs.org -L 3310:enwiki.analytics.db.svc.eqiad.wmflabs:3306`


In [1]:
from civilservant.models.core import ExperimentThing, ExperimentAction
from thanks.utils import _get_experiment_id
from civilservant.util import read_config_file
import os
from civilservant.db import init_session, init_engine
from sqlalchemy.dialects import mysql
import pandas as pd
import datetime
import uuid
# Directory holding cached intermediate data; the pickles loaded below live under it.
CACHEDIR='/data/project/cache'
# Root of the gratitude-study project folder; the final CSVs are written beneath it.
TRESORDIR='/home/paprika/Tresors/CivilServant/projects/wikipedia-integration/gratitude-study/'

In [2]:
# Load the per-thankee frame (judging by the file name: survey answers joined with thankee
# actions) and the account map from the 'thankee-misc' cache folder.
# NOTE(review): unpickling executes arbitrary code — only load files from a trusted location.
df = pd.read_pickle(os.path.join(CACHEDIR, 'thankee-misc', 'survey_and_thankee_actions.pickle'))
acct_map = pd.read_pickle(os.path.join(CACHEDIR, 'thankee-misc', 'acct_map.pickle'))

## assign behavior_start_dt
1. based on first_thank_dt or that of block partner

In [3]:
# Row masks: rows with a non-negative randomization block id, and rows that have a first thank date.
positive_block_id = df['randomization_block_id'].ge(0)
non_null_thank_dt = df['first_thank_dt'].notnull()

# (block id, first thank date) pairs for every row satisfying both masks.
block_thank_rel = df.loc[positive_block_id & non_null_thank_dt,
                         ['randomization_block_id', 'first_thank_dt']]

In [4]:
# Map randomization_block_id -> first_thank_dt of the thanked member of that block.
block_first_thank = block_thank_rel.set_index('randomization_block_id')['first_thank_dt'].to_dict()

In [5]:
def get_behavior_start_dt(row):
    """Pick the date from which a thankee's behaviour is observed.

    Preference order:
      1. the row's own ``first_thank_dt`` when it is not null;
      2. the ``first_thank_dt`` recorded in ``block_first_thank`` for the row's
         ``randomization_block_id``;
      3. the row's ``created_dt`` when the block lookup raises ``KeyError``.
    """
    own_thank_dt = row.loc['first_thank_dt']
    if pd.notnull(own_thank_dt):
        return own_thank_dt

    # No thank of their own: borrow the block partner's date if one is known.
    try:
        partner_thank_dt = block_first_thank[row.loc['randomization_block_id']]
    except KeyError:
        # Nobody in the block was thanked: fall back onto the ET created date.
        return row.loc['created_dt']
    return partner_thank_dt

# Row-wise: resolve the observation start date for every thankee.
df['behavior_start_dt'] = df.apply(get_behavior_start_dt, axis=1)

In [6]:
# Sanity check: every row must have ended up with a start date.
assert not pd.isnull(df['behavior_start_dt']).any()

## Getting external data

1. labor.hours.per.day.diff 
    1. participants$labor.hours.per.day.post.treatment -participants$labor.hours.per.day.pre.treatment
    2. 42 days before and after the thank
2. two.week.retention
    1. Whether an account made an edit any-namespace between day 8 and 42 after they received a thank
3. thanks.sent
    1. count variable indicating the number of thanks sent by this account to other Wikipedians in the 42 day period after receiving the intervention.
4. registration date
    1. account age at created_dt
    1. account age at first_thank_dt


In [7]:
from civilservant.wikipedia.queries.revisions import get_timestamps_within_range
from civilservant.wikipedia.queries.user_interactions import get_thanks_sending
from civilservant.wikipedia.queries.users import get_user_basic_data, get_user_disablemail_property
from civilservant.wikipedia.utils import make_cached_df, make_sessions, calc_labour_hours,\
                                            to_wmftimestamp, from_wmftimestamp, bin_from_td
from civilservant.wikipedia.connections.database import make_wmf_con

# Connection handle handed to the query helpers used in the cells below.
wmf_con = make_wmf_con()

# Length of the observation window applied on either side of behavior_start_dt (42 days).
LABOR_HOURS_OBS_WINDOW = datetime.timedelta(days=42)

@make_cached_df('grat-thankee-timestamps')
def get_user_edits_before_and_after_obs(lang, user_name, thank_date):
    """Fetch a user's edit timestamps around ``thank_date``.

    The queried range runs from ``thank_date - LABOR_HOURS_OBS_WINDOW`` to
    ``thank_date + LABOR_HOURS_OBS_WINDOW`` and is delegated to
    ``get_timestamps_within_range`` over the module-level ``wmf_con``.
    """
    return get_timestamps_within_range(
        lang=lang,
        start_date=thank_date - LABOR_HOURS_OBS_WINDOW,
        end_date=thank_date + LABOR_HOURS_OBS_WINDOW,
        user_name=user_name,
        con=wmf_con,
    )
    

In [8]:
# Debug aid: uncomment the slice below to try the pipeline on the first 1000 rows only;
# leave it commented out for the full run.
# df = df[:1000]
print(f'data frem length: {len(df)}')

data frem length: 15958


In [9]:
# One timestamp frame per thankee, covering the window on both sides of behavior_start_dt.
df['labor_hours_ts_df'] = df.apply(
    lambda row: get_user_edits_before_and_after_obs(
        row['lang'], row['user_name'], row['behavior_start_dt']
    ),
    axis=1,
)

In [10]:
def num_labor_hours(before_after, behavior_start_dt, ts_df):
    """Labor hours in the observation window before or after ``behavior_start_dt``.

    ``before_after == 'before'`` selects the window ending at ``behavior_start_dt``;
    any other value selects the window starting there. Each window is
    ``LABOR_HOURS_OBS_WINDOW`` long, open at its start and closed at its end.
    Returns 0 when no ``rev_timestamp`` falls inside, otherwise whatever
    ``calc_labour_hours`` returns for the timestamps in the window.
    """
    if before_after == 'before':
        start_dt = behavior_start_dt - LABOR_HOURS_OBS_WINDOW
        end_dt = behavior_start_dt
    else:
        start_dt = behavior_start_dt
        end_dt = behavior_start_dt + LABOR_HOURS_OBS_WINDOW

    rev_ts = ts_df['rev_timestamp']
    in_window = ts_df[(rev_ts > start_dt) & (rev_ts <= end_dt)]
    if len(in_window) == 0:
        return 0

    # Convert the raw numpy values back to pandas timestamps for the helper.
    window_dts = [pd.to_datetime(np_dt) for np_dt in in_window['rev_timestamp'].values]
    return calc_labour_hours(window_dts)

In [11]:
# Total labor hours in the window before and in the window after behavior_start_dt.
df['labor_hours_pre_treatment'] = df.apply(
    lambda row: num_labor_hours('before', row['behavior_start_dt'], row['labor_hours_ts_df']),
    axis=1,
)
df['labor_hours_post_treatment'] = df.apply(
    lambda row: num_labor_hours('after', row['behavior_start_dt'], row['labor_hours_ts_df']),
    axis=1,
)

# Normalise by the window length in days, then take post minus pre.
df['labor_hours_per_day_pre_treatment'] = df['labor_hours_pre_treatment'].div(LABOR_HOURS_OBS_WINDOW.days)
df['labor_hours_per_day_post_treatment'] = df['labor_hours_post_treatment'].div(LABOR_HOURS_OBS_WINDOW.days)
df['labor_hours_per_day_diff'] = df['labor_hours_per_day_post_treatment'].sub(
    df['labor_hours_per_day_pre_treatment']
)

In [12]:
# Quick look: average post-minus-pre change in labor hours per day across all rows.
df['labor_hours_per_day_diff'].mean()

-0.006875556188831662

In [13]:
def two_week_retention(behavior_start_dt, ts_df):
    """Return True when ``ts_df`` has at least one ``rev_timestamp`` in the retention window.

    The window opens 7 days after ``behavior_start_dt`` (exclusive) and closes
    ``LABOR_HOURS_OBS_WINDOW`` after it (inclusive).
    """
    window_open = behavior_start_dt + datetime.timedelta(days=7)
    window_close = behavior_start_dt + LABOR_HOURS_OBS_WINDOW

    rev_ts = ts_df['rev_timestamp']
    retained_edits = ts_df[(rev_ts > window_open) & (rev_ts <= window_close)]
    return len(retained_edits) > 0

In [14]:
# Flag each thankee who edited inside the retention window.
df['two_week_retention'] = df.apply(
    lambda row: two_week_retention(row['behavior_start_dt'], row['labor_hours_ts_df']),
    axis=1,
)

In [15]:
# Quick look: share of rows flagged as retained.
df['two_week_retention'].mean()

0.2507206416844216

In [16]:
@make_cached_df('grat-thankees-thanks-sent')
def get_subsequent_thanks_sent(lang, user_name, behavior_start_dt):
    """Fetch the thanks ``user_name`` sent after ``behavior_start_dt``.

    Queries ``get_thanks_sending`` for the span from ``behavior_start_dt`` to
    ``behavior_start_dt + LABOR_HOURS_OBS_WINDOW`` over the module-level ``wmf_con``
    and returns its result unchanged.
    """
    window_end = behavior_start_dt + LABOR_HOURS_OBS_WINDOW
    return get_thanks_sending(lang, user_name, behavior_start_dt, window_end, wmf_con)

In [17]:
# Per-thankee frame of thanks sent in the post-treatment window.
df['thanks_sent_df'] = df.apply(
    lambda row: get_subsequent_thanks_sent(
        row['lang'], row['user_name'], row['behavior_start_dt']
    ),
    axis=1,
)
# The count variable is simply the length of that frame.
df['thanks_sent'] = df['thanks_sent_df'].apply(len)

In [18]:
# Quick look: average number of thanks sent per row.
df['thanks_sent'].mean()

0.16299034966787818

In [19]:
@make_cached_df('grat-thankee-user-basic-data')
def get_user_basic(lang, user_name):
    """Fetch the basic user record for ``user_name`` on wiki ``lang``.

    Thin wrapper around ``get_user_basic_data`` using the module-level ``wmf_con``.
    Code further down reads ``user_registration`` and ``user_id`` from the result
    and treats an empty result as a missing account.
    """
    return get_user_basic_data(lang=lang, user_name=user_name, wmf_con=wmf_con)

def user_registration_dt_from_basic(user_basic_df):
    """Return the first ``user_registration`` value, or NaN when the frame is empty."""
    if len(user_basic_df) > 0:
        return user_basic_df['user_registration'].iloc[0]
    return float('nan')

def user_id_from_basic(user_basic_df):
    """Return the first ``user_id`` value, or NaN when the frame is empty."""
    if len(user_basic_df) > 0:
        return user_basic_df['user_id'].iloc[0]
    return float('nan')

def account_age_at_assignment(created_dt, registration_dt):
    """Bin the account age at ``created_dt``.

    Returns ``bin_from_td(created_dt - registration_dt)``; a null
    ``registration_dt`` is passed straight back unchanged.
    """
    if pd.notnull(registration_dt):
        return bin_from_td(created_dt - registration_dt)
    return registration_dt

def account_age_at_treatment(behavior_start_dt, registration_dt):
    """Bin the account age at ``behavior_start_dt``.

    Returns ``bin_from_td(behavior_start_dt - registration_dt)``; a null
    ``registration_dt`` is passed straight back unchanged.
    """
    if pd.notnull(registration_dt):
        return bin_from_td(behavior_start_dt - registration_dt)
    return registration_dt

def year(registration_dt):
    """Return the year of ``registration_dt``; a null value is returned unchanged."""
    if pd.notnull(registration_dt):
        return registration_dt.year
    return registration_dt

In [20]:
# Look up each thankee's basic account record.
df['user_basic_data'] = df.apply(lambda row: get_user_basic(row['lang'], row['user_name']), axis=1)

# Registration date and user id; both are NaN when the lookup came back empty.
df['user_registration_dt'] = df['user_basic_data'].apply(user_registration_dt_from_basic)
df['user_id'] = df['user_basic_data'].apply(user_id_from_basic)

# Binned account age at assignment (created_dt) ...
df['prev_experience_assignment'] = df.apply(
    lambda row: account_age_at_assignment(row['created_dt'], row['user_registration_dt']),
    axis=1,
)
# ... and at treatment (behavior_start_dt). This now calls the dedicated treatment helper
# instead of reusing the assignment one, so the column and the helper stay paired.
df['prev_experience_treatment'] = df.apply(
    lambda row: account_age_at_treatment(row['behavior_start_dt'], row['user_registration_dt']),
    axis=1,
)

# Registration year; stays null when the registration date is unknown.
df['year'] = df['user_registration_dt'].apply(year)

In [21]:
@make_cached_df('grat-thankee-has-email')
def get_has_disablemail_df(lang, user_id):
    """Return the disablemail-property frame for ``user_id`` on wiki ``lang``.

    The result of ``get_user_disablemail_property`` is returned as-is; the
    caller treats an empty frame as "email not disabled".
    """
    return get_user_disablemail_property(lang, user_id, wmf_con=wmf_con)
    
def get_has_email(lang, user_id):
    """Tell whether the user can still receive email.

    A null ``user_id`` is returned unchanged. Otherwise the result is True when
    the disablemail lookup comes back empty and False when it has any rows.
    """
    if pd.isnull(user_id):
        return user_id
    return len(get_has_disablemail_df(lang, user_id)) == 0

In [22]:
# Row-wise email flag; stays null for rows whose user_id is null.
df['has_email'] = df.apply(lambda row: get_has_email(row['lang'], row['user_id']), axis=1)

# Compliance - app
1. noncompliant if don't have a user_registration_date
2. block partners of removed users

In [23]:
# Check the row/column counts before adding the compliance flags.
df.shape

(15958, 42)

In [24]:
# Reasons a thankee did not end up with exactly one thank.
# No thank recorded and the thankee was skipped at least once.
df['thanks_not_received_skipped'] = df['first_thank_dt'].isnull() & df['num_skips'].gt(0)
# No thank recorded, never skipped and never errored.
df['thanks_not_received_not_seen'] = (
    df['first_thank_dt'].isnull() & df['num_skips'].eq(0) & df['num_errors'].eq(0)
)
# No thank recorded and at least one error.
df['thanks_not_received_error'] = df['first_thank_dt'].isnull() & df['num_errors'].gt(0)
# The basic-data lookup came back empty.
df['thanks_not_received_user_deleted'] = df['user_basic_data'].apply(len).eq(0)
# More than one thank was recorded.
df['received_multiple_thanks'] = df['num_thanks'].gt(1)

In [25]:
# App-level complier: none of the non-compliance flags is set.
df['complier_app_any_reason'] = ~(
    df['thanks_not_received_skipped']
    | df['thanks_not_received_not_seen']
    | df['thanks_not_received_error']
    | df['thanks_not_received_user_deleted']
    | df['received_multiple_thanks']
)

# Compliance Survey

In [26]:
# Survey complier: both of these survey answers are present.
df['complier'] = (
    df['wikipedians.value.contributions'].notnull()
    & df['community.friendly'].notnull()
)

# Output 

In [27]:
# Columns written to the final CSV, in output order (dotted names).
OUTPUT_COLS = [
    # identifiers
    'private.anonymous.id',
    'randomization.block.id',
    # behavioural outcomes
    'labor.hours.per.day.diff',
    'two.week.retention',
    'thanks.sent',
    # survey outcomes and survey compliance
    'wikipedians.value.contributions',
    'community.friendly',
    'complier',
    # account covariates
    'lang',
    'prev.experience.assignment',
    'prev.experience.treatment',
    'year',
    'has.email',
    # remaining survey items
    'remembered.thanks',
    'overall.exp',
    'social.value.1',
    'social.value.3',
    'social.value.4',
    'social.warmth.2',
    'social.warmth.3',
    # assignment and app-level compliance
    'randomization.arm',
    'number.thanks.received',
    'number.skips.received',
    'thanks.not.received.skipped',
    'thanks.not.received.not.seen',
    'thanks.not.received.error',
    'thanks.not.received.user.deleted',
    'received.multiple.thanks',
    'complier.app.any.reason',
]

In [28]:
# Inspect the current (underscore-style) column names before they are renamed.
df.columns

Index(['created_dt', 'randomization_arm', 'randomization_condition',
       'removed_dt', 'metadata_json', 'lang', 'user_name',
       'randomization_block_id', 'num_errors', 'num_skips', 'first_thank_dt',
       'num_thanks', 'num_messages', 'consent', 'overall.exp',
       'social.value.1', 'wikipedians.value.contributions', 'social.value.3',
       'social.value.4', 'community.friendly', 'social.warmth.2',
       'social.warmth.3', 'community', 'remembered.thanks',
       'private_anonymous_id', 'behavior_start_dt', 'labor_hours_ts_df',
       'labor_hours_pre_treatment', 'labor_hours_post_treatment',
       'labor_hours_per_day_pre_treatment',
       'labor_hours_per_day_post_treatment', 'labor_hours_per_day_diff',
       'two_week_retention', 'thanks_sent_df', 'thanks_sent',
       'user_basic_data', 'user_registration_dt', 'user_id',
       'prev_experience_assignment', 'prev_experience_treatment', 'year',
       'has_email', 'thanks_not_received_skipped',
       'thanks_not_rece

In [29]:
# Switch every column name from underscores to dots so they match the OUTPUT_COLS naming.
r_col_names = [cname.replace('_','.') for cname in df.columns]
df.columns = r_col_names

In [30]:
# Two columns need a different final name than their dotted form; map them explicitly.
col_rename = {'num.skips':'number.skips.received',
             'num.thanks':'number.thanks.received'}
df = df.rename(columns=col_rename)

In [31]:
# One boolean per OUTPUT_COLS entry: is that column present in df? Then confirm all are.
output_col_present = [oc in df.columns for oc in OUTPUT_COLS]
all(output_col_present)

True

In [32]:
# Show each output column next to its presence flag, to spot any missing one by name.
list(zip(OUTPUT_COLS, output_col_present))

[('private.anonymous.id', True),
 ('randomization.block.id', True),
 ('labor.hours.per.day.diff', True),
 ('two.week.retention', True),
 ('thanks.sent', True),
 ('wikipedians.value.contributions', True),
 ('community.friendly', True),
 ('complier', True),
 ('lang', True),
 ('prev.experience.assignment', True),
 ('prev.experience.treatment', True),
 ('year', True),
 ('has.email', True),
 ('remembered.thanks', True),
 ('overall.exp', True),
 ('social.value.1', True),
 ('social.value.3', True),
 ('social.value.4', True),
 ('social.warmth.2', True),
 ('social.warmth.3', True),
 ('randomization.arm', True),
 ('number.thanks.received', True),
 ('number.skips.received', True),
 ('thanks.not.received.skipped', True),
 ('thanks.not.received.not.seen', True),
 ('thanks.not.received.error', True),
 ('thanks.not.received.user.deleted', True),
 ('received.multiple.thanks', True),
 ('complier.app.any.reason', True)]

In [35]:
!ls $TRESORDIR

'Data Drills'	'research materials'	    thanks_love_counts_2017
 datasets	 thankable_revisions_task   thanks-love-records-07.2018
'gdpr notices'	 thanker_surveys
 report-drafts	 thanking_paper_prototype


In [38]:
# Destination folder (relative to TRESORDIR) and file name for the analysis dataset.
out_dir = 'Data Drills/thankee/post_experiment_analysis'
out_fname = 'grat-thankee-all-pre-post-treatment-vars.csv'
# Write the account map alongside the dataset, then the dataset restricted to OUTPUT_COLS.
# index=False keeps the pandas row index out of both CSVs.
acct_map.to_csv(os.path.join(TRESORDIR, out_dir,'acct_map.csv'), index=False)
df[OUTPUT_COLS].to_csv(os.path.join(TRESORDIR, out_dir, out_fname), index=False)