# Goal
1. Get the experimentthings and actions related to the experiment ID
2. add activation rate variables
    1. simply, or think about batched/ cached way
3. later: compute statistical difference

### connections
1. connecting to gratsync mysql on 3311
2. connecting to wmf replicas on 3310

In [1]:
from civilservant.models.core import ExperimentThing, ExperimentAction
from thanks.utils import _get_experiment_id
from civilservant.util import read_config_file
import os
from civilservant.db import init_session, init_engine
from sqlalchemy.dialects import mysql
import pandas as pd
import datetime

In [2]:
# Session and engine from civilservant.db: `db` is used below to build queries
# (db.query / db.execute) and `con` is handed to pandas.read_sql.
db = init_session()
con = init_engine()

In [3]:
db.execute('show databases;').fetchall()

[('civilservant_general_production',), ('information_schema',)]

In [4]:
con

Engine(mysql://civilservant:***@127.0.0.1:3311/civilservant_general_production?charset=utf8mb4)

In [5]:
# Read the config file named by the CS_EXTRA_CONFIG_FILE environment variable;
# the second argument is the absolute path of the current working directory.
cnf = read_config_file(os.environ['CS_EXTRA_CONFIG_FILE'], os.path.abspath(''))

In [6]:
# Resolve the experiment id from the configured experiment name (cnf['name']).
# NOTE(review): presumably return_id=True makes the helper return the bare id — confirm.
experiment_id = _get_experiment_id(db, cnf['name'], return_id=True)

In [7]:
experiment_id

-15

In [51]:
# ExperimentThing rows of this experiment: ids, creation time, assigned arm and
# the randomization block id that is stored inside the JSON metadata.
# The JSON-extracted column is labelled explicitly so that the DataFrame column
# name does not depend on SQLAlchemy's auto-generated anonymous label ('anon_1').
ETs_q = (
    db.query(
        ExperimentThing.id,
        ExperimentThing.thing_id,
        ExperimentThing.created_dt,
        ExperimentThing.randomization_arm,
        ExperimentThing.metadata_json['randomization_block_id'].label('randomization_block_id'),
    )
    .filter_by(experiment_id=experiment_id)
)

In [52]:
# Compile the query against the MySQL dialect so it can be passed to pandas.read_sql.
ETs_sql = ETs_q.statement.compile(dialect=mysql.dialect())

In [55]:
# Load the experiment things into a DataFrame and give the JSON-extracted
# column (auto-named 'anon_1' by SQLAlchemy) a meaningful name.
ET_df = pd.read_sql(
    ETs_sql,
    con=con,
    params={"experiment_id": experiment_id},
).rename(columns={'anon_1': 'randomization_block_id'})

In [56]:
ET_df

Unnamed: 0,id,thing_id,created_dt,randomization_arm,randomization_block_id
0,user:fr:-Atlantique85-,22651,2020-02-22 18:56:08,2,2160
1,user:fr:-pi.tor-reip,24139,2020-02-24 13:30:10,1,2291
2,user:fr:-Urmbels-,19582,2020-02-19 14:38:09,2,1870
3,"user:fr:, kj uitk,uitkui",24091,2020-02-24 12:38:08,0,2286
4,"user:fr:,jg, ghj,nutjhn",806,2020-01-31 16:14:06,2,76
...,...,...,...,...,...
19405,user:fr:担,22205,2020-02-22 10:02:09,2,2116
19406,user:fr:杨肥羊爱肥肠,17619,2020-02-17 15:32:07,0,1686
19407,user:fr:王哪里去,7222,2020-02-06 21:58:06,1,704
19408,user:fr:賀課,8257,2020-02-08 02:06:06,2,800


In [57]:
# talk_page_message actions recorded for this experiment: the object acted on,
# when the action was created, and its JSON metadata.
EAs_q = (
    db.query(
        ExperimentAction.action_object_id,
        ExperimentAction.created_dt,
        ExperimentAction.metadata_json,
    )
    .filter_by(experiment_id=experiment_id, action='talk_page_message')
)

In [58]:
# Compile the actions query against the MySQL dialect for pandas.read_sql.
EAs_sql = EAs_q.statement.compile(dialect=mysql.dialect())

In [59]:
# One row per talk_page_message action; metadata_json comes back as a dict per row
# (see the sample printed in the next cell).
EA_df = pd.read_sql(EAs_sql, params={"experiment_id":experiment_id}, con=con)

In [60]:
EA_df.iloc[0]['metadata_json']

{'lang': 'fr',
 'errors': [],
 'signer': 'Myloufa',
 'user_name': 'Aie aie prime',
 'action_complete': True,
 'action_response': 'control_action_is_skip',
 'randomization_arm': 0,
 'randomization_arm_obfuscated': '-'}

In [61]:
EA_df.dtypes

action_object_id            object
created_dt          datetime64[ns]
metadata_json               object
dtype: object

In [62]:
# page_text_check actions of this experiment; only their JSON metadata is fetched.
CAT_q = (
    db.query(ExperimentAction.metadata_json)
    .filter_by(experiment_id=experiment_id, action='page_text_check')
)
CAT_sql = CAT_q.statement.compile(dialect=mysql.dialect())
CAT_df = pd.read_sql(CAT_sql, con=con, params={"experiment_id": experiment_id})

In [63]:
def key_or_nan(d, key):
    """Return ``d[key]``, or NaN when the value cannot be looked up.

    Args:
        d: Mapping to read from. May also be ``None``/NaN (e.g. a NULL JSON
            column or a row left unmatched by a merge).
        key: Key to look up.

    Returns:
        The stored value, or ``float('nan')`` if ``key`` is absent or ``d`` is
        not subscriptable.
    """
    try:
        return d[key]
    except (KeyError, TypeError):
        # TypeError covers None / float NaN metadata, which are not subscriptable.
        return float('nan')

In [64]:
# Lift the completion flag and the action response out of each action's JSON
# metadata; a key missing from a record yields NaN (see key_or_nan).
EA_df['action_complete'] = EA_df['metadata_json'].apply(key_or_nan, args=('action_complete',))
EA_df['action_response'] = EA_df['metadata_json'].apply(key_or_nan, args=('action_response',))

In [65]:
# Expose the user name stored in the JSON metadata of both action tables so
# they can be joined on it later.
EA_df['user_name'] = EA_df['metadata_json'].apply(key_or_nan, args=('user_name',))
CAT_df['user_name'] = CAT_df['metadata_json'].apply(key_or_nan, args=('user_name',))

In [66]:
# Completion flag of each page_text_check action, read from that action's OWN
# metadata. (Previously this read EA_df['metadata_json'], which assigned the
# talk_page_message flags to CAT_df rows by unrelated row index.)
CAT_df['action_complete'] = CAT_df['metadata_json'].apply(lambda d: key_or_nan(d, 'action_complete'))
# Same extraction as in the previous cell; kept so this cell can be re-run on its own.
EA_df['user_name'] = EA_df['metadata_json'].apply(lambda d: key_or_nan(d, 'user_name'))

In [67]:
# Left-join things -> talk_page_message actions (on thing id), then
# -> page_text_check actions (on user name). Left joins keep every thing, so
# action columns are empty for things without a matching action.
# Colliding column names get '_et'/'_ea' in the first join and
# '_no_control_accident' (right side only) in the second.
df = pd.merge(
    ET_df,
    EA_df,
    how='left',
    left_on='thing_id',
    right_on='action_object_id',
    suffixes=('_et', '_ea'),
)
df = pd.merge(
    df,
    CAT_df,
    how='left',
    left_on='user_name',
    right_on='user_name',
    suffixes=('', '_no_control_accident'),
)

In [68]:
df.dtypes

id                                             object
thing_id                                       object
created_dt_et                          datetime64[ns]
randomization_arm                               int64
randomization_block_id                          int64
action_object_id                               object
created_dt_ea                          datetime64[ns]
metadata_json                                  object
action_complete                                  bool
action_response                                object
user_name                                      object
metadata_json_no_control_accident              object
action_complete_no_control_accident            object
dtype: object

In [69]:
df.head(2)

Unnamed: 0,id,thing_id,created_dt_et,randomization_arm,randomization_block_id,action_object_id,created_dt_ea,metadata_json,action_complete,action_response,user_name,metadata_json_no_control_accident,action_complete_no_control_accident
0,user:fr:-Atlantique85-,22651,2020-02-22 18:56:08,2,2160,22651,2020-02-22 18:56:08,"{'lang': 'fr', 'errors': [], 'signer': 'Braaar...",True,"{'edit': {'new': '', 'title': 'Discussion util...",-Atlantique85-,,
1,user:fr:-pi.tor-reip,24139,2020-02-24 13:30:10,1,2291,24139,2020-02-24 13:30:10,"{'lang': 'fr', 'errors': [], 'signer': 'Erdrok...",True,"{'edit': {'new': '', 'title': 'Discussion util...",-pi.tor-reip,,


## Getting external data

In [70]:
from civilservant.wikipedia.queries.revisions import get_timestamps_within_range

from civilservant.wikipedia.utils import make_cached_df

from civilservant.wikipedia.connections.database import make_wmf_con
import datetime

# Connection used for the revision-timestamp queries below (passed as `con`
# to get_timestamps_within_range).
wmf_con = make_wmf_con()

@make_cached_df('fr-user-edits-stop-rule')
def fr_user_edits(user_name, start_date):
    """Query revision timestamps of one 'fr' user for a one-week window.

    The window starts at ``start_date`` and ends seven days later; the query is
    delegated to ``get_timestamps_within_range`` over ``wmf_con``. The call is
    wrapped by ``make_cached_df`` under the name 'fr-user-edits-stop-rule'.
    """
    window_end = start_date + datetime.timedelta(weeks=1)
    return get_timestamps_within_range(
        lang='fr',
        user_name=user_name,
        start_date=start_date,
        end_date=window_end,
        con=wmf_con,
    )

In [75]:
# df_sm = df[:1000]
# df_sm['timestamps']=df_sm.apply(lambda row: fr_user_edits(row['user_name'], row['created_dt_ea']), axis=1)

In [76]:
print(len(df))
# Keep only things created on or before 2020-02-26 (March 4 minus 7 days), so
# every remaining user has a full 7-day observation window.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not trigger pandas' SettingWithCopy ambiguity.
df = df[df['created_dt_et'] <= datetime.datetime(2020, 2, 26)].copy()
print(len(df))

19412
14625


In [77]:
# Fetch, per row, the user's revision timestamps for the week starting at the
# talk_page_message action time (created_dt_ea); each cell holds the result of
# fr_user_edits for that row.
# NOTE(review): the window starts at created_dt_ea, while the 7-day activation
# functions below measure from created_dt_et — confirm the two are meant to
# coincide. created_dt_ea is missing for things with no matching action (left merge).
df['timestamps']=df.apply(lambda row: fr_user_edits(row['user_name'], row['created_dt_ea']), axis=1)

## Analysis
1. reduce df to just those users who registered at least 7 days before March 4th
1. that should be about 14-15k users
1. calculate their 7-day activation
1. group-by randomization arm and take the mean of the 7 day activation rate
1. inspect the 7dar between arm 0 and arm 1
1. conduct ttest on that
1. if the effect is above 2.5% may want to remove mistreated blocks.

In [85]:
s = set()

In [87]:
s.update([1,2])

In [110]:
# Collect the randomization blocks that contain at least one mistreated user.
# Case 1: control arm (randomization_arm == 0) with
#         action_complete_no_control_accident == False.
control_mistreated = df[
    (df['randomization_arm'] == 0)
    & (df['action_complete_no_control_accident'] == False)
]
control_mistreated_block_ids = control_mistreated['randomization_block_id'].to_numpy()
# Case 2: any treatment arm (randomization_arm != 0) with action_complete == False.
treatment_mistreated = df[
    (df['randomization_arm'] != 0)
    & (df['action_complete'] == False)
]
treatment_mistreated_block_ids = treatment_mistreated['randomization_block_id'].to_numpy()
# Union of the block ids from both cases.
bad_blocks = set(control_mistreated_block_ids) | set(treatment_mistreated_block_ids)
print(f'There are {len(bad_blocks)} bad blocks coming from {len(control_mistreated_block_ids)} control mistreated users, and {len(treatment_mistreated_block_ids)} treatment mistreated')

There are 0 bad blocks coming from 0 control mistreated users, and 0 treatment mistreated


In [111]:
orig_df_len = len(df)
# Drop every row whose randomization block contains a mistreated user.
# isin() does the membership test vectorised instead of one Python call per
# row; .copy() keeps later column assignments off a slice of the old frame.
df = df[~df['randomization_block_id'].isin(bad_blocks)].copy()
print(f'After the removing bad blocks we go from {orig_df_len} to {len(df)} users')

After the removing bad blocks we go from 13725 to 13725 users


In [112]:
def seven_day_revs(timestamps, created_dt):
    """Return the rows of ``timestamps`` dated before ``created_dt`` + 7 days.

    Args:
        timestamps: DataFrame with a ``rev_timestamp`` column.
        created_dt: Start of the window (datetime-like).

    Returns:
        The subset of ``timestamps`` whose ``rev_timestamp`` is strictly
        earlier than seven days after ``created_dt``. No lower bound is applied.
    """
    cutoff = created_dt + datetime.timedelta(weeks=1)
    within_window = timestamps['rev_timestamp'].lt(cutoff)
    return timestamps.loc[within_window]

def seven_day_activated(timestamps, created_dt):
    """Return True if ``seven_day_revs`` finds at least one revision."""
    return seven_day_revs(timestamps, created_dt).shape[0] >= 1

def seven_day_count(timestamps, created_dt):
    """Return how many revisions ``seven_day_revs`` finds in the window."""
    return seven_day_revs(timestamps, created_dt).shape[0]

In [113]:
# Count each user's revisions within 7 days of created_dt_et once, then derive
# the activation flag from that count. seven_day_activated() is by definition
# "seven_day_count() > 0", so this avoids filtering every user's revisions twice.
seven_day_rev_counts = df.apply(lambda row: seven_day_count(row['timestamps'], row['created_dt_et']), axis=1)
df['seven_day_activated'] = seven_day_rev_counts > 0
df['seven_day_rev_count'] = seven_day_rev_counts

In [114]:
df['seven_day_activated'].describe()

count     13725
unique        2
top       False
freq       8975
Name: seven_day_activated, dtype: object

In [115]:
# Per-arm 7-day activation rate ('mean' of the boolean flag) and arm size (len).
# The aggregation is given by name instead of `pd.np.mean`: the `pandas.np`
# alias was deprecated in pandas 1.0 and removed in 2.0.
# To also report the mean revision count, add 'seven_day_rev_count': ['mean'].
activation_rates = df.groupby('randomization_arm').agg({'seven_day_activated': ['mean', len]})

In [116]:
print(activation_rates.to_string())

                  seven_day_activated      
                                 mean   len
randomization_arm                          
0                            0.350601  4575
1                            0.343607  4575
2                            0.344044  4575


In [100]:
from scipy.stats import ttest_ind

In [101]:
# Outcome series for arm 0 and arm 1, used as the two samples of the t-tests.
treat_0_activated = df.loc[df['randomization_arm'] == 0, 'seven_day_activated']
treat_1_activated = df.loc[df['randomization_arm'] == 1, 'seven_day_activated']
treat_0_rev_count = df.loc[df['randomization_arm'] == 0, 'seven_day_rev_count']
treat_1_rev_count = df.loc[df['randomization_arm'] == 1, 'seven_day_rev_count']

In [102]:
# Two-sample t-test on the 7-day activation flag, arm 0 vs arm 1.
# NOTE(review): ttest_ind is called with its default settings — confirm those
# are the intended test assumptions for this comparison.
print(ttest_ind(treat_0_activated, treat_1_activated))

Ttest_indResult(statistic=0.7026708219897932, pvalue=0.4822787948988857)


In [103]:
# Two-sample t-test on the 7-day revision count, arm 0 vs arm 1 (default settings).
print(ttest_ind(treat_0_rev_count, treat_1_rev_count))

Ttest_indResult(statistic=1.1724398901228752, pvalue=0.24105102238117437)
