In [2]:
import pandas as pd
import os
import glob
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
from tqdm import tqdm

def get_path(country, week = 'march'):
    """Build the data locations used for one country's follower experiment.

    Parameters
    ----------
    country : str
        Country code interpolated into every path (e.g. 'KE').
    week : str, default 'march'
        Sub-folder name of the raw treatment tweets.

    Returns
    -------
    tuple of str
        ``(path_tw, base, rand, baseline, agg, agg_base)``:
        raw treatment tweets folder, experiment root, randomization parquet
        file, raw baseline tweets folder, preprocessed treatment folder and
        preprocessed baseline folder. Folder paths end with a '/'.
    """
    experiment_root = f'../../data/03-experiment/{country}/'
    randomization_file = f'../../data/02-randomize/{country}/04-stratification/integrate/followers_randomized.parquet'

    # Everything below hangs off the per-country experiment root.
    treatment_tweets = experiment_root + f'treatment/followers/00-raw/tweets/{week}/'
    treatment_preprocessed = experiment_root + f'treatment/followers/01-preprocess/'
    baseline_tweets = experiment_root + 'baseline/00-raw/followers/tweets/'
    baseline_preprocessed = experiment_root + 'baseline/01-preprocess/followers/'

    return (
        treatment_tweets,
        experiment_root,
        randomization_file,
        baseline_tweets,
        treatment_preprocessed,
        baseline_preprocessed,
    )


def summ_followers2(df):
    """Summarise post-level rows into one row per (handle, author_id).

    Every column whose name contains 'total_', plus 'verifiability' and
    'true', is summed within each (handle, author_id) pair. An 'n_posts'
    column is then attached: the number of rows with a non-null 'handle'
    for that 'author_id' (counted per author, not per pair).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'handle', 'author_id', 'verifiability' and 'true'.

    Returns
    -------
    pandas.DataFrame
        Columns: 'handle', 'author_id', the summed columns, 'n_posts'.
    """
    keys = ['handle', 'author_id']
    value_cols = [name for name in df.columns if 'total_' in name]
    value_cols.extend(['verifiability', 'true'])

    totals = df[keys + value_cols].groupby(keys).sum().reset_index()

    # count() tallies the non-null 'handle' entries of each author.
    posts_per_author = (
        df[keys]
        .groupby(['author_id'])
        .count()
        .rename({'handle': 'n_posts'}, axis=1)
    )

    return totals.merge(posts_per_author, on=['author_id'], how='left')

def divide_and_conquer(df_final):
    """Split posts into pure retweets and the rest, then summarise each group.

    A post is a pure retweet when its 'text' contains the literal 'RT @'
    (case-sensitive) and its 'total_comments' is 0. The other group holds
    posts without that marker or with 'total_comments' greater than 0.
    Each group is summarised with ``summ_followers2``.

    Parameters
    ----------
    df_final : pandas.DataFrame
        Post-level rows with 'text' and 'total_comments', plus the columns
        ``summ_followers2`` needs.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_rt_agg, df_no_rt_agg)``. The summed columns and 'n_posts' carry
        the suffix '_rt' / '_no_rt'; 'handle' is renamed to 'username' and
        'author_id' to 'follower_id'.
    """
    # na=False: a post with missing text counts as "no RT marker". Without it
    # str.contains yields NaN for such rows and the `~` below raises on the
    # resulting object-dtype mask.
    has_rt_marker = df_final['text'].str.contains(
        'RT @', case=True, regex=False, na=False)

    df_RT = df_final[has_rt_marker &
                     (df_final['total_comments'] == 0)].reset_index(drop=True)
    df_no_rt = df_final[~has_rt_marker |
                        (df_final['total_comments'] > 0)].reset_index(drop=True)

    df_rt_agg = summ_followers2(df_RT).reset_index(drop=True)
    df_no_rt_agg = summ_followers2(df_no_rt).reset_index(drop=True)

    # Value columns that receive the group suffix; the key columns do not.
    cols = [col for col in df_rt_agg.columns if 'total_' in col] + ['verifiability', 'true', 'n_posts']
    key_names = {'handle': 'username', 'author_id': 'follower_id'}

    df_rt_agg = (
        df_rt_agg
        .rename(columns={col: col + '_rt' for col in df_rt_agg.columns if col in cols})
        .rename(columns=key_names)
    )
    df_no_rt_agg = (
        df_no_rt_agg
        .rename(columns={col: col + '_no_rt' for col in df_no_rt_agg.columns if col in cols})
        .rename(columns=key_names)
    )

    return df_rt_agg, df_no_rt_agg


# Change Country here
country = 'KE'
base1 = pd.read_parquet(f'../../data/04-analysis/{country}/baseline_batch2.parquet')

# n_base is the number of predicted baseline batch files read by the loading
# loop (baseline2_batch2_0 ... baseline2_batch2_{n_base - 1}).
if country == 'KE':
    n_end = 9
    n_base = 14
else:
    n_end = 7
    # n_base was previously left undefined on this branch, so the baseline
    # loading loop raised NameError for every country other than 'KE'.
    # Count the batch files on disk instead.
    # NOTE(review): assumes the files are numbered contiguously from 0 —
    # confirm, or replace with the known per-country count.
    _baseline_agg_dir = get_path(country)[5]
    n_base = len(glob.glob(
        _baseline_agg_dir + 'predicted/baseline2_batch2_[0-9]*.parquet.gzip'))

In [3]:
### Baseline:
# Paths for the country selected above (the notebook was last run with KE).
path_tw, base, rand, baseline, agg, agg_base = get_path(country, 'march')

# Read all batches first and concatenate once: calling pd.concat inside the
# loop re-copies the growing frame on every iteration.
batches = []
for i in tqdm(range(0, n_base)):
    df1 = pd.read_parquet(f'{agg_base}predicted/baseline2_batch2_{i}.parquet.gzip')
    batches.append(df1)

# pd.concat raises on an empty list; keep the empty frame the incremental
# version produced when there is nothing to read.
df_final = pd.concat(batches) if batches else pd.DataFrame()

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:25<00:00,  1.84s/it]


In [4]:
# Copy the author id into 'handle' so that the (handle, author_id) grouping
# in summ_followers2 yields one row per author.
df_final['handle'] = df_final['author_id']

# na=False: a post with missing text counts as "no RT marker". Without it
# str.contains yields NaN for such rows and the `~` below raises on the
# resulting object-dtype mask.
has_rt_marker = df_final['text'].str.contains(
    'RT @', case=True, regex=False, na=False)

# Pure retweets: 'RT @' in the text and zero comments. The other group holds
# posts without the marker or with at least one comment.
df_RT = df_final[has_rt_marker & (df_final['total_comments'] == 0)]
df_RT = df_RT.reset_index(drop=True)

df_no_rt = df_final[~has_rt_marker |
                    (df_final['total_comments'] > 0)].reset_index(drop=True)

df_agg = summ_followers2(df_final).reset_index(drop=True)
df_rt_agg = summ_followers2(df_RT).reset_index(drop=True)
df_no_rt_agg = summ_followers2(df_no_rt).reset_index(drop=True)

# Value columns that receive a baseline suffix (key columns are excluded).
cols = [col for col in df_rt_agg.columns if 'total_' in col] + ['verifiability', 'true', 'n_posts']


def _as_baseline_columns(summary, suffix, value_cols):
    """Suffix the value columns, rename 'author_id' to 'follower_id', drop 'handle'."""
    return (
        summary
        .rename(columns={col: col + suffix for col in summary.columns if col in value_cols})
        .rename(columns={'author_id': 'follower_id'})
        .drop(['handle'], axis=1)
    )


df_agg = _as_baseline_columns(df_agg, '_base', cols)
df_rt_agg = _as_baseline_columns(df_rt_agg, '_rt_base', cols)
df_no_rt_agg = _as_baseline_columns(df_no_rt_agg, '_no_rt_base', cols)

base1 = pd.read_parquet(f'../../data/04-analysis/{country}/baseline_batch2.parquet')

# Left merges keep every row of base1. The order (retweets, all posts,
# non-retweets) fixes the column order of the saved file.
for summary in (df_rt_agg, df_agg, df_no_rt_agg):
    base1 = base1.merge(summary, on=['follower_id'],
                        how='left')

# Followers with no rows in a summary come out of the left merge as NaN;
# store 0 instead. This touches every column whose name contains '_base'.
cols_base = [col for col in base1.columns if '_base' in col]
for col in cols_base:
    base1[col] = base1[col].fillna(0)

out_path = f'../../data/04-analysis/{country}/baseline/baseline_batch2_april.parquet'
# to_parquet does not create missing folders.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
base1.to_parquet(out_path)

In [5]:
# Show the merged baseline table (last expression of the cell, so the notebook renders it).
# NOTE(review): the rendered output lists usernames and follower ids — consider
# clearing or redacting it before sharing the notebook.
base1

Unnamed: 0,username,follower_id,blockid1,blockid2,ads_treatment,id,c_t_strong_total,c_t_weak_total,c_t_neither_total,t_strong,...,total_comments_base,verifiability_base,true_base,n_posts_base,total_shares_no_rt_base,total_reactions_no_rt_base,total_comments_no_rt_base,verifiability_no_rt_base,true_no_rt_base,n_posts_no_rt_base
0,BrianKiprich,1251939178264354820,477.0,840.0,1.0,11111111,0,1,0,0.0,...,1.0,0.0,0.0,3.0,0.0,6.0,1.0,0.0,0.0,3.0
1,ShakuStein,1252137750398197762,442.0,763.0,0.0,11111111,0,1,0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,wiginabush,1252197986551169025,344.0,546.0,0.0,11111111,0,1,0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Dr_WandaD,1252998418026442757,477.0,840.0,0.0,11111111,0,1,0,0.0,...,0.0,0.0,0.0,3.0,3.0,9.0,0.0,0.0,0.0,2.0
4,Diana_210817,1253082749679939587,467.0,817.0,1.0,11111111,0,1,0,0.0,...,30.0,29.0,10.0,192.0,11.0,115.0,30.0,2.0,0.0,81.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40189,dnlkym,99741477,325.0,504.0,1.0,22222222,0,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40190,alliewanjiru,998212432903462912,522.0,942.0,0.0,22222222,0,1,6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40191,bawahbbc,998687786143186944,136.0,83.0,0.0,22222222,0,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40192,ck_nelsn,998796884000075776,633.0,1188.0,0.0,22222222,0,2,3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
