In [1]:
import pandas as pd
import os
import glob
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
from tqdm import tqdm

def get_path(country, week = 'march'):
    """Build the data locations used by this notebook for one country.

    Parameters
    ----------
    country : str
        Country code, used as a directory name inside the data tree.
    week : str, default 'march'
        Sub-folder of the raw treatment tweets directory.

    Returns
    -------
    tuple of str
        ``(path_tw, base, rand, baseline, agg, agg_base)``: raw treatment
        tweets folder, experiment root, randomization parquet file, raw
        baseline tweets folder, treatment pre-process folder and baseline
        pre-process folder. All are relative path strings.
    """
    experiment_root = f'../../data/03-experiment/{country}/'

    raw_treatment_tweets = f'{experiment_root}treatment/followers/00-raw/tweets/{week}/'
    randomization_file = f'../../data/02-randomize/{country}/04-stratification/integrate/followers_randomized.parquet'
    raw_baseline_tweets = f'{experiment_root}baseline/00-raw/followers/tweets/'
    treatment_preprocess = f'{experiment_root}treatment/followers/01-preprocess/'
    baseline_preprocess = f'{experiment_root}baseline/01-preprocess/followers/'

    return (
        raw_treatment_tweets,
        experiment_root,
        randomization_file,
        raw_baseline_tweets,
        treatment_preprocess,
        baseline_preprocess,
    )


def summ_followers2(df):
    """Sum the engagement columns per (handle, author_id) and add a post count.

    The summed columns are every column whose name contains ``'total_'``
    plus ``'verifiability'`` and ``'true'``. ``n_posts`` is the number of
    non-null ``handle`` values per ``author_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``handle``, ``author_id``, ``verifiability``, ``true``
        and the ``total_*`` columns to aggregate.

    Returns
    -------
    pandas.DataFrame
        One row per (handle, author_id) with the summed columns and
        ``n_posts``.
    """
    keys = ['handle', 'author_id']
    summed_cols = [name for name in df.columns if 'total_' in name] + ['verifiability', 'true']

    totals = df[keys + summed_cols].groupby(keys).sum().reset_index()

    # Counting `handle` within each author gives that author's number of posts.
    posts_per_author = (
        df[keys]
        .groupby(['author_id'])
        .count()
        .rename(columns={'handle': 'n_posts'})
    )

    return totals.merge(posts_per_author, on=['author_id'], how='left')

def divide_and_conquer(df_final):
    """Aggregate retweets and non-retweets separately for each follower.

    A row counts as a retweet when its ``text`` contains ``'RT @'`` and
    ``total_comments`` is 0. A row counts as a non-retweet when its ``text``
    does not contain ``'RT @'`` or ``total_comments`` is greater than 0.
    Each subset is aggregated with ``summ_followers2``.

    Parameters
    ----------
    df_final : pandas.DataFrame
        Tweet-level frame with ``text``, ``total_comments`` and the columns
        required by ``summ_followers2``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_rt_agg, df_no_rt_agg)``. In both frames ``handle`` is renamed
        to ``username`` and ``author_id`` to ``follower_id``; the aggregated
        columns carry the suffix ``_rt`` and ``_no_rt`` respectively.
    """
    # na=False: a row with missing text is treated as "not a retweet".
    # Without it the mask holds NaN for such rows and `~mask` raises.
    has_rt_marker = df_final['text'].str.contains(
        'RT @', case=True, regex=False, na=False)
    n_comments = df_final['total_comments']

    df_RT = df_final[has_rt_marker & (n_comments == 0)].reset_index(drop=True)
    df_no_rt = df_final[~has_rt_marker | (n_comments > 0)].reset_index(drop=True)

    df_rt_agg = summ_followers2(df_RT).reset_index(drop=True)
    df_no_rt_agg = summ_followers2(df_no_rt).reset_index(drop=True)

    # Columns that receive the subset suffix; the key columns are excluded.
    cols = ([col for col in df_rt_agg.columns if 'total_' in col]
            + ['verifiability', 'true', 'n_posts'])
    key_names = {'handle': 'username', 'author_id': 'follower_id'}

    df_rt_agg = df_rt_agg.rename(
        columns={col: col + '_rt' for col in df_rt_agg.columns if col in cols}
    ).rename(columns=key_names)

    df_no_rt_agg = df_no_rt_agg.rename(
        columns={col: col + '_no_rt' for col in df_no_rt_agg.columns if col in cols}
    ).rename(columns=key_names)

    return df_rt_agg, df_no_rt_agg


NameError: name 'base' is not defined

In [23]:
country = 'KE'

# Baseline frame for the selected country; the three engagement columns are
# renamed so they carry the `total_` prefix.
base = pd.read_parquet(
    f'../../data/04-analysis/{country}/baseline_rt.parquet'
).rename(columns={'shares_base': 'total_shares_base',
                  'reactions_base': 'total_reactions_base',
                  'comments_base': 'total_comments_base'})

# Country-dependent count: 25 for 'SA', 58 for every other country.
N_ARCHS = 25 if country == 'SA' else 58

## Aggregate 1st month:

dates:
start =  '2023-03-12T00:00:00Z'
end = '2023-04-09T00:00:00Z'

So:

week1-2 (stage1) = '2023-03-13T00:00:00Z' to '2023-03-27T00:00:00Z'
week3-4 (stage2) = '2023-03-27T00:00:00Z' to '2023-04-09T00:00:00Z'

In [3]:
# Read the files
# The second value returned by get_path is the experiment root path. It is
# bound to `base_dir`, not `base`, so that it does not overwrite the baseline
# DataFrame `base` that the merge cells further down operate on.
path_tw, base_dir, rand, baseline, agg, agg_base = get_path(country, 'march')

# Read every shard first and concatenate once; concatenating inside the loop
# re-copies the growing frame on each iteration.
march_shards = [
    pd.read_parquet(f'{agg}predicted/march_{i}.parquet.gzip')
    for i in range(N_ARCHS)
]
df_final = pd.concat(march_shards) if march_shards else pd.DataFrame()

df_final = df_final.reset_index(drop=True)
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10632882 entries, 0 to 10632881
Data columns (total 18 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   id                          object 
 1   handle                      object 
 2   author_id                   object 
 3   created_at                  object 
 4   text                        object 
 5   lang                        object 
 6   referenced_tweets           object 
 7   public_metrics.like_count   int64  
 8   public_metrics.quote_count  int64  
 9   public_metrics.reply_count  int64  
 10  total_shares                int64  
 11  total_reactions             int64  
 12  total_comments              int64  
 13  has_url                     int32  
 14  has_image                   int32  
 15  has_text                    int32  
 16  verifiability               float64
 17  true                        float64
dtypes: float64(2), int32(3), int64(6), object(7)
memory usage: 1

In [17]:
# Only run for KE
# For the handle 'omurung2', `total_shares` is replaced by the retweet count
# stored in the raw tweets file and `total_reactions` is rebuilt from
# likes + quotes + shares. The guard makes the cell safe to run for any
# country, and `df_omu` is loaded here so the cell works on a fresh kernel.
if country == 'KE':
    df_omu = pd.read_parquet(f'{path_tw}omurung2.parquet')
    df_omu = df_omu[['id', 'public_metrics.retweet_count']].rename(
        columns={'public_metrics.retweet_count': 'total_shares'})

    df_check = df_final[df_final['handle'] == 'omurung2']
    df_check = df_check.drop(['total_reactions', 'total_shares'], axis=1)
    df_check = df_check.merge(df_omu, on='id', how='left')
    df_check['total_reactions'] = (df_check['public_metrics.like_count'] +
                                   df_check['public_metrics.quote_count'] +
                                   df_check['total_shares'])

    # Swap the patched rows back in place of the original ones.
    df_final1 = df_final[df_final['handle'] != 'omurung2']
    df_final = pd.concat([df_final1, df_check]).reset_index(drop=True)

In [20]:
# Stage 1 = created strictly between '2023-03-13' and '2023-03-27'
# (comparison on the raw `created_at` values); everything else is stage 2.
# NOTE(review): rows created before 2023-03-13 therefore also get stage 2 -
# confirm this is intended.
created_at = df_final['created_at']
in_stage1 = (created_at > '2023-03-13') & (created_at < '2023-03-27')
df_final['stage'] = np.where(in_stage1, 1, 2)

# Columns aggregated per follower: every `total_*` metric plus the two
# prediction columns.
metrics = [name for name in df_final.columns if 'total_' in name]
cols = metrics + ['verifiability', 'true']

In [21]:
# Stage 1: overall per-follower totals, then the retweet / non-retweet split.
stage1 = df_final.loc[df_final['stage'] == 1]
stage1_agg = summ_followers2(stage1).rename(
    columns={'handle': 'username', 'author_id': 'follower_id'})
df_rt_agg, df_no_rt_agg = divide_and_conquer(stage1)

In [24]:
# Left-join the retweet, non-retweet and overall aggregates onto the baseline
# frame, in that order, so every baseline follower is kept.
for follower_agg in (df_rt_agg, df_no_rt_agg, stage1_agg):
    base = base.merge(follower_agg, on=['follower_id', 'username'], how='left')

In [25]:
# Print the frame summary (columns, non-null counts, dtypes) after the merges.
base.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102534 entries, 0 to 102533
Data columns (total 48 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   username                    102534 non-null  object 
 1   follower_id                 102534 non-null  object 
 2   ads_treatment               102534 non-null  float64
 3   strat_block1                102534 non-null  object 
 4   strat_block2                102534 non-null  object 
 5   c_t_strong_total            102534 non-null  int32  
 6   c_t_weak_total              102534 non-null  int32  
 7   c_t_neither_total           102534 non-null  int32  
 8   t_strong                    102534 non-null  float64
 9   t_weak                      102534 non-null  float64
 10  t_neither                   102534 non-null  float64
 11  total_shares_base           102534 non-null  float64
 12  total_reactions_base        102534 non-null  float64
 13  total_comments

In [26]:
# Columns added by the merges. Every '_no_rt' name also contains '_rt', so a
# single substring filter picks up both groups; dict.fromkeys removes any
# repeated name while keeping the order.
cols_base = list(dict.fromkeys(
    [col for col in base.columns if '_rt' in col] + cols + ['n_posts']
))

# Followers with no match in an aggregate have nulls after the left joins;
# they are set to 0.
base[cols_base] = base[cols_base].fillna(0)

In [27]:
# Print the frame summary again, now that nulls in `cols_base` were set to 0.
base.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102534 entries, 0 to 102533
Data columns (total 48 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   username                    102534 non-null  object 
 1   follower_id                 102534 non-null  object 
 2   ads_treatment               102534 non-null  float64
 3   strat_block1                102534 non-null  object 
 4   strat_block2                102534 non-null  object 
 5   c_t_strong_total            102534 non-null  int32  
 6   c_t_weak_total              102534 non-null  int32  
 7   c_t_neither_total           102534 non-null  int32  
 8   t_strong                    102534 non-null  float64
 9   t_weak                      102534 non-null  float64
 10  t_neither                   102534 non-null  float64
 11  total_shares_base           102534 non-null  float64
 12  total_reactions_base        102534 non-null  float64
 13  total_comments

In [29]:
# Write the merged stage-1 frame to parquet, without the index.
stage1_out = f'{agg}aggregated/stage1_rt.parquet'
base.to_parquet(stage1_out, index=False)

In [30]:
# Re-read the baseline file so `base` is again the unmerged baseline frame,
# with the three engagement columns renamed to carry the `total_` prefix.
base = pd.read_parquet(
    f'../../data/04-analysis/{country}/baseline_rt.parquet'
).rename(columns={'shares_base': 'total_shares_base',
                  'reactions_base': 'total_reactions_base',
                  'comments_base': 'total_comments_base'})

In [31]:
# Stage 2
stage2 = df_final[df_final['stage'] == 2]
stage2_agg = summ_followers2(stage2)
stage2_agg.rename(columns = {'handle':'username', 
                             'author_id':'follower_id'}, inplace = True)
df_rt_agg, df_no_rt_agg = divide_and_conquer(stage2)

In [32]:
# Left-join the retweet, non-retweet and overall stage-2 aggregates onto the
# baseline frame, in that order, so every baseline follower is kept.
for follower_agg in (df_rt_agg, df_no_rt_agg, stage2_agg):
    base = base.merge(follower_agg, on=['follower_id', 'username'], how='left')

In [33]:
# Set nulls to 0 in each column listed in `cols_base`.
for col_name in cols_base:
    is_missing = base[col_name].isnull()
    base[col_name] = np.where(is_missing, 0, base[col_name])

In [35]:
# Write the merged stage-2 frame to parquet, without the index.
stage2_out = f'{agg}aggregated/stage2_rt.parquet'
base.to_parquet(stage2_out, index=False)