In [2]:
import pandas as pd
import os
import glob
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
from tqdm import tqdm

def get_path(country, week = 'march'):
    """Build the data locations used for one country's follower data.

    Args:
        country: Country code inserted into the directory names
            (this notebook passes 'KE' and 'SA').
        week: Name of the sub-folder under the treatment tweets directory.

    Returns:
        A 6-tuple of path strings, in this order:
        treatment raw-tweets folder for ``week``, experiment root folder,
        randomization parquet file, baseline raw-tweets folder,
        treatment preprocess folder, baseline preprocess folder.
    """
    # Everything except the randomization file hangs off this folder.
    experiment_root = f'../../data/03-experiment/{country}/'

    randomization_file = (
        f'../../data/02-randomize/{country}/04-stratification/integrate/followers_randomized.parquet'
    )

    treatment_tweets = experiment_root + f'treatment/followers/00-raw/tweets/{week}/'
    treatment_preprocess = experiment_root + 'treatment/followers/01-preprocess/'

    baseline_tweets = experiment_root + 'baseline/00-raw/followers/tweets/'
    baseline_preprocess = experiment_root + 'baseline/01-preprocess/followers/'

    return (
        treatment_tweets,
        experiment_root,
        randomization_file,
        baseline_tweets,
        treatment_preprocess,
        baseline_preprocess,
    )

def summ_followers(df):
    """Collapse row-level data into one row per (handle, author_id).

    Args:
        df: DataFrame with 'handle', 'author_id', 'verifiability' and 'true'
            columns, plus any number of columns whose name contains 'total_'.

    Returns:
        DataFrame with, per (handle, author_id): the sum of every 'total_*'
        column and of 'verifiability'/'true' (original names), the mean of
        the same columns (suffix '_mean'), and 'n_posts'.
        'n_posts' is the count of non-null handles per author_id, i.e. it is
        computed per author and not per (handle, author_id) pair.
    """
    keys = ['handle', 'author_id']
    value_cols = [name for name in df.columns if 'total_' in name]
    value_cols.extend(['verifiability', 'true'])

    # A single grouping feeds both the totals and the averages.
    by_account = df[keys + value_cols].groupby(keys)

    totals = by_account.sum().reset_index()
    averages = by_account.mean().reset_index().rename(
        columns={name: name + '_mean' for name in value_cols})

    # Stays indexed by author_id; the merge below matches that index level
    # against the 'author_id' column of the totals frame.
    post_counts = (
        df[keys]
        .groupby(['author_id'])
        .count()
        .rename(columns={'handle': 'n_posts'})
    )

    return (
        totals
        .merge(averages, on=keys, how='left')
        .merge(post_counts, on=['author_id'], how='left')
    )


def summ_followers2(df):
    """Sum the metric columns per (handle, author_id) and attach a post count.

    Same aggregation as ``summ_followers`` but without the '_mean' columns.

    Args:
        df: DataFrame with 'handle', 'author_id', 'verifiability' and 'true'
            columns, plus any number of columns whose name contains 'total_'.

    Returns:
        DataFrame with one row per (handle, author_id) holding the summed
        'total_*', 'verifiability' and 'true' columns and 'n_posts', the
        count of non-null handles per author_id.
    """
    keys = ['handle', 'author_id']
    value_cols = [name for name in df.columns if 'total_' in name]
    value_cols.extend(['verifiability', 'true'])

    totals = df[keys + value_cols].groupby(keys).sum().reset_index()

    # Indexed by author_id; merged against the 'author_id' column below.
    post_counts = (
        df[keys]
        .groupby(['author_id'])
        .count()
        .rename(columns={'handle': 'n_posts'})
    )

    return totals.merge(post_counts, on=['author_id'], how='left')


In [3]:
# KENYA BASELINE:
# Aggregate each predicted baseline chunk with summ_followers, then stack the
# per-chunk results into a single frame.
path_tw, base, rand, baseline, agg, agg_base = get_path('KE', 'march')

# Collect the per-chunk aggregates and concatenate once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration and
# starts from an empty DataFrame, which newer pandas versions warn about.
chunk_aggs = []
for i in range(0, 84):  # chunks baseline_0 .. baseline_83
    df = pd.read_parquet(f'{agg_base}predicted/baseline_{i}.parquet.gzip')
    df_agg = summ_followers(df)
    chunk_aggs.append(df_agg)
df_final = pd.concat(chunk_aggs)

# Keep the first row seen for each handle.
# NOTE(review): if one handle's rows are spread over several chunk files, only
# the aggregate from the first of those files survives this step -- confirm
# that the chunks are split by follower.
df_final = df_final.drop_duplicates(['handle']).reset_index(drop=True)


In [None]:
# Rows whose 'verifiability' equals 0 get NaN for 'true'; all other rows keep
# their value.
zero_verifiability = df_final['verifiability'] == 0
df_final['true'] = np.where(zero_verifiability, np.nan, df_final['true'])

# Write the aggregated baseline next to the predicted chunks.
baseline_out = f'{agg_base}aggregated/baseline.parquet.gzip'
df_final.to_parquet(baseline_out, compression='gzip', index=False)

In [2]:
# SA BASELINE:
# Aggregate each predicted baseline chunk with summ_followers, stack the
# results, and write the aggregated baseline file.
path_tw, base, rand, baseline, agg, agg_base = get_path('SA', 'march')

# Collect the per-chunk aggregates and concatenate once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration and
# starts from an empty DataFrame, which newer pandas versions warn about.
chunk_aggs = []
for i in tqdm(range(0, 74)):  # chunks baseline_0 .. baseline_73
    df = pd.read_parquet(f'{agg_base}predicted/baseline_{i}.parquet.gzip')
    df_agg = summ_followers(df)
    chunk_aggs.append(df_agg)
df_final = pd.concat(chunk_aggs)

# Keep the first row seen for each handle.
# NOTE(review): if one handle's rows are spread over several chunk files, only
# the aggregate from the first of those files survives this step -- confirm
# that the chunks are split by follower.
df_final = df_final.drop_duplicates(['handle']).reset_index(drop=True)

# Rows whose 'verifiability' equals 0 get NaN for 'true'.
df_final['true'] = np.where(df_final['verifiability'] == 0, np.nan, df_final['true'])
df_final.to_parquet(f'{agg_base}aggregated/baseline.parquet.gzip',
        compression='gzip', index=False)

100%|██████████| 74/74 [00:54<00:00,  1.37it/s]


In [4]:
### Separating posts from RTs
country = 'SA'
path_tw, base, rand, baseline, agg, agg_base = get_path(country, 'march')

# Number of predicted baseline chunk files per country (baseline_0 ..
# baseline_{n-1}). Looking it up from `country` replaces the previous habit of
# commenting one loop header out and the other in when switching country.
n_chunks = {'KE': 84, 'SA': 74}[country]

# Read every chunk without aggregating, then concatenate once; concatenating
# inside the loop re-copies the growing frame on every iteration.
row_chunks = []
for i in tqdm(range(0, n_chunks)):
    df = pd.read_parquet(f'{agg_base}predicted/baseline_{i}.parquet.gzip')
    row_chunks.append(df)
df_final = pd.concat(row_chunks)

100%|██████████| 74/74 [02:06<00:00,  1.70s/it]


In [5]:
df_final = df_final.reset_index(drop = True)

# Literal, case-sensitive search for 'RT @' anywhere in the text; evaluated
# once and reused for both subsets.
has_rt_marker = df_final['text'].str.contains('RT @', case=True, regex=False)
comment_count = df_final['total_comments']

# Retweets: text contains 'RT @' and the row has zero comments.
df_RT = df_final[has_rt_marker & (comment_count == 0)].reset_index(drop=True)

# Everything else: no 'RT @' in the text, or at least one comment.
df_no_rt = df_final[~has_rt_marker | (comment_count > 0)].reset_index(drop=True)

In [6]:
# Sanity check: the two subsets together should account for every row.
len(df_final) == len(df_RT) + len(df_no_rt)

True

In [7]:
# Per-(handle, author_id) sums and post counts, computed separately for the
# retweet subset and for the remaining rows.
df_rt_agg, df_no_rt_agg = [
    summ_followers2(subset).reset_index(drop=True)
    for subset in (df_RT, df_no_rt)
]

In [8]:
# Columns that receive a suffix: every 'total_*' metric plus the three summary
# columns. Names that already end in '_rt_base' are skipped so that re-running
# this cell does not stack a second suffix onto columns renamed by an earlier
# run (the list is derived from df_rt_agg, which this cell itself renames).
cols = [col for col in df_rt_agg.columns
        if 'total_' in col and not col.endswith('_rt_base')]
cols += ['verifiability', 'true', 'n_posts']

# Identifier columns are renamed to the names used as merge keys later on.
id_names = {'handle': 'username', 'author_id': 'follower_id'}

# rename() ignores keys that are not present, so on a re-run (when the frames
# already carry the new names) both statements are no-ops.
df_rt_agg = df_rt_agg.rename(
    columns={**{col: col + '_rt_base' for col in cols}, **id_names})

df_no_rt_agg = df_no_rt_agg.rename(
    columns={**{col: col + '_no_rt_base' for col in cols}, **id_names})

In [10]:
## Merging with Treatment data set:
# From here on `base` is a DataFrame; it replaces the path string of the same
# name returned by get_path earlier in the notebook.
base = pd.read_parquet(f'../../data/04-analysis/{country}/baseline_features.parquet')
# NOTE: `filter` shadows the Python builtin of the same name; the variable
# name is kept so that any other cell reading it keeps working.
filter = pd.read_parquet(f'../../data/04-analysis/{country}/baseline_features_filter.parquet')

# Select and rename in one chain so that `base` is a fresh frame rather than a
# column-subset slice that is then modified in place.
base = (
    base[['username', 'follower_id', 'ads_treatment',
          'strat_block1', 'strat_block2', 'c_t_strong_total',
          'c_t_weak_total', 'c_t_neither_total', 't_strong',
          't_weak', 't_neither', 'total_shares_sum',
          'total_reactions_sum', 'total_comments_sum',
          'verifiability_base', 'n_posts_base', 'true_base']]
    .rename(columns = {'total_reactions_sum': 'reactions_base',
                       'total_shares_sum':'shares_base',
                       'total_comments_sum': 'comments_base'})
)

# bot_account is 1 where the filter file has no 'n_posts_base' value and 0
# otherwise. assign() builds the column on a new frame, avoiding the
# SettingWithCopyWarning raised by assigning into a column-subset slice.
filter = (
    filter[['username', 'follower_id', 'n_posts_base']]
    .assign(bot_account=lambda frame: np.where(frame['n_posts_base'].isnull(), 1, 0))
    [['follower_id', 'bot_account']]
)

# Left join: every row of `base` is kept.
# NOTE(review): assumes 'follower_id' is unique in the filter file; duplicate
# ids there would duplicate rows of `base` -- confirm.
base = base.merge(filter, on = 'follower_id', how = 'left')

base.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44201 entries, 0 to 44200
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   username            44201 non-null  object 
 1   follower_id         44201 non-null  object 
 2   ads_treatment       44201 non-null  float64
 3   strat_block1        44201 non-null  object 
 4   strat_block2        44201 non-null  object 
 5   c_t_strong_total    44201 non-null  int32  
 6   c_t_weak_total      44201 non-null  int32  
 7   c_t_neither_total   44201 non-null  int32  
 8   t_strong            44201 non-null  float64
 9   t_weak              44201 non-null  float64
 10  t_neither           44201 non-null  float64
 11  shares_base         36800 non-null  float64
 12  reactions_base      36800 non-null  float64
 13  comments_base       36800 non-null  float64
 14  verifiability_base  36800 non-null  float64
 15  n_posts_base        44201 non-null  float64
 16  true

In [11]:
# Merge:
# Attach the retweet aggregates first, then the non-retweet aggregates. Both
# are left joins, so rows of `base` without a match get NaN in the new columns.
merge_keys = ['follower_id', 'username']
for agg_frame in (df_rt_agg, df_no_rt_agg):
    base = base.merge(agg_frame, on=merge_keys, how='left')

In [12]:
# Replace missing values with 0 in every column whose name contains '_base'.
# NOTE(review): 'true_base' matches this pattern too, so any NaN it carried is
# turned into 0 here -- confirm that is intended.
cols_base = [name for name in base.columns if '_base' in name]
for x in cols_base:
    current = base[x]
    base[x] = np.where(current.isnull(), 0, current)

In [13]:
# Write the merged table to this country's 04-analysis folder.
baseline_rt_path = f'../../data/04-analysis/{country}/baseline_rt.parquet'
base.to_parquet(baseline_rt_path)