In [1]:
import pandas as pd
import os
import glob
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
from tqdm import tqdm

import re

import sys
sys.path.insert(0, '../../../src/utils')
from import_data import *

def convert_to_numeric(characters):
    """Parse a human-formatted count string into a number.

    Thousands separators (``,``) are removed, the unit markers ``K``, ``M``
    and ``mil`` are stripped, and the remaining text is parsed with
    ``pd.to_numeric``. The parsed value is then scaled by the first marker
    found in the original text, checked in this order: ``K`` (x1,000),
    ``mil`` (x1,000), ``M`` (x1,000,000).

    Parameters
    ----------
    characters : str
        Text such as ``'1,234'``, ``'2.5K'``, ``'3mil'`` or ``'2M'``.

    Returns
    -------
    number
        The parsed (and scaled) value, or ``0`` when the input cannot be
        parsed (for example a non-string input or non-numeric text).
    """
    # Checked in order; the first marker present in the raw text wins.
    multipliers = (('K', 1000), ('mil', 1000), ('M', 1000000))
    try:
        without_commas = re.sub(',', '', characters)
        numeric_values = pd.to_numeric(re.sub("K|M|mil", "", without_commas))
        for marker, factor in multipliers:
            if marker in characters:
                numeric_values = numeric_values * factor
                break
    except Exception:
        # Best-effort parsing: anything unparseable counts as 0. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt / SystemExit propagate so a
        # long-running cell can still be interrupted.
        numeric_values = 0
    return numeric_values

def get_path(country, week = 'march'):
    """Assemble the relative data locations used for one country.

    Parameters
    ----------
    country : str
        Country code interpolated into the paths (e.g. ``'KE'``).
    week : str, default 'march'
        Sub-folder name appended to the treatment tweets directory.

    Returns
    -------
    tuple of str
        ``(path_tw, base, rand, baseline, agg, agg_base)``: the treatment
        tweets folder for ``week``, the country's experiment root, the
        randomization parquet file, the baseline raw tweets folder, the
        treatment preprocess folder and the baseline preprocess folder.
    """
    experiment_root = f'../../../data/03-experiment/{country}/'
    randomization_file = (
        f'../../../data/02-randomize/{country}/04-stratification/integrate/followers_randomized.parquet'
    )

    # Sub-folders that all live under the country's experiment root.
    relative_dirs = {
        'tweets': f'treatment/followers/00-raw/tweets/{week}/',
        'baseline': 'baseline/00-raw/followers/tweets/',
        'agg': 'treatment/followers/01-preprocess/',
        'agg_base': 'baseline/01-preprocess/followers/',
    }
    resolved = {key: experiment_root + suffix for key, suffix in relative_dirs.items()}

    return (
        resolved['tweets'],
        experiment_root,
        randomization_file,
        resolved['baseline'],
        resolved['agg'],
        resolved['agg_base'],
    )


def summ_followers2(df):
    """Aggregate interaction counts per ``username``.

    The like-count column (``like_count`` if present, otherwise
    ``public_metrics.like_count``) is renamed to ``total_likes``. For every
    category flag in (``verifiability``, ``eng``, ``non_ver``, ``true``,
    ``fake``) and every metric in (likes, shares, comments, reactions) a
    column ``t_<category>_<metric>`` is created as
    ``df[<category>] * df['total_<metric>']``. The four ``total_*`` columns
    and all columns whose name starts with ``t_`` are then summed per
    ``username``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``username``, a like-count column, ``total_shares``,
        ``total_comments``, ``total_reactions`` and the five category
        columns listed above.

    Returns
    -------
    pandas.DataFrame
        One row per ``username`` with the summed ``total_*`` and ``t_*``
        columns.
    """
    metrics = ['likes', 'shares', 'comments', 'reactions']
    categories = ['verifiability', 'eng', 'non_ver', 'true', 'fake']

    like_source = 'like_count' if 'like_count' in df.columns else 'public_metrics.like_count'
    # rename() returns a new frame, so the columns added below do not touch
    # the caller's DataFrame.
    df = df.rename(columns={like_source: 'total_likes'})

    for category in categories:
        for metric in metrics:
            df[f't_{category}_{metric}'] = df[category] * df[f'total_{metric}']

    total_cols = [f'total_{metric}' for metric in metrics]
    # Match on the 't_' prefix. A substring test ('t_' in col) would also pick
    # up unrelated columns such as 'retweet_count' or 'tweet_id' and sum them.
    weighted_cols = [col for col in df.columns if col.startswith('t_')]

    df_agg = (
        df[['username'] + total_cols + weighted_cols]
        .groupby(['username'])
        .sum()
        .reset_index()
    )

    return df_agg

In [2]:
# For each country: build per-username baseline interaction totals and write
# them, merged onto the follower treatment table, to
# data/04-analysis/<country>/baseline/baseline_batch2_interactions.parquet.
# All names assigned here are notebook globals and keep the values from the
# last country processed once the loop ends.
for country in ['KE', 'SA']:
    # Get the treatment information from each follower
    base1 = pd.read_parquet(f'../../../data/04-analysis/{country}/treatment_info/information_batch2.parquet')

    # These are only observations from April or March 2023, we need to complete them using the correct cases from Joaquin
    # get_data_base_batch22 is not defined in this notebook; presumably it comes
    # from the `from import_data import *` at the top.
    # NOTE(review): base_path goes up four directory levels while every other
    # path in this cell goes up three -- confirm that is what the helper expects.
    df_final = get_data_base_batch22(country,  base_path = '../../../../')
    # fake = 1 when verifiability == 1 and true == 0, otherwise 0 ...
    df_final['fake'] = np.where((df_final['verifiability'] == 1) & (df_final['true'] == 0), 1, 0)
    # eng = 1 when lang == 'en', otherwise 0.
    df_final['eng'] = np.where((df_final['lang'] == 'en'), 1, 0)
    # ... then fake is set to NaN wherever verifiability == 0.
    df_final['fake'] = np.where((df_final['verifiability'] == 0), np.nan, df_final['fake'])
    # non_ver = 1 when verifiability == 0. The extra not-null test is redundant:
    # a missing value never compares equal to 0.
    df_final['non_ver'] = np.where((df_final['verifiability'] == 0) & (~df_final['verifiability'].isnull()),
                                   1, 0)
    # Attach `username` by matching author_id to base1.follower_id (left join),
    # then drop both id columns; rows without a match get a null username.
    df_final = df_final.merge(base1[['follower_id', 'username']], left_on = 'author_id', 
                          right_on = 'follower_id', how = 'left').drop(['author_id', 'follower_id'], axis = 1)
    
    # Country-specific cutoff: only rows dated strictly before it are kept below.
    if country == 'SA':
        date_base = '2023-04-01'
    else: 
        date_base = '2023-03-01'
    
    # This can potentially change:
    df_f = pd.read_parquet(f'../../../data/03-experiment/{country}/treatment/followers/01-preprocess/correct_cases_final.parquet.gzip')
    # NOTE(review): `date` is compared against a 'YYYY-MM-DD' string; confirm the
    # column's dtype makes this comparison behave as intended.
    df_f = df_f[df_f['date']<date_base].drop(['date', 'reposted'], axis = 1)
    df_f['eng'] = np.where((df_f['lang'] == 'en'), 1, 0)
    #df_f['author_id'] = df_f['username']
    
    # Keep rows whose `type` is null (the variable name suggests these are the
    # non-retweet rows -- verify against the data).
    df_f_no_RT = df_f[df_f['type'].isnull()].drop(['type'], axis = 1)
    
    print('Aggregation:')
    #df_final['author_id'] = df_final['username']

    # Keep a row when its text does not contain the literal, case-sensitive
    # string 'RT @', OR when it has total_comments > 0.
    # NOTE(review): no `na=` is passed to str.contains, so a null `text` would
    # put a missing value into the mask -- confirm `text` is never null.
    df_no_rt = df_final[(~df_final['text'].str.contains('RT @', 
                    case=True, regex=False)) | 
                    (df_final['total_comments'] > 0)].reset_index(drop=True)

    # Stack the pre-cutoff "correct cases" rows on top of the filtered rows.
    df_no_rt = pd.concat([df_f_no_RT, df_no_rt]).reset_index(drop=True)

    # One row per username with summed interaction columns.
    df_no_rt_agg = summ_followers2(df_no_rt).reset_index(drop=True)

    # Columns to suffix: any aggregated column whose name contains 't_'
    # (substring test) plus the four total_* columns.
    cols = ([col for col in df_no_rt_agg.columns if 't_' in col] + 
            ['total_likes', 'total_shares', 'total_comments', 'total_reactions'])

    # Append '_base' to those columns.
    df_no_rt_agg.rename(columns = 
        {col: col + '_base' for col in df_no_rt_agg.columns if col in cols}, 
        inplace=True)

    #df_no_rt_agg = df_no_rt_agg.drop(['author_id'], axis=1)

    # Left join keeps every row of base1; usernames with no aggregated row get
    # nulls in the *_base columns.
    base1 = base1.merge(df_no_rt_agg, on=['username'], 
                  how='left')

    # Replace nulls with 0 in every base1 column whose name contains '_base'.
    cols_base = [col for col in base1.columns if '_base' in col]
    for x in cols_base:
        base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
    base1.to_parquet(f'../../../data/04-analysis/{country}/baseline/baseline_batch2_interactions.parquet')
    # Progress marker: the country that was just written.
    print(country)

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:29<00:00,  2.10s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:25<00:00,  1.80s/it]


Aggregation:
KE


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.19s/it]


Aggregation:
SA


In [3]:
# `base1` is the value left over from the per-country loop above, i.e. the
# merged frame for the last country in its list ('SA'); this cell depends on
# that loop having been run first.
base1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28641 entries, 0 to 28640
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   username                        28641 non-null  object 
 1   follower_id                     28641 non-null  object 
 2   blockid1                        27569 non-null  float64
 3   blockid2                        27569 non-null  float64
 4   ads_treatment                   28641 non-null  float64
 5   id                              28641 non-null  object 
 6   c_t_strong_total                28641 non-null  int32  
 7   c_t_weak_total                  28641 non-null  int32  
 8   c_t_neither_total               28641 non-null  int32  
 9   t_strong                        28641 non-null  float64
 10  t_weak                          28641 non-null  float64
 11  t_neither                       28641 non-null  float64
 12  strat_block1                    