In [3]:
import pandas as pd
import os
import glob
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
from tqdm import tqdm
import sys
import re

sys.path.insert(0, '../../../src/utils')
from import_data import *

def convert_to_numeric(characters):
    """Convert a human-formatted count string into a number.

    Commas are stripped, the suffix markers ``K``, ``M`` and ``mil`` are
    removed, and the remaining text is parsed with ``pd.to_numeric``. The
    parsed value is then scaled according to the first marker found in the
    original string, checked in this order:

    * ``'K'``   -> multiplied by 1,000
    * ``'mil'`` -> multiplied by 1,000
    * ``'M'``   -> multiplied by 1,000,000

    Parameters
    ----------
    characters : str
        Text such as ``'1,234'``, ``'1.5K'``, ``'3mil'`` or ``'2M'``.

    Returns
    -------
    number
        The parsed (and scaled) value, or ``0`` when the input cannot be
        processed (for example unparseable text or a non-string input).
    """
    try:
        without_commas = re.sub(',', '', characters)
        numeric_values = pd.to_numeric(re.sub("K|M|mil", "", without_commas))
        if 'K' in characters:
            numeric_values *= 1000
        elif 'mil' in characters:
            numeric_values *= 1000
        elif 'M' in characters:
            numeric_values *= 1000000
    except Exception:
        # Best-effort parsing: bad input maps to 0. `Exception` (rather than a
        # bare `except`) lets KeyboardInterrupt / SystemExit propagate.
        numeric_values = 0
    return numeric_values

def get_path(country, week = 'march'):
    """Build the relative data paths used for one country.

    Parameters
    ----------
    country : str
        Folder name inserted into each path (the notebook uses 'KE' and 'SA').
    week : str, default 'march'
        Sub-folder name under the raw treatment tweets directory.

    Returns
    -------
    tuple of str
        ``(path_tw, base, rand, baseline, agg, agg_base)``: the raw treatment
        tweets folder for ``week``, the experiment root for the country, the
        randomized-followers parquet file, the raw baseline tweets folder, the
        preprocessed treatment folder and the preprocessed baseline folder.
    """
    experiment_root = f'../../../data/03-experiment/{country}/'
    randomized_file = f'../../../data/02-randomize/{country}/04-stratification/integrate/followers_randomized.parquet'
    return (
        experiment_root + f'treatment/followers/00-raw/tweets/{week}/',
        experiment_root,
        randomized_file,
        experiment_root + 'baseline/00-raw/followers/tweets/',
        experiment_root + f'treatment/followers/01-preprocess/',
        experiment_root + 'baseline/01-preprocess/followers/',
    )


def summ_followers2(df):
    """Sum interaction counts per username, overall and split by tweet flag.

    For every flag column in (verifiability, eng, non_ver, true, fake) and
    every interaction target in (smi, ac), a product column
    ``t_<flag>_<target> = df[<flag>] * df['interaction_<target>']`` is added.
    The two ``interaction_*`` columns plus every column whose name contains
    the substring ``'t_'`` are then summed within each ``username``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``username``, ``interaction_smi``, ``interaction_ac`` and
        the five flag columns. A ``like_count`` (or, failing that,
        ``public_metrics.like_count``) column is renamed to ``total_likes``.

    Returns
    -------
    pandas.DataFrame
        One row per ``username`` with the summed columns. The frame passed in
        is not modified; the work happens on the renamed copy.
    """
    likes_column = 'like_count' if 'like_count' in df.columns else 'public_metrics.like_count'
    df = df.rename(columns={likes_column: 'total_likes'})

    # Same creation order as writing the ten assignments out by hand:
    # flag by flag, 'smi' before 'ac'.
    for flag in ('verifiability', 'eng', 'non_ver', 'true', 'fake'):
        for target in ('smi', 'ac'):
            df[f't_{flag}_{target}'] = df[flag] * df[f'interaction_{target}']

    # Substring match: picks up the product columns above and any other
    # column that happens to contain 't_'.
    summed_columns = ['interaction_smi', 'interaction_ac']
    summed_columns += [name for name in df.columns if 't_' in name]

    per_user = df[['username'] + summed_columns].groupby(['username']).sum()
    return per_user.reset_index()
    
def divide_and_conquer(df_final):
    """Aggregate per-username interaction counts over retweet rows only.

    Keeps the rows of ``df_final`` whose ``text`` contains the literal,
    case-sensitive substring ``'RT @'`` (anywhere in the text) and passes them
    to ``summ_followers2``.

    Rows whose ``text`` is missing are treated as non-matching (``na=False``).
    Without that, ``str.contains`` yields NaN for them and the boolean mask
    cannot be applied.

    Parameters
    ----------
    df_final : pandas.DataFrame
        Tweet-level frame with a ``text`` column plus the columns
        ``summ_followers2`` needs.

    Returns
    -------
    pandas.DataFrame
        The per-username aggregate returned by ``summ_followers2``.
    """
    is_retweet = df_final['text'].str.contains('RT @', case=True,
                                               regex=False, na=False)
    retweets = df_final[is_retweet].reset_index(drop=True)

    return summ_followers2(retweets).reset_index(drop=True)

# Load the RandomizedTwitterSample workbooks for KE and SA.
# NOTE(review): smi_ke / smi_ke_1 (and smi_sa / smi_sa_1) read a file of the
# same name from two differently named project folders
# ("social-media-influencers-africa" vs "social-media-influencers-af") --
# confirm both copies are meant to be included.
smi_ke = pd.read_excel(
    "~/Dropbox/Bolivia_Project/social-media-influencers-africa/data/02-randomize/KE/03-assignment/output/RandomizedTwitterSampleKE.xlsx"
)
smi_ke_1 = pd.read_excel(
    "~/Dropbox/Bolivia_Project/social-media-influencers-af/data/02-randomize/KE/03-assignment/output/RandomizedTwitterSampleKE.xlsx"
)
smi_ke_2 = pd.read_excel(
    "~/Dropbox/Bolivia_Project/social-media-influencers-af/data/02-randomize/KE/03-assignment/output/RandomizedTwitterSampleKE_batch2.xlsx"
)

smi_sa = pd.read_excel(
    "~/Dropbox/Bolivia_Project/social-media-influencers-africa/data/02-randomize/SA/03-assignment/output/RandomizedTwitterSampleSA.xlsx"
)
smi_sa_1 = pd.read_excel(
    "~/Dropbox/Bolivia_Project/social-media-influencers-af/data/02-randomize/SA/03-assignment/output/RandomizedTwitterSampleSA.xlsx"
)
smi_sa_2 = pd.read_excel(
    "~/Dropbox/Bolivia_Project/social-media-influencers-af/data/02-randomize/SA/03-assignment/output/RandomizedTwitterSampleSA_batch2.xlsx"
)

# Stack all six samples and keep only the rows with treatment == 1; their
# usernames are the accounts matched against retweets further down.
smis = (
    pd.concat([smi_ke, smi_ke_1, smi_ke_2, smi_sa, smi_sa_1, smi_sa_2],
              ignore_index=True)
    .loc[lambda frame: frame['treatment'] == 1]
)
smis_list = list(smis['username'])

In [5]:
# Stage 1-2: for each country, count per follower the retweets of AfricaCheck
# and of the accounts in `smis_list` (overall and split by tweet flag), attach
# the counts to the baseline frame and write the result to the stage1_2 folder.
for country in ['KE', 'SA']:
    base1 = pd.read_parquet(f'../../../data/04-analysis/{country}/baseline/baseline_batch2_smi_ac.parquet')
    # NOTE(review): base_path climbs four levels while every other path in this
    # notebook climbs three -- confirm this is what the helper expects.
    stage1_2 = get_data_stage12_batch2(country, base_path = '../../../../')
    # fake = 1 when verifiability == 1 and true == 0, otherwise 0 ...
    stage1_2['fake'] = np.where((stage1_2['verifiability'] == 1) & (stage1_2['true'] == 0), 1, 0)
    stage1_2['eng'] = np.where((stage1_2['lang2'] == 'en'), 1, 0)
    # ... then NaN where verifiability == 0.
    stage1_2['fake'] = np.where((stage1_2['verifiability'] == 0), np.nan, stage1_2['fake'])
    # non_ver = 1 when verifiability == 0. The isnull() guard is redundant:
    # a missing value already fails the == 0 comparison.
    stage1_2['non_ver'] = np.where((stage1_2['verifiability'] == 0) & (~stage1_2['verifiability'].isnull()),
                                   1, 0)
    # Attach the baseline username to each tweet via author_id == follower_id,
    # then drop tweets whose author is not in the baseline frame.
    stage1_2 = stage1_2.merge(base1[['follower_id', 'username']], left_on = 'author_id', 
                          right_on = 'follower_id', how = 'left').drop(['author_id', 'follower_id'], axis = 1)
    stage1_2 = stage1_2[~stage1_2['username'].isnull()]
    # Extract the retweeted account's handle from texts that start with "RT @handle:"
    stage1_2['reposted'] = stage1_2['text'].str.extract(r'^RT @(\w+):')
    # Where no handle was extracted, fall back to the tweet author's own username
    stage1_2['reposted'] = stage1_2['reposted'].fillna(stage1_2['username'])
    # Exact, case-sensitive matches on the extracted handle.
    stage1_2['interaction_ac'] = (stage1_2['reposted'] == 'AfricaCheck').astype(int)
    stage1_2['interaction_smi'] = (stage1_2['reposted'].isin(smis_list)).astype(int)
    stage1_2 = stage1_2.drop(['reposted'], axis = 1)
    # NOTE(review): `cols` is not used again in this cell.
    cols = [col for col in stage1_2.columns if 't_' in col]
    #stage1_2['author_id'] = stage1_2['username']
    # divide_and_conquer keeps only rows whose text contains 'RT @' before aggregating.
    df_no_rt_agg = divide_and_conquer(stage1_2)
    
    base1 = base1.merge(df_no_rt_agg, on=['username'], 
                  how='left')

    # The substring test 't_' selects the aggregated t_* columns and also any
    # baseline column whose name contains 't_'.
    cols_base = (['interaction_smi', 'interaction_ac'] + 
            [col for col in base1.columns if 't_' in col])

    # Usernames with no aggregated rows get NaN from the left merge; replace with 0.
    for x in cols_base:
        base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
    base1.to_parquet(f'../../../data/04-analysis/{country}/stage1_2/verifiability_batch2_smi_ac.parquet', 
                index=False)
    
# Stages 3-4 and 5-6: per-follower interaction counts from the treatment-period
# tweets, computed once per date window and written to that stage's folder.
for country in ['KE', 'SA']:
    df_f = pd.read_parquet(f'../../../data/03-experiment/{country}/treatment/followers/01-preprocess/correct_cases_final.parquet.gzip')
    df_f['eng'] = np.where((df_f['lang'] == 'en'), 1, 0)
    # Exact, case-sensitive matches on the `reposted` handle.
    df_f['interaction_ac'] = (df_f['reposted'] == 'AfricaCheck').astype(int)
    df_f['interaction_smi'] = (df_f['reposted'].isin(smis_list)).astype(int)

    # One pass per window: (output folder, exclusive lower bound, exclusive upper bound).
    # NOTE(review): both bounds are strict comparisons against date-only strings.
    # If `date` carries a time component, rows dated 2023-06-25 satisfy both
    # windows -- confirm the dtype/format of `date`.
    for stage_dir, date_after, date_before in [
        ('stage3_4', '2023-05-28', '2023-06-26'),
        ('stage5_6', '2023-06-25', '2023-07-23'),
    ]:
        # Start every window from a fresh copy of the baseline frame.
        base1 = pd.read_parquet(f'../../../data/04-analysis/{country}/baseline/baseline_batch2_smi_ac.parquet')

        # The working names (`stage3_4`, `stage3_4_no_RT`, `df_no_rt_agg`) are
        # reused for both windows so the notebook namespace stays the same;
        # they hold whichever window is currently being processed.
        stage3_4 = df_f[(df_f['date']<date_before) &
                    (df_f['date']>date_after)].drop(['date', 'reposted'], axis = 1)
        # non_ver = 1 when verifiability == 0 (the isnull() guard is redundant).
        stage3_4['non_ver'] = np.where((stage3_4['verifiability'] == 0) & (~stage3_4['verifiability'].isnull()),
                                       1, 0)
        # Keep rows with a non-null `type` (labelled "only retweets" in the
        # original cell), then drop the column.
        stage3_4_no_RT = stage3_4[~stage3_4['type'].isnull()].drop(['type'], axis = 1)
        df_no_rt_agg = summ_followers2(stage3_4_no_RT).reset_index(drop=True)

        base1 = base1.merge(df_no_rt_agg, on=['username'],
                      how='left')

        # The substring test 't_' selects the aggregated t_* columns and also
        # any baseline column whose name contains 't_'.
        cols_base = (['interaction_smi', 'interaction_ac'] +
                [col for col in base1.columns if 't_' in col])

        # Usernames with no rows in the window get NaN from the left merge; use 0.
        for x in cols_base:
            base1[x] = np.where(base1[x].isnull(), 0, base1[x])

        base1.to_parquet(f'../../../data/04-analysis/{country}/{stage_dir}/verifiability_batch2_smi_ac.parquet',
                    index=False)
    print(country)

KE
SA


In [7]:
# Print column names, dtypes and non-null counts for the current `base1`.
# NOTE(review): execution counts jump from In [5] to In [7], so `base1` may have
# been modified by a cell that is not shown here -- re-run from a fresh kernel to confirm.
base1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28641 entries, 0 to 28640
Data columns (total 38 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   username                  28641 non-null  object 
 1   follower_id               28641 non-null  object 
 2   blockid1                  27569 non-null  float64
 3   blockid2                  27569 non-null  float64
 4   ads_treatment             28641 non-null  float64
 5   id                        28641 non-null  object 
 6   c_t_strong_total          28641 non-null  int32  
 7   c_t_weak_total            28641 non-null  int32  
 8   c_t_neither_total         28641 non-null  int32  
 9   t_strong                  28641 non-null  float64
 10  t_weak                    28641 non-null  float64
 11  t_neither                 28641 non-null  float64
 12  strat_block1              28641 non-null  object 
 13  strat_block2              28641 non-null  object 
 14  intera