In [1]:
import pandas as pd
import os
import glob
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
from tqdm import tqdm
import sys
import re

sys.path.insert(0, '../../../src/utils')
from import_data import *

def summ_followers2(df):
    """Aggregate post-level engagement into one row per ``author_id``.

    The like counter (``like_count`` or, failing that,
    ``public_metrics.like_count``) is renamed to ``total_likes``. Each of the
    five post-level flags (``verifiability``, ``eng``, ``non_ver``, ``true``,
    ``fake``) is then multiplied by each of the four engagement counters
    (``total_likes``, ``total_shares``, ``total_comments``,
    ``total_reactions``), producing columns named ``t_<flag>_<metric>``.
    Finally the four counters and the twenty products are summed per author.

    Parameters
    ----------
    df : pandas.DataFrame
        Post-level frame. Must contain ``author_id``, the five flag columns,
        ``total_shares``, ``total_comments``, ``total_reactions`` and one of
        the two like-count columns. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``author_id`` with columns ``author_id``, the four
        ``total_*`` sums and the twenty ``t_*`` sums, in that order.

    Raises
    ------
    KeyError
        If any required column is missing.

    Notes
    -----
    Only the columns generated here are aggregated. Selecting columns by the
    substring ``'t_'`` would also pick up unrelated inputs such as
    ``tweet_id`` or ``retweet_count``.
    """
    metrics = ['total_likes', 'total_shares', 'total_comments', 'total_reactions']
    flags = ['verifiability', 'eng', 'non_ver', 'true', 'fake']

    # rename() returns a new frame, so the caller's DataFrame is left untouched.
    like_source = 'like_count' if 'like_count' in df.columns else 'public_metrics.like_count'
    df = df.rename(columns={like_source: 'total_likes'})

    # Flag-weighted engagement, e.g. t_fake_likes = fake * total_likes.
    # A NaN flag gives a NaN product, which pandas' default sum skips.
    weighted = {
        f"t_{flag}_{metric[len('total_'):]}": df[flag] * df[metric]
        for flag in flags
        for metric in metrics
    }
    df = df.assign(**weighted)

    # Explicit list of the columns to aggregate (no substring matching).
    cols = metrics + list(weighted)
    df_agg = df[['author_id'] + cols].groupby(['author_id']).sum().reset_index()

    return df_agg

# Recommendation: run the pipeline for one country, restart the kernel, and then run it for the other country.

In [2]:
country = 'SA'

# Batch-1 treatment information table; baseline interaction totals are merged onto it below.
base1 = pd.read_parquet(f'../../../data/04-analysis/{country}/treatment_info/information_batch1.parquet')

# Post-level baseline data (helper presumably provided by the import_data star import).
df_final = get_baseline_data_b1(country, type_data = 'predicted', base_path = '../../../')

is_verifiable = df_final['verifiability'] == 1
is_non_verifiable = df_final['verifiability'] == 0

# fake: 1 for verifiable posts with true == 0, NaN for non-verifiable posts, 0 otherwise.
df_final['fake'] = np.where(is_verifiable & (df_final['true'] == 0), 1, 0)
df_final['eng'] = np.where(df_final['lang'] == 'en', 1, 0)
df_final['fake'] = np.where(is_non_verifiable, np.nan, df_final['fake'])
# A missing verifiability never compares equal to 0, so no separate null check is needed.
df_final['non_ver'] = np.where(is_non_verifiable, 1, 0)

# Keep posts whose text does not contain 'RT @', plus any post that received comments.
# na=False makes a missing text count as "no match", so the mask is purely boolean
# and can be negated safely.
is_retweet = df_final['text'].str.contains('RT @', case=True, regex=False, na=False)
df_no_rt = df_final[~is_retweet | (df_final['total_comments'] > 0)].reset_index(drop=True)

df_no_rt_agg = summ_followers2(df_no_rt).reset_index(drop=True)

# Every aggregated column except the key gets the '_base' suffix.
cols = [col for col in df_no_rt_agg.columns if col != 'author_id']
df_no_rt_agg = df_no_rt_agg.rename(
    columns={**{col: col + '_base' for col in cols}, 'author_id': 'follower_id'})

base1 = base1.merge(df_no_rt_agg, on=['follower_id'], how='left')

# Followers without any baseline post have no match in the left merge: treat them as zero activity.
cols_base = [col + '_base' for col in cols]
base1[cols_base] = base1[cols_base].fillna(0)

base1.to_parquet(f'../../../data/04-analysis/{country}/baseline/baseline_batch1_interactions.parquet')

100%|██████████████████████████████████████████████████████████████████████████████████| 74/74 [02:17<00:00,  1.86s/it]


In [2]:
country = 'KE'

# Batch-1 treatment information table; baseline interaction totals are merged onto it below.
base1 = pd.read_parquet(f'../../../data/04-analysis/{country}/treatment_info/information_batch1.parquet')

# Post-level baseline data (helper presumably provided by the import_data star import).
df_final = get_baseline_data_b1(country, type_data = 'predicted', base_path = '../../../')

is_verifiable = df_final['verifiability'] == 1
is_non_verifiable = df_final['verifiability'] == 0

# fake: 1 for verifiable posts with true == 0, NaN for non-verifiable posts, 0 otherwise.
df_final['fake'] = np.where(is_verifiable & (df_final['true'] == 0), 1, 0)
df_final['eng'] = np.where(df_final['lang'] == 'en', 1, 0)
df_final['fake'] = np.where(is_non_verifiable, np.nan, df_final['fake'])
# A missing verifiability never compares equal to 0, so no separate null check is needed.
df_final['non_ver'] = np.where(is_non_verifiable, 1, 0)

# Keep posts whose text does not contain 'RT @', plus any post that received comments.
# na=False makes a missing text count as "no match", so the mask is purely boolean
# and can be negated safely.
is_retweet = df_final['text'].str.contains('RT @', case=True, regex=False, na=False)
df_no_rt = df_final[~is_retweet | (df_final['total_comments'] > 0)].reset_index(drop=True)

df_no_rt_agg = summ_followers2(df_no_rt).reset_index(drop=True)

# Every aggregated column except the key gets the '_base' suffix.
cols = [col for col in df_no_rt_agg.columns if col != 'author_id']
df_no_rt_agg = df_no_rt_agg.rename(
    columns={**{col: col + '_base' for col in cols}, 'author_id': 'follower_id'})

base1 = base1.merge(df_no_rt_agg, on=['follower_id'], how='left')

# Followers without any baseline post have no match in the left merge: treat them as zero activity.
cols_base = [col + '_base' for col in cols]
base1[cols_base] = base1[cols_base].fillna(0)

base1.to_parquet(f'../../../data/04-analysis/{country}/baseline/baseline_batch1_interactions.parquet')

100%|██████████████████████████████████████████████████████████████████████████████████| 84/84 [04:06<00:00,  2.94s/it]


In [3]:
# Sanity check: print the schema (columns, non-null counts, dtypes) of the merged
# table `base1` produced by the most recently executed country cell.
base1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102534 entries, 0 to 102533
Data columns (total 36 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   username                        102534 non-null  object 
 1   follower_id                     102534 non-null  object 
 2   ads_treatment                   102534 non-null  float64
 3   id                              102534 non-null  object 
 4   c_t_strong_total                102534 non-null  int32  
 5   c_t_weak_total                  102534 non-null  int32  
 6   c_t_neither_total               102534 non-null  int32  
 7   t_strong                        102534 non-null  float64
 8   t_weak                          102534 non-null  float64
 9   t_neither                       102534 non-null  float64
 10  strat_block1                    102534 non-null  object 
 11  strat_block2                    102534 non-null  object 
 12  total_likes_base