In [1]:
import pandas as pd
import os
import glob
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
from tqdm import tqdm
import sys
import re

sys.path.insert(0, '../../../src/utils')
from import_data import *

def summ_followers2(df):
    """Aggregate post-level rows into one row per (handle, author_id).

    Sums every column whose name contains 'total_' together with
    'verifiability', 'non_ver', 'true' and 'fake', then attaches an
    'n_posts' column.

    Note that 'n_posts' is computed per 'author_id' alone (the number of
    rows with a non-null 'handle' for that author), so an author that
    appears under several handles gets the same count on each of its rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'handle', 'author_id', 'verifiability', 'non_ver',
        'true' and 'fake'; any 'total_*' columns are picked up by name.

    Returns
    -------
    pandas.DataFrame
        Columns: 'handle', 'author_id', the summed columns, 'n_posts'.
    """
    keys = ['handle', 'author_id']

    # Columns to sum: engagement totals (matched by name) plus the flags.
    summed = [name for name in df.columns if 'total_' in name]
    summed += ['verifiability', 'non_ver', 'true', 'fake']

    totals = df[keys + summed].groupby(keys).sum().reset_index()

    # Row count per author; 'author_id' stays on the index and is matched
    # by name in the merge below.
    post_counts = (
        df[keys]
        .groupby(['author_id'])
        .count()
        .rename(columns={'handle': 'n_posts'})
    )

    return totals.merge(post_counts, on=['author_id'], how='left')

# Recommended: run this for one country first, restart the kernel, and then run it for the other country:

In [3]:
# Build baseline_batch1.parquet for one country: aggregate the baseline posts
# per (handle, author_id) three ways (all posts, retweets without comments,
# everything else) and left-join the totals onto information_batch1.parquet.
country = 'SA'

base1 = pd.read_parquet(f'../../../data/04-analysis/{country}/treatment_info/information_batch1.parquet')

df_final = get_baseline_data_b1(country, type_data = 'predicted', base_path = '../../../')

# Post-level flags.
#   fake    : 1 if verifiability == 1 and true == 0; NaN if verifiability == 0;
#             0 otherwise.
#   non_ver : 1 if verifiability == 0, else 0.
# A separate null check is unnecessary: NaN == 0 already evaluates to False.
verifiable = df_final['verifiability'] == 1
not_verifiable = df_final['verifiability'] == 0
df_final['fake'] = np.where(verifiable & (df_final['true'] == 0), 1, 0)
df_final['fake'] = np.where(not_verifiable, np.nan, df_final['fake'])
df_final['non_ver'] = np.where(not_verifiable, 1, 0)

# Scan the text once and reuse the mask for both subsets.
has_rt_marker = df_final['text'].str.contains('RT @', case=True, regex=False)

# Rows whose text contains 'RT @' and that have zero total_comments ...
df_RT = df_final[has_rt_marker & (df_final['total_comments'] == 0)].reset_index(drop=True)
# ... versus rows without 'RT @' or with at least one comment.
df_no_rt = df_final[~has_rt_marker | (df_final['total_comments'] > 0)].reset_index(drop=True)

df_final_agg = summ_followers2(df_final).reset_index(drop=True)
df_rt_agg = summ_followers2(df_RT).reset_index(drop=True)
df_no_rt_agg = summ_followers2(df_no_rt).reset_index(drop=True)

# Columns that receive a suffix. df_RT and df_no_rt are row subsets of
# df_final, so the three aggregated frames share the same column names.
cols = [col for col in df_final_agg.columns if 'total_' in col] + ['verifiability', 'non_ver',
                                                                   'true', 'fake',
                                                                   'n_posts']

# Key columns are renamed to match the join keys used on base1 below.
key_renames = {'handle': 'username', 'author_id': 'follower_id'}

# Each mapping is built from the frame being renamed (the original built the
# df_final_agg mapping from df_rt_agg.columns).
df_final_agg = df_final_agg.rename(
    columns={**{col: col + '_base' for col in df_final_agg.columns if col in cols},
             **key_renames})

df_rt_agg = df_rt_agg.rename(
    columns={**{col: col + '_rt_base' for col in df_rt_agg.columns if col in cols},
             **key_renames})

df_no_rt_agg = df_no_rt_agg.rename(
    columns={**{col: col + '_no_rt_base' for col in df_no_rt_agg.columns if col in cols},
             **key_renames})

# Left joins keep every row of base1; rows with no match get NaN here.
for agg in (df_final_agg, df_rt_agg, df_no_rt_agg):
    base1 = base1.merge(agg, on=['follower_id', 'username'],
                        how='left')

# Replace missing values with 0 in every column whose name contains '_base'.
cols_base = [col for col in base1.columns if '_base' in col]
for col in cols_base:
    base1[col] = base1[col].fillna(0)

base1.to_parquet(f'../../../data/04-analysis/{country}/baseline/baseline_batch1.parquet')

100%|██████████████████████████████████████████████████████████████████████████████████| 74/74 [01:54<00:00,  1.55s/it]


In [2]:
# Build baseline_batch1.parquet for one country: aggregate the baseline posts
# per (handle, author_id) three ways (all posts, retweets without comments,
# everything else) and left-join the totals onto information_batch1.parquet.
country = 'KE'

base1 = pd.read_parquet(f'../../../data/04-analysis/{country}/treatment_info/information_batch1.parquet')

df_final = get_baseline_data_b1(country, type_data = 'predicted', base_path = '../../../')

# Post-level flags.
#   fake    : 1 if verifiability == 1 and true == 0; NaN if verifiability == 0;
#             0 otherwise.
#   non_ver : 1 if verifiability == 0, else 0.
# A separate null check is unnecessary: NaN == 0 already evaluates to False.
verifiable = df_final['verifiability'] == 1
not_verifiable = df_final['verifiability'] == 0
df_final['fake'] = np.where(verifiable & (df_final['true'] == 0), 1, 0)
df_final['fake'] = np.where(not_verifiable, np.nan, df_final['fake'])
df_final['non_ver'] = np.where(not_verifiable, 1, 0)

# Scan the text once and reuse the mask for both subsets.
has_rt_marker = df_final['text'].str.contains('RT @', case=True, regex=False)

# Rows whose text contains 'RT @' and that have zero total_comments ...
df_RT = df_final[has_rt_marker & (df_final['total_comments'] == 0)].reset_index(drop=True)
# ... versus rows without 'RT @' or with at least one comment.
df_no_rt = df_final[~has_rt_marker | (df_final['total_comments'] > 0)].reset_index(drop=True)

df_final_agg = summ_followers2(df_final).reset_index(drop=True)
df_rt_agg = summ_followers2(df_RT).reset_index(drop=True)
df_no_rt_agg = summ_followers2(df_no_rt).reset_index(drop=True)

# Columns that receive a suffix. df_RT and df_no_rt are row subsets of
# df_final, so the three aggregated frames share the same column names.
cols = [col for col in df_final_agg.columns if 'total_' in col] + ['verifiability', 'non_ver', 'true', 'fake',
                                                                   'n_posts']

# Key columns are renamed to match the join keys used on base1 below.
key_renames = {'handle': 'username', 'author_id': 'follower_id'}

# Each mapping is built from the frame being renamed (the original built the
# df_final_agg mapping from df_rt_agg.columns).
df_final_agg = df_final_agg.rename(
    columns={**{col: col + '_base' for col in df_final_agg.columns if col in cols},
             **key_renames})

df_rt_agg = df_rt_agg.rename(
    columns={**{col: col + '_rt_base' for col in df_rt_agg.columns if col in cols},
             **key_renames})

df_no_rt_agg = df_no_rt_agg.rename(
    columns={**{col: col + '_no_rt_base' for col in df_no_rt_agg.columns if col in cols},
             **key_renames})

# Left joins keep every row of base1; rows with no match get NaN here.
for agg in (df_final_agg, df_rt_agg, df_no_rt_agg):
    base1 = base1.merge(agg, on=['follower_id', 'username'],
                        how='left')

# Replace missing values with 0 in every column whose name contains '_base'.
cols_base = [col for col in base1.columns if '_base' in col]
for col in cols_base:
    base1[col] = base1[col].fillna(0)

base1.to_parquet(f'../../../data/04-analysis/{country}/baseline/baseline_batch1.parquet')

100%|██████████████████████████████████████████████████████████████████████████████████| 84/84 [03:47<00:00,  2.71s/it]
