In [1]:
import pandas as pd
import os
import glob
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
from tqdm import tqdm
import sys
import re

sys.path.insert(0, '../../../src/utils')
from import_data import *

def convert_to_numeric(characters):
    """Parse an abbreviated count string such as '1,234', '1.5K', '3mil' or '2M'.

    Thousands separators (',') are removed, the suffix markers 'K', 'M' and
    'mil' are stripped, and the remaining text is parsed with
    ``pd.to_numeric``. 'K' and 'mil' scale the value by 1,000 and 'M' by
    1,000,000.

    Parameters
    ----------
    characters : str
        Text to parse.

    Returns
    -------
    number
        The parsed value, or 0 when the input cannot be parsed (non-string
        input, empty text, unexpected characters).
    """
    try:
        without_commas = re.sub(',', '', characters)
        numeric_values = pd.to_numeric(re.sub("K|M|mil", "", without_commas))
        if 'K' in characters or 'mil' in characters:
            numeric_values *= 1000
        elif 'M' in characters:
            numeric_values *= 1000000
    except Exception:
        # Best-effort parsing: anything unparseable counts as 0. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate instead of being silently turned into 0.
        numeric_values = 0
    return numeric_values

def get_path(country, week = 'march'):
    """Build the relative data locations for one country.

    Returns a 6-tuple, in this order: the raw treatment tweets folder for
    ``week``, the country's experiment root, the randomized-followers parquet
    file, the raw baseline tweets folder, the treatment preprocess folder and
    the baseline preprocess folder.
    """
    experiment_root = f'../../../data/03-experiment/{country}/'
    treatment_followers = f'{experiment_root}treatment/followers/'
    return (
        f'{treatment_followers}00-raw/tweets/{week}/',
        experiment_root,
        f'../../../data/02-randomize/{country}/04-stratification/integrate/followers_randomized.parquet',
        f'{experiment_root}baseline/00-raw/followers/tweets/',
        f'{treatment_followers}01-preprocess/',
        f'{experiment_root}baseline/01-preprocess/followers/',
    )


def summ_followers2(df):
    """Sum interaction counts per user, overall and weighted by each flag.

    For every flag in (verifiability, eng, non_ver, true, fake) and every
    metric in (likes, shares, comments, reactions) a column
    ``t_<flag>_<metric>`` = ``df[flag] * df['total_<metric>']`` is built, and
    everything is then summed within ``username``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``username``, the five flag columns, ``total_shares``,
        ``total_comments``, ``total_reactions`` and a like count named
        ``like_count`` or ``public_metrics.like_count`` (renamed to
        ``total_likes``; if neither is present ``total_likes`` must already
        exist). The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``username`` with the four ``total_*`` sums followed by
        every column whose name starts with ``t_``.
    """
    if 'like_count' in df.columns:
        df = df.rename(columns={'like_count': 'total_likes'})
    else:
        df = df.rename(columns={'public_metrics.like_count': 'total_likes'})

    flags = ['verifiability', 'eng', 'non_ver', 'true', 'fake']
    metrics = ['likes', 'shares', 'comments', 'reactions']
    for flag in flags:
        for metric in metrics:
            # NaN flags (e.g. `fake` on non-verifiable rows) yield NaN here
            # and are skipped by the groupby sum below.
            df[f't_{flag}_{metric}'] = df[flag] * df[f'total_{metric}']

    total_cols = [f'total_{metric}' for metric in metrics]
    # Prefix match on purpose: a substring test ('t_' in col) would also pick
    # up unrelated columns such as 'retweet_count' or 'tweet_id'.
    weighted_cols = [col for col in df.columns if col.startswith('t_')]

    df_agg = (df[['username'] + total_cols + weighted_cols]
              .groupby(['username'])
              .sum()
              .reset_index())

    return df_agg
    
def divide_and_conquer(df_final):
    """Drop retweets without comments and aggregate the rest per user.

    A row is kept when its ``text`` does not contain the literal 'RT @'
    (case-sensitive) or when ``total_comments`` is greater than 0. The kept
    rows are passed to ``summ_followers2``.

    Parameters
    ----------
    df_final : pandas.DataFrame
        Must contain ``text`` and ``total_comments`` plus every column
        ``summ_followers2`` requires.

    Returns
    -------
    pandas.DataFrame
        The per-username aggregate returned by ``summ_followers2``.
    """
    # na=False: a missing text is treated as "not a retweet". Without it
    # str.contains returns NaN for those rows and the `~` / boolean mask fails.
    is_retweet = df_final['text'].str.contains('RT @', case=True,
                                               regex=False, na=False)
    has_comments = df_final['total_comments'] > 0
    df_no_rt = df_final[~is_retweet | has_comments].reset_index(drop=True)

    df_no_rt_agg = summ_followers2(df_no_rt).reset_index(drop=True)

    return df_no_rt_agg

In [2]:
# Stage 1-2: for each country, aggregate per-user interaction totals from the
# stage 1-2 tweets, attach them to the baseline frame and write the result to
# 04-analysis/<country>/stage1_2/.
for country in ['KE', 'SA']:
    base1 = pd.read_parquet(f'../../../data/04-analysis/{country}/baseline/baseline_batch2_interactions.parquet')
    # get_data_stage12_batch2 is not defined in this notebook; presumably it
    # comes from the star import of import_data (output schema not visible here).
    # NOTE(review): base_path climbs four levels while every other path in this
    # notebook climbs three - confirm this is what the helper expects.
    stage1_2 = get_data_stage12_batch2(country, base_path = '../../../../')
    # fake: 1 when verifiability == 1 and true == 0, else 0 ...
    stage1_2['fake'] = np.where((stage1_2['verifiability'] == 1) & (stage1_2['true'] == 0), 1, 0)
    stage1_2['eng'] = np.where((stage1_2['lang2'] == 'en'), 1, 0)
    # ... then NaN wherever verifiability == 0, so those rows are skipped when
    # the fake-weighted columns are summed.
    stage1_2['fake'] = np.where((stage1_2['verifiability'] == 0), np.nan, stage1_2['fake'])
    # non_ver: 1 only where verifiability is exactly 0 (missing values give 0).
    stage1_2['non_ver'] = np.where((stage1_2['verifiability'] == 0) & (~stage1_2['verifiability'].isnull()),
                                   1, 0)
    # Map author_id -> username through the baseline frame; both id columns
    # are dropped after the merge.
    stage1_2 = stage1_2.merge(base1[['follower_id', 'username']], left_on = 'author_id', 
                          right_on = 'follower_id', how = 'left').drop(['author_id', 'follower_id'], axis = 1)
    # Keep only rows whose author was found in the baseline frame.
    stage1_2 = stage1_2[~stage1_2['username'].isnull()]
    # NOTE(review): `cols` is not used anywhere below in this cell.
    cols = [col for col in stage1_2.columns if 't_' in col]
    # Retweets without comments are dropped and the rest summed per username.
    df_no_rt_agg = divide_and_conquer(stage1_2)
    
    base1 = base1.merge(df_no_rt_agg, on=['username'], 
                  how='left')

    # The totals plus every base1 column whose name contains 't_'.
    # NOTE(review): this is a substring match, so baseline columns that merely
    # contain 't_' are zero-filled as well - confirm that is intended.
    cols_base = (['total_likes', 'total_shares', 'total_comments', 'total_reactions'] + 
            [col for col in base1.columns if 't_' in col])

    # Users without aggregated rows get NaN from the left merge; store 0 instead.
    for x in cols_base:
        base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
    base1.to_parquet(f'../../../data/04-analysis/{country}/stage1_2/verifiability_batch2_interactions.parquet', 
                index=False)
    
# Stages 3-4 and 5-6

# Stages 3-4 and 5-6: the same pipeline is run once per stage window, so the
# windows are listed here instead of copy-pasting the code per stage.
# Each entry is (output folder, exclusive lower date bound, exclusive upper date bound).
# NOTE(review): the bounds are compared as strings against df_f['date']. If
# that column carries a time of day, timestamps during 2023-06-25 satisfy both
# windows - confirm the column is date-only.
STAGE_WINDOWS = [
    ('stage3_4', '2023-05-28', '2023-06-26'),
    ('stage5_6', '2023-06-25', '2023-07-23'),
]

for country in ['KE', 'SA']:
    df_f = pd.read_parquet(f'../../../data/03-experiment/{country}/treatment/followers/01-preprocess/correct_cases_final.parquet.gzip')
    df_f['eng'] = np.where((df_f['lang'] == 'en'), 1, 0)

    for stage, after, before in STAGE_WINDOWS:
        # Start every stage from a fresh copy of the baseline frame.
        base1 = pd.read_parquet(f'../../../data/04-analysis/{country}/baseline/baseline_batch2_interactions.parquet')

        in_window = (df_f['date'] < before) & (df_f['date'] > after)
        stage_tweets = df_f[in_window].drop(['date', 'reposted'], axis = 1)
        # non_ver: 1 only where verifiability is exactly 0 (missing values give 0).
        stage_tweets['non_ver'] = np.where((stage_tweets['verifiability'] == 0) &
                                           (~stage_tweets['verifiability'].isnull()),
                                           1, 0)

        # Only rows with a null `type` are aggregated; the rows with a
        # non-null `type` were previously split off into an unused frame,
        # which has been removed.
        stage_no_rt = stage_tweets[stage_tweets['type'].isnull()].drop(['type'], axis = 1)
        df_no_rt_agg = summ_followers2(stage_no_rt).reset_index(drop=True)

        base1 = base1.merge(df_no_rt_agg, on=['username'],
                            how='left')

        # The totals plus every base1 column whose name contains 't_'
        # (substring match, kept as in the stage 1-2 cell so outputs agree).
        cols_base = (['total_likes', 'total_shares', 'total_comments', 'total_reactions'] +
                     [col for col in base1.columns if 't_' in col])

        # Users without aggregated rows get NaN from the left merge; store 0 instead.
        for x in cols_base:
            base1[x] = np.where(base1[x].isnull(), 0, base1[x])

        base1.to_parquet(f'../../../data/04-analysis/{country}/{stage}/verifiability_batch2_interactions.parquet',
                         index=False)
    print(country)

KE
SA


In [None]:
# Column/dtype summary of the frame left over from the loop above: base1 still
# holds the last frame written there (country 'SA', stage5_6).
base1.info()