In [1]:
import pandas as pd
import os
import glob
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
from tqdm import tqdm
import sys
import re

sys.path.insert(0, '../../../src/utils')
from import_data import *

def convert_to_numeric(characters):
    """Convert an abbreviated count string (e.g. ``'1,234'``, ``'1.5K'``, ``'2M'``) to a number.

    Commas are stripped, the markers ``K``, ``M`` and ``mil`` are removed, and
    the remaining text is parsed with ``pd.to_numeric``. The parsed value is
    then scaled according to the marker found in the original text:
    ``K`` -> x1,000, ``mil`` -> x1,000, ``M`` -> x1,000,000. When several
    markers are present, ``K`` wins over ``mil``, which wins over ``M``.

    Parameters
    ----------
    characters : str
        Text to convert.

    Returns
    -------
    number
        The scaled numeric value, or ``0`` when ``characters`` is not a string
        or the cleaned text cannot be parsed.
    """
    try:
        without_commas = re.sub(',', '', characters)
        numeric_values = pd.to_numeric(re.sub("K|M|mil", "", without_commas))
    except (TypeError, ValueError):
        # Only bad input is treated as 0: TypeError for non-string values
        # (None, NaN, numbers), ValueError for unparseable text. Anything
        # else, such as KeyboardInterrupt, is allowed to propagate.
        return 0

    # Marker precedence is K, then mil, then M.
    if 'K' in characters:
        return numeric_values * 1000
    if 'mil' in characters:
        return numeric_values * 1000
    if 'M' in characters:
        return numeric_values * 1000000
    return numeric_values

def get_path(country, week = 'march'):
    """Build the relative data paths used for one country.

    Parameters
    ----------
    country : str
        Country folder name inserted into the paths (the notebook uses
        ``'KE'`` and ``'SA'``).
    week : str, default ``'march'``
        Sub-folder name appended to the treatment tweets path.

    Returns
    -------
    tuple of str
        ``(path_tw, base, rand, baseline, agg, agg_base)``: the treatment
        tweets folder for ``week``, the experiment root for ``country``, the
        randomization parquet file, the baseline tweets folder, the treatment
        preprocess folder and the baseline preprocess folder.
    """
    experiment_root = f'../../../data/03-experiment/{country}/'
    randomized_file = (
        f'../../../data/02-randomize/{country}'
        '/04-stratification/integrate/followers_randomized.parquet'
    )

    treatment_tweets = f'{experiment_root}treatment/followers/00-raw/tweets/{week}/'
    baseline_tweets = f'{experiment_root}baseline/00-raw/followers/tweets/'
    treatment_preprocess = f'{experiment_root}treatment/followers/01-preprocess/'
    baseline_preprocess = f'{experiment_root}baseline/01-preprocess/followers/'

    return (
        treatment_tweets,
        experiment_root,
        randomized_file,
        baseline_tweets,
        treatment_preprocess,
        baseline_preprocess,
    )


def summ_followers2(df):
    """Sum the ``eng`` and ``n_eng`` columns per ``username``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``username``, ``eng`` and ``n_eng``; any other columns
        are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per ``username`` with columns ``username``, ``eng`` and
        ``n_eng`` holding the per-user sums.
    """
    wanted = ['username', 'eng', 'n_eng']
    per_user_totals = df[wanted].groupby(['username']).sum()
    # Turn the group key back into an ordinary column.
    return per_user_totals.reset_index()

def divide_and_conquer(df_final):
    """Split rows into retweets and non-retweets and sum language counts per user for each.

    A row goes to the retweet group when its ``text`` contains the literal
    ``'RT @'`` and ``total_comments`` equals 0. A row goes to the non-retweet
    group when its ``text`` lacks that marker or ``total_comments`` is greater
    than 0. Each group is aggregated with ``summ_followers2``.

    Parameters
    ----------
    df_final : pandas.DataFrame
        Must contain ``text``, ``total_comments``, ``username``, ``eng`` and
        ``n_eng``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_rt_agg, df_no_rt_agg)``. The first has columns ``username``,
        ``eng_rt``, ``n_eng_rt``; the second has ``username``, ``eng_no_rt``,
        ``n_eng_no_rt``.
    """
    # Evaluate the marker test once. na=False makes rows with missing text
    # count as "no marker" instead of putting NaN into the mask, which can
    # make the ``~`` negation below fail.
    has_rt_marker = df_final['text'].str.contains(
        'RT @', case=True, regex=False, na=False)

    df_RT = df_final[
        has_rt_marker & (df_final['total_comments'] == 0)
    ].reset_index(drop=True)

    df_no_rt = df_final[
        ~has_rt_marker | (df_final['total_comments'] > 0)
    ].reset_index(drop=True)

    cols = ['eng', 'n_eng']

    # Suffix the count columns so both results can be merged onto one frame.
    df_rt_agg = (
        summ_followers2(df_RT)
        .reset_index(drop=True)
        .rename(columns={col: col + '_rt' for col in cols})
    )
    df_no_rt_agg = (
        summ_followers2(df_no_rt)
        .reset_index(drop=True)
        .rename(columns={col: col + '_no_rt' for col in cols})
    )

    return df_rt_agg, df_no_rt_agg

In [2]:
# Stage 1-2: per-user English / non-English counts merged onto the baseline
# frame and written to stage1_2/english_batch2.parquet for each country.
for country in ['KE', 'SA']:
    base1 = pd.read_parquet(f'../../../data/04-analysis/{country}/baseline/baseline_english_batch2.parquet')

    # NOTE(review): base_path climbs one directory level more than the other
    # relative paths in this cell — confirm that is what the helper expects.
    stage1_2 = get_data_stage12_batch2(country, base_path = '../../../../')

    # Flag each row as English (lang2 == 'en') or not.
    stage1_2['eng'] = np.where(stage1_2['lang2'] == 'en', 1, 0)
    stage1_2['n_eng'] = np.where(stage1_2['lang2'] != 'en', 1, 0)

    # Attach the baseline username to each row via author_id -> follower_id,
    # then keep only rows that matched a baseline user.
    id_to_username = base1[['follower_id', 'username']]
    stage1_2 = stage1_2.merge(id_to_username, left_on='author_id',
                              right_on='follower_id', how='left')
    stage1_2 = stage1_2.drop(['author_id', 'follower_id'], axis=1)
    stage1_2 = stage1_2[stage1_2['username'].notnull()]

    stage1_2agg = summ_followers2(stage1_2)
    retweet_counts, original_counts = divide_and_conquer(stage1_2)

    # Merge order fixes the output column order: *_rt, *_no_rt, then totals.
    for counts in (retweet_counts, original_counts, stage1_2agg):
        base1 = base1.merge(counts, on=['username'], how='left')

    # Users without rows in a group get NaN from the left merges; store 0.
    # The '_rt' substring also matches every '_no_rt' column, and any other
    # column of the baseline frame whose name contains '_rt'.
    fill_cols = [name for name in base1.columns if '_rt' in name] + ['eng', 'n_eng']
    for name in fill_cols:
        base1[name] = np.where(base1[name].isnull(), 0, base1[name])

    base1.to_parquet(f'../../../data/04-analysis/{country}/stage1_2/english_batch2.parquet',
                     index=False)
    
# Stages 3-4 and 5-6: the same per-user aggregation, restricted to each
# stage's date window and written to that stage's output folder.

# Exclusive (lower, upper) bounds applied to ``date``; each key is also the
# name of the output folder under data/04-analysis/<country>/.
# NOTE(review): both bounds are strict and the two windows meet at
# 2023-06-25 / 2023-06-26. If ``date`` carries a time component, rows on
# 2023-06-25 fall inside both windows — confirm this is intended.
STAGE_WINDOWS = {
    'stage3_4': ('2023-05-28', '2023-06-26'),
    'stage5_6': ('2023-06-25', '2023-07-23'),
}

for country in ['KE', 'SA']:
    df_f = pd.read_parquet(f'../../../data/03-experiment/{country}/treatment/followers/01-preprocess/correct_cases_final.parquet.gzip')
    # Flag each row as English (lang == 'en') or not.
    df_f['eng'] = np.where((df_f['lang'] == 'en'), 1, 0)
    df_f['n_eng'] = np.where((df_f['lang'] != 'en'), 1, 0)

    # Read once per country; every merge below returns a new frame, so this
    # one is never modified.
    baseline = pd.read_parquet(f'../../../data/04-analysis/{country}/baseline/baseline_english_batch2.parquet')

    cols = ['eng', 'n_eng']

    for stage, (after, before) in STAGE_WINDOWS.items():
        stage_tweets = df_f[(df_f['date'] < before) &
                            (df_f['date'] > after)].drop(['date', 'reposted'], axis = 1)

        # Rows with a non-null ``type`` feed the *_rt counts; rows with a
        # null ``type`` feed the *_no_rt counts.
        has_type = stage_tweets['type'].notnull()

        df_agg = summ_followers2(stage_tweets)
        df_rt_agg = summ_followers2(stage_tweets[has_type]).rename(
            columns={col: col + '_rt' for col in cols})
        df_no_rt_agg = summ_followers2(stage_tweets[~has_type]).rename(
            columns={col: col + '_no_rt' for col in cols})

        # Merge order fixes the output column order: *_rt, totals, *_no_rt.
        base1 = baseline
        for counts in (df_rt_agg, df_agg, df_no_rt_agg):
            base1 = base1.merge(counts, on=['username'], how='left')

        # Users without rows in a group get NaN from the left merges; store 0.
        # The '_rt' substring also matches every '_no_rt' column and baseline
        # columns such as ``eng_no_rt_base``.
        cols_base = [col for col in base1.columns if '_rt' in col] + cols
        for x in cols_base:
            base1[x] = np.where(base1[x].isnull(), 0, base1[x])

        base1.to_parquet(f'../../../data/04-analysis/{country}/{stage}/english_batch2.parquet',
                         index=False)

    print(country)

KE
SA


In [3]:
# Display the frame left in `base1` by the cell above: the last one built
# there, i.e. country 'SA' for stage5_6.
base1

Unnamed: 0,username,follower_id,blockid1,blockid2,ads_treatment,id,c_t_strong_total,c_t_weak_total,c_t_neither_total,t_strong,...,eng_base,n_eng_base,eng_no_rt_base,n_eng_no_rt_base,eng_rt,n_eng_rt,eng,n_eng,eng_no_rt,n_eng_no_rt
0,milestunhi,1000263301182476289,184.0,189.0,0.0,11111111,0,1,0,0.0,...,9.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,JaravazaVanessa,1001406436478513152,143.0,100.0,1.0,11111111,0,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,cshndi,1001833216402251776,207.0,241.0,1.0,11111111,0,1,0,0.0,...,62.0,2.0,1.0,2.0,7.0,1.0,7.0,2.0,0.0,1.0
3,givmangwende,1003813557073530880,202.0,232.0,1.0,11111111,0,1,0,0.0,...,39.0,11.0,39.0,11.0,0.0,0.0,13.0,6.0,13.0,6.0
4,MakoMidzi,1003945105517109249,184.0,189.0,1.0,11111111,0,1,0,0.0,...,25.0,4.0,2.0,1.0,9.0,0.0,11.0,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28636,Ke_georginah,2270946743,1.0,2.0,1.0,222222222a,1,1,0,0.0,...,226.0,64.0,46.0,34.0,13.0,4.0,17.0,7.0,4.0,3.0
28637,briantau1,251925293,1.0,1.0,1.0,222222222a,1,1,0,0.0,...,124.0,35.0,91.0,32.0,5.0,1.0,15.0,8.0,10.0,7.0
28638,Zama_B02,2678104934,1.0,2.0,0.0,222222222a,1,1,0,0.0,...,56.0,14.0,53.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0
28639,daddyhope,73672445,3.0,5.0,0.0,222222222a,1,1,0,0.0,...,615.0,27.0,534.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0
