In [2]:
import pandas as pd
import os
import glob
import numpy as np
import time
from os import listdir
from os.path import isfile, join
from tqdm import tqdm
from langdetect import DetectorFactory, detect
DetectorFactory.seed = 0
import re

def get_path(country, week):
    """Build the data paths used by this notebook for one country and period.

    Parameters
    ----------
    country : str
        Country folder name (the notebook uses 'SA' and 'KE').
    week : str
        Name of the collection-period sub-folder (e.g. 'march', 'april').

    Returns
    -------
    tuple of str
        ``(path_tw, base, rand, baseline, baseline2, agg, agg_base)``.
        Only ``path_tw`` and ``baseline2`` depend on ``week``.
    """
    root = f'../../data/03-experiment/{country}/'
    paths = (
        f'{root}treatment/followers/00-raw/tweets/{week}/',            # path_tw
        root,                                                          # base
        f'../../data/02-randomize/{country}/04-stratification/integrate/followers_randomized.parquet',  # rand
        f'{root}baseline/00-raw/followers/tweets/',                    # baseline
        f'{root}baseline/00-raw/followers/tweets_batch2/{week}/',      # baseline2
        f'{root}treatment/followers/01-preprocess/',                   # agg
        f'{root}baseline/01-preprocess/followers/',                    # agg_base
    )
    return paths

# Column names of the flattened tweet frame (identifiers, text, language,
# references/URL entities and the four public engagement counters).
df_vars = [
    'id',
    'handle',
    'author_id',
    'created_at',
    'text',
    'lang',
    'referenced_tweets',
    'entities.urls',
    'public_metrics.like_count',
    'public_metrics.quote_count',
    'public_metrics.reply_count',
    'public_metrics.retweet_count',
]

import sys
sys.path.insert(0, '../../src/utils')
from funcs import PreprocessTweets
def compute_engagement_tw(engage):
    """Add engagement totals to a tweet frame with ``public_metrics.*`` columns.

    The frame is modified in place and also returned:

    * ``total_reactions`` - row-wise sum of every column whose name contains
      ``"public_metrics"``.
    * ``total_comments`` - ``public_metrics.reply_count`` plus
      ``public_metrics.quote_count``.
    * ``public_metrics.retweet_count`` is renamed to ``total_shares``.

    Parameters
    ----------
    engage : pandas.DataFrame
        Frame holding the ``public_metrics.*`` counters.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Collect the metric columns before any total is added to the frame.
    metric_cols = [col for col in engage.columns if "public_metrics" in col]
    comment_cols = ["public_metrics.reply_count", "public_metrics.quote_count"]

    engage["total_reactions"] = engage[metric_cols].sum(axis=1)
    engage["total_comments"] = engage[comment_cols].sum(axis=1)
    engage.rename(
        columns={"public_metrics.retweet_count": "total_shares"}, inplace=True
    )
    return engage

def lang_detect_na(tweet):
    """Detect the language of ``tweet`` with langdetect, falling back to 'NA'.

    Parameters
    ----------
    tweet : str
        Text passed to ``langdetect.detect``.

    Returns
    -------
    str
        Whatever ``detect`` returns, or the string ``'NA'`` when detection
        raises an ordinary exception.
    """
    try:
        return detect(tweet)
    except Exception:
        # Best-effort fallback for text the detector cannot handle. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, so a long-running .apply(lang_detect_na) can be stopped.
        return 'NA'

def compute_engagement_tw2(engage):
    """Add engagement totals to a tweet frame with un-prefixed metric columns.

    The frame is modified in place and also returned:

    * ``total_reactions`` - ``reply_count + quote_count + like_count +
      retweet_count``.
    * ``total_comments`` - ``reply_count + quote_count``.
    * ``retweet_count`` is renamed to ``total_shares``.

    Parameters
    ----------
    engage : pandas.DataFrame
        Frame holding the four counter columns named above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    comment_cols = ["reply_count", "quote_count"]
    reaction_cols = comment_cols + ["like_count", 'retweet_count']

    engage["total_reactions"] = engage[reaction_cols].sum(axis=1)
    engage["total_comments"] = engage[comment_cols].sum(axis=1)
    engage.rename(columns={"retweet_count": "total_shares"}, inplace=True)
    return engage

def has_items_tw(df):
    """Run ``PreprocessTweets(df).preprocess()`` and drop URL-detail columns.

    Parameters
    ----------
    df : object
        Passed unchanged to ``PreprocessTweets``; presumably a tweet
        DataFrame - confirm against ``funcs.PreprocessTweets``.

    Returns
    -------
    object
        The preprocessing result with the columns listed below removed
        (dropped in place on that result).
    """
    url_detail_columns = [
        'entities.urls', 'description', 'display_url',
        'end', 'expanded_url', 'images', 'media_key', 'start',
        'status', 'title', 'unwound_url', 'url',
    ]
    processed = PreprocessTweets(df).preprocess()
    processed.drop(columns=url_detail_columns, inplace=True)
    return processed

def has_text(string):
    """Return True when ``string`` contains at least one ASCII letter (a-z, A-Z)."""
    first_letter = re.search(r'[a-zA-Z]', string)
    return first_letter is not None


### Baseline:

19557

In [21]:
# Aggregate the raw 'april' baseline batch-2 parquet files for country 'SA'
# into intermediate files, each built from up to CHUNK_SIZE source files.
CHUNK_SIZE = 2000  # number of raw parquet files merged into one intermediate file

country = 'SA'
path_tw, base, rand, baseline, baseline2, agg, agg_base = get_path(country, 'april')

# File stems (names without '.parquet'), sorted so chunk membership is stable.
onlyfiles = sorted(
    f.replace('.parquet', '') for f in listdir(baseline2) if isfile(join(baseline2, f))
)

# Chunk starts are derived from the number of files rather than a hard-coded
# chunk count, so no trailing files are skipped and no empty chunk is processed.
for i, a in enumerate(tqdm(range(0, len(onlyfiles), CHUNK_SIZE))):
    b = a + CHUNK_SIZE
    # Read the whole chunk first and concatenate once; concatenating inside the
    # per-file loop re-copies the accumulated frame on every iteration.
    frames = [pd.read_parquet(f'{baseline2}{e}.parquet') for e in onlyfiles[a:b]]
    df_final = pd.concat(frames, ignore_index=True)
    df_final['text'] = df_final['text'].astype(str)
    df_final['has_text'] = df_final['text'].apply(has_text)
    # True when the 'text' column holds at least one whitespace-separated token.
    df_final['has_words'] = df_final['text'].str.split().apply(lambda tokens: len(tokens) > 0)
    df_final = compute_engagement_tw2(df_final)
    print(i, a, b)
    df_final.to_parquet(f'{agg_base}intermediate/baseline_batch2_{i}.parquet.gzip',
                        compression = 'gzip')

  0%|          | 0/10 [00:00<?, ?it/s]

0 0 2000


 10%|█         | 1/10 [00:56<08:32, 56.99s/it]

1 2000 4000


 20%|██        | 2/10 [01:48<07:12, 54.03s/it]

2 4000 6000


 30%|███       | 3/10 [03:31<08:53, 76.26s/it]

3 6000 8000


 40%|████      | 4/10 [05:02<08:12, 82.03s/it]

4 8000 10000


 50%|█████     | 5/10 [06:28<06:57, 83.43s/it]

5 10000 12000


 60%|██████    | 6/10 [07:48<05:29, 82.30s/it]

6 12000 14000


 70%|███████   | 7/10 [09:20<04:16, 85.59s/it]

7 14000 16000


 80%|████████  | 8/10 [10:54<02:56, 88.26s/it]

8 16000 18000


 90%|█████████ | 9/10 [12:22<01:28, 88.06s/it]

9 18000 20000


100%|██████████| 10/10 [13:24<00:00, 80.50s/it]


In [19]:
# Aggregate the raw 'april' baseline batch-2 parquet files for country 'KE'
# into intermediate files, each built from up to CHUNK_SIZE source files.
CHUNK_SIZE = 2000  # number of raw parquet files merged into one intermediate file

country = 'KE'
path_tw, base, rand, baseline, baseline2, agg, agg_base = get_path(country, 'april')

# File stems (names without '.parquet'), sorted so chunk membership is stable.
onlyfiles = sorted(
    f.replace('.parquet', '') for f in listdir(baseline2) if isfile(join(baseline2, f))
)

# Chunk starts are derived from the number of files rather than a hard-coded
# chunk count, so no trailing files are skipped and no empty chunk is processed.
for i, a in enumerate(tqdm(range(0, len(onlyfiles), CHUNK_SIZE))):
    b = a + CHUNK_SIZE
    # Read the whole chunk first and concatenate once; concatenating inside the
    # per-file loop re-copies the accumulated frame on every iteration.
    frames = [pd.read_parquet(f'{baseline2}{e}.parquet') for e in onlyfiles[a:b]]
    df_final = pd.concat(frames, ignore_index=True)
    df_final['text'] = df_final['text'].astype(str)
    df_final['has_text'] = df_final['text'].apply(has_text)
    # True when the 'text' column holds at least one whitespace-separated token.
    df_final['has_words'] = df_final['text'].str.split().apply(lambda tokens: len(tokens) > 0)
    df_final = compute_engagement_tw2(df_final)
    print(i, a, b)
    df_final.to_parquet(f'{agg_base}intermediate/baseline_batch2_{i}.parquet.gzip',
                        compression = 'gzip')

  0%|          | 0/14 [00:00<?, ?it/s]

0 0 2000


  7%|▋         | 1/14 [01:12<15:47, 72.90s/it]

1 2000 4000


 14%|█▍        | 2/14 [02:21<14:01, 70.15s/it]

2 4000 6000


 21%|██▏       | 3/14 [03:17<11:43, 63.94s/it]

3 6000 8000


 29%|██▊       | 4/14 [04:21<10:37, 63.73s/it]

4 8000 10000


 36%|███▌      | 5/14 [05:25<09:36, 64.01s/it]

5 10000 12000


 43%|████▎     | 6/14 [06:37<08:53, 66.74s/it]

6 12000 14000


 50%|█████     | 7/14 [07:53<08:08, 69.79s/it]

7 14000 16000


 57%|█████▋    | 8/14 [09:36<08:02, 80.37s/it]

8 16000 18000


 64%|██████▍   | 9/14 [11:13<07:07, 85.43s/it]

9 18000 20000


 71%|███████▏  | 10/14 [12:09<05:05, 76.44s/it]

10 20000 22000


 79%|███████▊  | 11/14 [13:03<03:28, 69.62s/it]

11 22000 24000


 86%|████████▌ | 12/14 [14:03<02:12, 66.50s/it]

12 24000 26000


 93%|█████████▎| 13/14 [14:59<01:03, 63.42s/it]

13 26000 28000


100%|██████████| 14/14 [15:26<00:00, 66.18s/it]


In [3]:
# List the raw 'march' baseline batch-2 files for country 'SA' and show how
# many were found (the cell's last expression is its displayed output).
country = 'SA'
path_tw, base, rand, baseline, baseline2, agg, agg_base = get_path(country, 'march')
onlyfiles = sorted(
    name.replace('.parquet', '')
    for name in listdir(baseline2)
    if isfile(join(baseline2, name))
)
len(onlyfiles)

0

In [4]:
# Display the treatment preprocess directory ('agg') returned by the last get_path call.
agg

'../../data/03-experiment/SA/treatment/followers/01-preprocess/'

In [24]:
# Aggregate the raw 'march' baseline batch-2 parquet files for country 'KE'
# into intermediate files, each built from up to CHUNK_SIZE source files.
CHUNK_SIZE = 2000  # number of raw parquet files merged into one intermediate file

country = 'KE'
path_tw, base, rand, baseline, baseline2, agg, agg_base = get_path(country, 'march')

# File stems (names without '.parquet'), sorted so chunk membership is stable.
onlyfiles = sorted(
    f.replace('.parquet', '') for f in listdir(baseline2) if isfile(join(baseline2, f))
)

# Chunk starts are derived from the number of files rather than a hard-coded
# chunk count, so no trailing files are skipped and no empty chunk is processed.
for i, a in enumerate(tqdm(range(0, len(onlyfiles), CHUNK_SIZE))):
    b = a + CHUNK_SIZE
    # Read the whole chunk first and concatenate once; concatenating inside the
    # per-file loop re-copies the accumulated frame on every iteration.
    frames = [pd.read_parquet(f'{baseline2}{e}.parquet') for e in onlyfiles[a:b]]
    df_final = pd.concat(frames, ignore_index=True)
    df_final['text'] = df_final['text'].astype(str)
    df_final['has_text'] = df_final['text'].apply(has_text)
    # True when the 'text' column holds at least one whitespace-separated token.
    df_final['has_words'] = df_final['text'].str.split().apply(lambda tokens: len(tokens) > 0)
    df_final = compute_engagement_tw2(df_final)
    print(i, a, b)
    df_final.to_parquet(f'{agg_base}intermediate/baseline2_batch2_{i}.parquet.gzip',
                        compression = 'gzip')

  0%|          | 0/14 [00:00<?, ?it/s]

0 0 2000


  7%|▋         | 1/14 [00:51<11:12, 51.74s/it]

1 2000 4000


 14%|█▍        | 2/14 [01:49<11:03, 55.27s/it]

2 4000 6000


 21%|██▏       | 3/14 [02:46<10:16, 56.06s/it]

3 6000 8000


 29%|██▊       | 4/14 [03:49<09:48, 58.90s/it]

4 8000 10000


 36%|███▌      | 5/14 [05:02<09:33, 63.73s/it]

5 10000 12000


 43%|████▎     | 6/14 [06:15<08:56, 67.12s/it]

6 12000 14000


 50%|█████     | 7/14 [07:30<08:07, 69.60s/it]

7 14000 16000


 57%|█████▋    | 8/14 [09:06<07:49, 78.18s/it]

8 16000 18000


 64%|██████▍   | 9/14 [10:32<06:43, 80.63s/it]

9 18000 20000


 71%|███████▏  | 10/14 [11:52<05:21, 80.27s/it]

10 20000 22000


 79%|███████▊  | 11/14 [12:54<03:43, 74.59s/it]

11 22000 24000


 86%|████████▌ | 12/14 [13:57<02:22, 71.24s/it]

12 24000 26000


 93%|█████████▎| 13/14 [15:07<01:10, 70.72s/it]

13 26000 28000


100%|██████████| 14/14 [16:14<00:00, 69.59s/it]


In [None]:
# Aggregate the raw 'march' baseline batch-2 parquet files for country 'SA'
# into intermediate files, each built from up to CHUNK_SIZE source files.
CHUNK_SIZE = 2000  # number of raw parquet files merged into one intermediate file

country = 'SA'
path_tw, base, rand, baseline, baseline2, agg, agg_base = get_path(country, 'march')

# File stems (names without '.parquet'), sorted so chunk membership is stable.
onlyfiles = sorted(
    f.replace('.parquet', '') for f in listdir(baseline2) if isfile(join(baseline2, f))
)

# Chunk starts are derived from the number of files rather than a hard-coded
# chunk count, so no trailing files are skipped and an empty input directory
# simply produces no output instead of failing on a frame without a 'text' column.
for i, a in enumerate(tqdm(range(0, len(onlyfiles), CHUNK_SIZE))):
    b = a + CHUNK_SIZE
    # Read the whole chunk first and concatenate once; concatenating inside the
    # per-file loop re-copies the accumulated frame on every iteration.
    frames = [pd.read_parquet(f'{baseline2}{e}.parquet') for e in onlyfiles[a:b]]
    df_final = pd.concat(frames, ignore_index=True)
    df_final['text'] = df_final['text'].astype(str)
    df_final['has_text'] = df_final['text'].apply(has_text)
    # True when the 'text' column holds at least one whitespace-separated token.
    df_final['has_words'] = df_final['text'].str.split().apply(lambda tokens: len(tokens) > 0)
    df_final = compute_engagement_tw2(df_final)
    print(i, a, b)
    # NOTE(review): this writes baseline data under `agg` (the treatment
    # preprocess directory); the other baseline cells write under `agg_base`.
    # Path kept as-is - confirm which directory is intended.
    df_final.to_parquet(f'{agg}intermediate/baseline2_batch2_{i}.parquet.gzip',
                        compression = 'gzip')

# Endline

In [3]:
# Aggregate the raw 'may' treatment batch-2 parquet files for country 'KE'
# into intermediate files, each built from up to CHUNK_SIZE source files.
CHUNK_SIZE = 3000  # number of raw parquet files merged into one intermediate file

country = 'KE'
# Only `agg` (which does not depend on the week argument) is used below.
path_tw, base, rand, baseline, baseline2, agg, agg_base = get_path(country, 'april')
path_tw1 = f'../../data/03-experiment/{country}/treatment/followers/00-raw/tweets_batch2/may/'

# File stems (names without '.parquet'), sorted so chunk membership is stable.
onlyfiles = sorted(
    f.replace('.parquet', '') for f in listdir(path_tw1) if isfile(join(path_tw1, f))
)

# Chunk starts are derived from the number of files rather than a hard-coded
# chunk count, so no trailing files are skipped and no empty chunk is processed.
for i, a in enumerate(tqdm(range(0, len(onlyfiles), CHUNK_SIZE))):
    b = a + CHUNK_SIZE
    # Read the whole chunk first and concatenate once; concatenating inside the
    # per-file loop re-copies the accumulated frame on every iteration.
    frames = [pd.read_parquet(f'{path_tw1}{e}.parquet') for e in onlyfiles[a:b]]
    df_final = pd.concat(frames, ignore_index=True)
    df_final['text'] = df_final['text'].astype(str)
    # Language detected from the text itself, stored in 'lang2'.
    df_final['lang2'] = df_final['text'].apply(lang_detect_na)
    df_final = compute_engagement_tw2(df_final)
    print(i, a, b)
    df_final.to_parquet(f'{agg}intermediate/may_batch2{i}.parquet.gzip',
                        compression = 'gzip')

19178

In [None]:
# Aggregate the raw 'may' treatment batch-2 parquet files for country 'SA'
# into intermediate files, each built from up to CHUNK_SIZE source files.
CHUNK_SIZE = 3000  # number of raw parquet files merged into one intermediate file

country = 'SA'
# Only `agg` (which does not depend on the week argument) is used below.
path_tw, base, rand, baseline, baseline2, agg, agg_base = get_path(country, 'april')
path_tw1 = f'../../data/03-experiment/{country}/treatment/followers/00-raw/tweets_batch2/may/'

# File stems (names without '.parquet'), sorted so chunk membership is stable.
onlyfiles = sorted(
    f.replace('.parquet', '') for f in listdir(path_tw1) if isfile(join(path_tw1, f))
)

# Chunk starts are derived from the number of files rather than a hard-coded
# chunk count, so no trailing files are skipped and no empty chunk is processed.
for i, a in enumerate(tqdm(range(0, len(onlyfiles), CHUNK_SIZE))):
    b = a + CHUNK_SIZE
    # Read the whole chunk first and concatenate once; concatenating inside the
    # per-file loop re-copies the accumulated frame on every iteration.
    frames = [pd.read_parquet(f'{path_tw1}{e}.parquet') for e in onlyfiles[a:b]]
    df_final = pd.concat(frames, ignore_index=True)
    df_final['text'] = df_final['text'].astype(str)
    # Language detected from the text itself, stored in 'lang2'.
    df_final['lang2'] = df_final['text'].apply(lang_detect_na)
    df_final = compute_engagement_tw2(df_final)
    print(i, a, b)
    df_final.to_parquet(f'{agg}intermediate/may_batch2{i}.parquet.gzip',
                        compression = 'gzip')