In [1]:
import pandas as pd
import os
import glob
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
from tqdm import tqdm

def get_path(country, week = 'march'):
    base = f'../../data/03-experiment/{country}/'
    path_tw = base + f'treatment/followers/00-raw/tweets/{week}/'
    rand = f'../../data/02-randomize/{country}/04-stratification/integrate/followers_randomized.parquet'
    baseline = base + 'baseline/00-raw/followers/tweets/'
    agg = base + f'treatment/followers/01-preprocess/'
    agg_base = base + 'baseline/01-preprocess/followers/'
    return path_tw, rand, baseline, agg, agg_base

def summ_followers(df):
    metrics = [col for col in df.columns if 'total_' in col]
    cols = metrics + ['verifiability', 'true']

    df_agg = df[['handle', 'author_id'] + 
        cols].groupby(['handle', 'author_id']).sum().reset_index()

    df_mean = df[['handle', 'author_id'] + 
        cols].groupby(['handle', 'author_id']).mean().reset_index()
    df_mean.rename(columns = 
        {col: col + '_mean' for col in df_mean.columns if col in cols}, 
        inplace=True)

    df_count = df[['handle', 'author_id']].groupby(['author_id']).count()
    df_count.rename({'handle': 'n_posts'}, axis=1, inplace=True)

    df_agg = df_agg.merge(df_mean, on=['handle', 'author_id'], how='left')
    df_agg = df_agg.merge(df_count, on=['author_id'], how='left')

    return df_agg


def summ_followers2(df):
    metrics = [col for col in df.columns if 'total_' in col]
    cols = metrics + ['verifiability', 'true']

    df_agg = df[['handle', 'author_id'] + 
        cols].groupby(['handle', 'author_id']).sum().reset_index()

    df_count = df[['handle', 'author_id']].groupby(['author_id']).count()
    df_count.rename({'handle': 'n_posts'}, axis=1, inplace=True)

    df_agg = df_agg.merge(df_count, on=['author_id'], how='left')

    return df_agg

def fake_aggregation(df):
    df.rename(columns = {'handle': 'username', 
                         'author_id':'follower_id'}, inplace = True)

    df = df[['follower_id', 'username', 'text', 'total_comments', 
                    'verifiability', 'true']]
    
    df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
    df['fake'] = np.where(df['true'] == 1, 0, 1)
    df['fake'] = np.where(df['true'].isnull(), np.nan, df['fake'])
    
    df_RT = df[(df['text'].str.contains('RT @', case=True, regex=False)) & 
                 (df['total_comments'] == 0)].reset_index(drop=True)

    df_rt_agg = df_RT[['username', 'follower_id', 
                       'fake']].groupby(['username', 
                                             'follower_id']).sum().reset_index()
    
    df_rt_agg.rename(columns = {col: col + '_rt_base' for col
                                in df_rt_agg.columns if col in ['fake']}, 
                     inplace=True)
    
    df_no_rt = df[(~df['text'].str.contains('RT @', 
                    case=True, regex=False)) | 
                    (df['total_comments'] > 0)].reset_index(drop=True)
    
    df_no_rt_agg = df_no_rt[['username', 'follower_id', 
                             'fake']].groupby(['username', 
                                                'follower_id']).sum().reset_index()
    
    df_no_rt_agg.rename(columns = {col: col + '_no_rt_base' for col 
                                   in df_no_rt_agg.columns if col in ['fake']}, 
                        inplace=True)
    
    return df_rt_agg, df_no_rt_agg
 
def fake_aggregation_end(df):
    df.rename(columns = {'handle': 'username', 
                         'author_id':'follower_id'}, inplace = True)

    df = df[['follower_id', 'username', 'text', 'total_comments', 
                    'verifiability', 'true']]
    
    df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
    df['fake'] = np.where(df['true'] == 1, 0, 1)
    df['fake'] = np.where(df['true'].isnull(), np.nan, df['fake'])
    
    df_RT = df[(df['text'].str.contains('RT @', case=True, regex=False)) & 
                 (df['total_comments'] == 0)].reset_index(drop=True)

    df_rt_agg = df_RT[['username', 'follower_id', 'verifiability', 'true',
                       'fake']].groupby(['username', 
                                             'follower_id']).sum().reset_index()
    
    df_rt_agg.rename(columns = {col: col + '_rt' for col
                                in df_rt_agg.columns if col in ['fake']}, 
                     inplace=True)
    
    df_no_rt = df[(~df['text'].str.contains('RT @', 
                    case=True, regex=False)) | 
                    (df['total_comments'] > 0)].reset_index(drop=True)
    
    df_no_rt_agg = df_no_rt[['username', 'follower_id', 
                             'fake']].groupby(['username', 
                                                'follower_id']).sum().reset_index()
    
    df_no_rt_agg.rename(columns = {col: col + '_no_rt' for col 
                                   in df_no_rt_agg.columns if col in ['fake']}, 
                        inplace=True)
    
    return df_rt_agg, df_no_rt_agg

In [2]:
country = 'SA'
path_tw, rand, baseline, agg, agg_base = get_path(country, 'march')

base = pd.read_parquet(f'../../data/04-analysis/{country}/baseline_features.parquet')

base = base[['username', 'follower_id', 'ads_treatment', 
                 'strat_block1', 'strat_block2', 'c_t_strong_total', 
                 'c_t_weak_total', 'c_t_neither_total', 't_strong',
                 't_weak', 't_neither']]

df_final = pd.DataFrame()

for i in tqdm(range(0, 74)): 
    df1 = pd.read_parquet(f'{agg_base}predicted/baseline_{i}.parquet.gzip')
    df_final = pd.concat([df_final, df1]).reset_index(drop=True)

df_rt_agg, df_no_rt_agg = fake_aggregation(df_final)

base = base.merge(df_rt_agg, on=['follower_id', 'username'], 
                  how='left')
    
base = base.merge(df_no_rt_agg, on=['follower_id', 'username'], 
                  how='left')
    
cols_base = [col for col in base.columns if '_base' in col]
for x in cols_base:
    base[x] = np.where(base[x].isnull(), 0, base[x])
    
base.to_parquet(f'../../data/04-analysis/{country}/baseline/baseline_fake.parquet', 
               index = False)

country = 'KE'
path_tw, rand, baseline, agg, agg_base = get_path(country, 'march')

base = pd.read_parquet(f'../../data/04-analysis/{country}/baseline_features.parquet')

base = base[['username', 'follower_id', 'ads_treatment', 
             'strat_block1', 'strat_block2', 'c_t_strong_total', 
             'c_t_weak_total', 'c_t_neither_total', 't_strong',
             't_weak', 't_neither']]

df_final = pd.DataFrame()
for i in tqdm(range(0, 84)): 
    df1 = pd.read_parquet(f'{agg_base}predicted/baseline_{i}.parquet.gzip')
    df_final = pd.concat([df_final, df1]).reset_index(drop=True)
    
df_rt_agg, df_no_rt_agg = fake_aggregation(df_final)

base = base.merge(df_rt_agg, on=['follower_id', 'username'], 
                  how='left')
    
base = base.merge(df_no_rt_agg, on=['follower_id', 'username'], 
                  how='left')
    
cols_base = [col for col in base.columns if '_base' in col]
for x in cols_base:
    base[x] = np.where(base[x].isnull(), 0, base[x])
    
base.to_parquet(f'../../data/04-analysis/{country}/baseline/baseline_fake.parquet',
               index = False)

100%|██████████████████████████████████████████████████████████████████████████████████| 74/74 [03:00<00:00,  2.43s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

### Endline

In [22]:
country = 'KE'
if country == 'SA':
    N_ARCHS = 25
    N_ARCHS1 = 10
    N_ARCHS2 = 10
else:
    N_ARCHS = 58
    N_ARCHS1 = 21
    N_ARCHS2 = 21

In [23]:
path_tw, rand, baseline, agg, agg_base = get_path(country, 'march')

base = pd.read_parquet(f'../../data/04-analysis/{country}/baseline/baseline_fake.parquet')

df_final = pd.DataFrame()
for i in range(0, N_ARCHS):
    df = pd.read_parquet(f'{agg}predicted/march_{i}.parquet.gzip')
    df_final = pd.concat([df_final, df])

df_final = df_final.reset_index(drop=True)

df_final['stage'] = np.where((df_final['created_at'] > '2023-03-13') & (df_final['created_at'] < '2023-03-27'), 
                             1, 2)

stage1 = df_final[df_final['stage'] == 1]

df_rt_agg, df_no_rt_agg = fake_aggregation_end(stage1)

base1 = base.merge(df_rt_agg, on=['follower_id', 'username'], 
                  how='left')
    
base1 = base1.merge(df_no_rt_agg, on=['follower_id', 'username'], 
                  how='left')

cols_base = ([col for col in base1.columns if '_rt' in col] + 
             [col for col in base1.columns if '_no_rt' in col])

for x in cols_base:
    base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
base1.to_parquet(f'../../data/04-analysis/{country}/stage1/endline_fake.parquet', 
                index=False)

stage2 = df_final[df_final['stage'] == 2]

df_rt_agg, df_no_rt_agg = fake_aggregation_end(stage2)

base1 = base.merge(df_rt_agg, on=['follower_id', 'username'], 
                  how='left')
    
base1 = base1.merge(df_no_rt_agg, on=['follower_id', 'username'], 
                  how='left')

cols_base = ([col for col in base1.columns if '_rt' in col] + 
             [col for col in base1.columns if '_no_rt' in col])

for x in cols_base:
    base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
base1.to_parquet(f'../../data/04-analysis/{country}/stage2/endline_fake.parquet', 
                index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {'handle': 'username',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

In [24]:
base1

Unnamed: 0,username,follower_id,ads_treatment,strat_block1,strat_block2,c_t_strong_total,c_t_weak_total,c_t_neither_total,t_strong,t_weak,t_neither,fake_rt_base,fake_no_rt_base,fake_rt,fake_no_rt
0,FestusOntita,1000259098716368898,0.0,60911111111,142711111111,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,MNgats,1000280756856545282,1.0,95911111111,220411111111,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,RonnyKipkosgei,1000488233824866304,1.0,97411111111,223711111111,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,mwaura__,1000507757915787264,1.0,97011111111,223011111111,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,firas_khouri,1001043873483952129,1.0,76911111111,178111111111,0,1,0,0.0,0.0,0.0,7.0,2.0,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102529,WilliamOnoka,997713498,0.0,122222222222,50022222222,0,2,6,0.0,1.0,0.0,0.0,0.0,0.0,0.0
102530,MatataAmbrose,998604774,1.0,152422222222,117022222222,0,4,7,0.0,2.0,1.0,1.0,0.0,0.0,0.0
102531,Sam_Mgc,999183930,0.0,88922222222,304022222222,0,4,6,0.0,2.0,3.0,0.0,1.0,0.0,1.0
102532,kwalimwadavid,99934705,1.0,109922222222,22322222222,1,1,4,0.0,1.0,0.0,11.0,5.0,10.0,1.0


In [25]:
path_tw, rand, baseline, agg, agg_base = get_path(country, 'april')

df_final = pd.DataFrame()
for i in range(0, N_ARCHS2):
    df = pd.read_parquet(f'{agg}predicted/april1_good{i}.parquet.gzip')
    df_final = pd.concat([df_final, df])

df_final = df_final.reset_index(drop=True)
df_final['handle'] = df_final['author_id']

df_rt_agg, df_no_rt_agg = fake_aggregation_end(df_final)
df_rt_agg = df_rt_agg.drop(['username'], axis=1)
df_no_rt_agg = df_no_rt_agg.drop(['username'], axis=1)

base1 = base.merge(df_rt_agg, on=['follower_id'], 
                  how='left')
    
base1 = base1.merge(df_no_rt_agg, on=['follower_id'], 
                  how='left')

cols_base = ([col for col in base1.columns if '_rt' in col] + 
             [col for col in base1.columns if '_no_rt' in col])

for x in cols_base:
    base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
base1.to_parquet(f'../../data/04-analysis/{country}/stage3/endline_fake.parquet', 
                index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'].isnull(), np.nan, df['fake'])


In [26]:
path_tw, rand, baseline, agg, agg_base = get_path(country, 'april')

df_final = pd.DataFrame()
for i in range(0, N_ARCHS1):
    df = pd.read_parquet(f'{agg}predicted/april2_{i}.parquet.gzip')
    df_final = pd.concat([df_final, df])

df_final = df_final.reset_index(drop=True)
df_final['handle'] = df_final['author_id']

df_rt_agg, df_no_rt_agg = fake_aggregation_end(df_final)
df_rt_agg = df_rt_agg.drop(['username'], axis=1)
df_no_rt_agg = df_no_rt_agg.drop(['username'], axis=1)

base1 = base.merge(df_rt_agg, on=['follower_id'], 
                  how='left')
    
base1 = base1.merge(df_no_rt_agg, on=['follower_id'], 
                  how='left')

cols_base = ([col for col in base1.columns if '_rt' in col] + 
             [col for col in base1.columns if '_no_rt' in col])

for x in cols_base:
    base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
base1.to_parquet(f'../../data/04-analysis/{country}/stage4/endline_fake.parquet', 
                index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'].isnull(), np.nan, df['fake'])


In [27]:
path_tw, rand, baseline, agg, agg_base = get_path(country, 'april')

df_final = pd.DataFrame()
for i in range(0, N_ARCHS2):
    df = pd.read_parquet(f'{agg}predicted/posttreat_{i}.parquet.gzip')
    df_final = pd.concat([df_final, df])

df_final = df_final.reset_index(drop=True)
df_final['handle'] = df_final['author_id']

df_rt_agg, df_no_rt_agg = fake_aggregation_end(df_final)
df_rt_agg = df_rt_agg.drop(['username'], axis=1)
df_no_rt_agg = df_no_rt_agg.drop(['username'], axis=1)

base1 = base.merge(df_rt_agg, on=['follower_id'], 
                  how='left')
    
base1 = base1.merge(df_no_rt_agg, on=['follower_id'], 
                  how='left')

cols_base = ([col for col in base1.columns if '_rt' in col] + 
             [col for col in base1.columns if '_no_rt' in col])

for x in cols_base:
    base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
base1.to_parquet(f'../../data/04-analysis/{country}/stage5/endline_fake.parquet', 
                index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'].isnull(), np.nan, df['fake'])


In [28]:
path_tw, rand, baseline, agg, agg_base = get_path(country, 'april')

df_final = pd.DataFrame()
for i in range(0, N_ARCHS2):
    df = pd.read_parquet(f'{agg}predicted/posttreat2_{i}.parquet.gzip')
    df_final = pd.concat([df_final, df])

df_final = df_final.reset_index(drop=True)
df_final['handle'] = df_final['author_id']

df_rt_agg, df_no_rt_agg = fake_aggregation_end(df_final)
df_rt_agg = df_rt_agg.drop(['username'], axis=1)
df_no_rt_agg = df_no_rt_agg.drop(['username'], axis=1)

base1 = base.merge(df_rt_agg, on=['follower_id'], 
                  how='left')
    
base1 = base1.merge(df_no_rt_agg, on=['follower_id'], 
                  how='left')

cols_base = ([col for col in base1.columns if '_rt' in col] + 
             [col for col in base1.columns if '_no_rt' in col])

for x in cols_base:
    base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
base1.to_parquet(f'../../data/04-analysis/{country}/stage6/endline_fake.parquet', 
                index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'].isnull(), np.nan, df['fake'])


## Batch 2

In [32]:
# Change Country here
country = 'KE'

if country == 'KE':
    n_end = 9
    n_base = 14
else:
    n_end = 7

In [34]:
#KE
path_tw, rand, baseline, agg, agg_base = get_path(country, 'march')
base = pd.read_parquet(f'../../data/04-analysis/{country}/baseline_batch2.parquet')

df_final = pd.DataFrame()
for i in tqdm(range(0, n_base)):
    df = pd.read_parquet(f'{agg_base}predicted/baseline_batch2_0{i}.parquet.gzip')
    df_final = pd.concat([df_final, df])

df_final1 = pd.DataFrame()
for i in tqdm(range(0, n_base)):
    df1 = pd.read_parquet(f'{agg_base}predicted/baseline2_batch2_{i}.parquet.gzip')
    df_final1 = pd.concat([df_final1, df1])

df_final = pd.concat([df_final, df_final1]).reset_index(drop=True)
df_final['handle'] = df_final['author_id']

df_rt_agg, df_no_rt_agg = fake_aggregation(df_final)

df_rt_agg = df_rt_agg.drop(['username'], axis=1)
df_no_rt_agg = df_no_rt_agg.drop(['username'], axis=1)

base = base.merge(df_rt_agg, on=['follower_id'], 
                  how='left')
    
base = base.merge(df_no_rt_agg, on=['follower_id'], 
                  how='left')
    
cols_base = [col for col in base.columns if '_base' in col]
for x in cols_base:
    base[x] = np.where(base[x].isnull(), 0, base[x])
    
base.to_parquet(f'../../data/04-analysis/{country}/baseline/baseline_fake_batch2.parquet', 
               index = False)

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:24<00:00,  1.76s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:25<00:00,  1.81s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

In [35]:
# Change Country here
country = 'SA'

if country == 'KE':
    n_end = 9
    n_base = 14
else:
    n_end = 7

In [37]:
# SA
path_tw, rand, baseline, agg, agg_base = get_path(country, 'march')
base = pd.read_parquet(f'../../data/04-analysis/{country}/baseline_batch2.parquet')

df_final = pd.DataFrame()
for i in tqdm(range(0, 10)):
    df = pd.read_parquet(f'{agg_base}predicted/baseline_batch2_{i}.parquet.gzip')
    df_final = pd.concat([df_final, df])

df_final = df_final.reset_index(drop=True)

df_final = pd.concat([df_final, df_final1]).reset_index(drop=True)
df_final['handle'] = df_final['author_id']

df_rt_agg, df_no_rt_agg = fake_aggregation(df_final)

df_rt_agg = df_rt_agg.drop(['username'], axis=1)
df_no_rt_agg = df_no_rt_agg.drop(['username'], axis=1)

base = base.merge(df_rt_agg, on=['follower_id'], 
                  how='left')
    
base = base.merge(df_no_rt_agg, on=['follower_id'], 
                  how='left')
    
cols_base = [col for col in base.columns if '_base' in col]
for x in cols_base:
    base[x] = np.where(base[x].isnull(), 0, base[x])
    
base.to_parquet(f'../../data/04-analysis/{country}/baseline/baseline_fake_batch2.parquet', 
               index = False)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.17s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

In [42]:
country = 'KE'
if country == 'KE':
    n_end = 9
    n_base = 14
else:
    n_end = 7
path_tw, rand, baseline, agg, agg_base = get_path(country, 'march')    
df_final = pd.DataFrame()
base = pd.read_parquet(f'../../data/04-analysis/{country}/baseline/baseline_fake_batch2.parquet')

for i in range(0, n_end):
    df = pd.read_parquet(f'{agg}predicted/may_batch2{i}.parquet.gzip')
    df_final = pd.concat([df_final, df])

df_final = df_final.reset_index(drop=True)
df_final['handle'] = df_final['author_id']

df_final['stage'] = np.where((df_final['created_at'] > '2023-04-31') & 
                             (df_final['created_at'] < '2023-05-15'), 
                             1, 2)

stage1 = df_final[df_final['stage'] == 1]

df_rt_agg, df_no_rt_agg = fake_aggregation_end(stage1)

df_rt_agg = df_rt_agg.drop(['username'], axis=1)
df_no_rt_agg = df_no_rt_agg.drop(['username'], axis=1)

base1 = base.merge(df_rt_agg, on=['follower_id'], 
                  how='left')

base1 = base1.merge(df_no_rt_agg, on=['follower_id'], 
                  how='left')

cols_base = ([col for col in base1.columns if '_rt' in col] + 
             [col for col in base1.columns if '_no_rt' in col])

for x in cols_base:
    base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
base1.to_parquet(f'../../data/04-analysis/{country}/stage1/endline_fake_batch2.parquet', 
               index = False)

stage2 = df_final[df_final['stage'] == 1]

df_rt_agg, df_no_rt_agg = fake_aggregation_end(stage2)

df_rt_agg = df_rt_agg.drop(['username'], axis=1)
df_no_rt_agg = df_no_rt_agg.drop(['username'], axis=1)

base1 = base.merge(df_rt_agg, on=['follower_id'], 
                  how='left')

base1 = base1.merge(df_no_rt_agg, on=['follower_id'], 
                  how='left')

cols_base = ([col for col in base1.columns if '_rt' in col] + 
             [col for col in base1.columns if '_no_rt' in col])

for x in cols_base:
    base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
base1.to_parquet(f'../../data/04-analysis/{country}/stage2/endline_fake_batch2.parquet', 
               index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {'handle': 'username',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

### Pilot:

In [2]:
#country = 'SA'
country = 'KE'

if country == 'KE':
    n_base1 = 95
    n_base2 = 37
else:
    n_base1 = 156
    n_base2 = 26
    
agg_base = f'../../../social-media-influencers-africa/data/03-experiment/{country}/baseline/01-preprocess/followers/'

df_base1 = pd.DataFrame()
for i in tqdm(range(0, n_base1)):
    df = pd.read_parquet(f'{agg_base}features/baseline_{i}.parquet.gzip')
    df_base1 = pd.concat([df_base1, df])

df_base2 = pd.DataFrame()
for i in tqdm(range(0, n_base2)):
    df = pd.read_parquet(f'{agg_base}features_abs/baseline_{i}.parquet.gzip')
    df_base2 = pd.concat([df_base2, df])
    
df_base = pd.concat([df_base1, df_base2]).reset_index(drop= True)

df_base1 = pd.DataFrame()
df_base2 = pd.DataFrame()

df_rt_agg, df_no_rt_agg = fake_aggregation(df_base)

base = pd.read_parquet(f'../../../social-media-influencers-africa/data/04-analysis/{country}/baseline_features_winsor.parquet')

base = base[['username', 'follower_id', 'ads_treatment', 
             'strat_block1', 'strat_block2', 'c_t_strong_total', 
             'c_t_weak_total', 'c_t_neither_total', 't_strong',
             't_weak', 't_neither']]

base = base.merge(df_rt_agg, on=['follower_id', 'username'], 
                  how='left')

base = base.merge(df_no_rt_agg, on=['follower_id', 'username'], 
                  how='left')

cols_base = [col for col in base.columns if '_base' in col]
for x in cols_base:
    base[x] = np.where(base[x].isnull(), 0, base[x])
    
base.to_parquet(f'../../data/04-analysis/{country}/baseline/baseline_fake_pilot.parquet')

100%|██████████████████████████████████████████████████████████████████████████████████| 95/95 [03:09<00:00,  1.99s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [00:17<00:00,  2.17it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

In [8]:
country = 'SA'
#country = 'KE'

In [9]:
base1 = f'../../../social-media-influencers-africa/data/03-experiment/{country}/'
agg = base1 + 'treatment/followers/01-preprocess/'

## Import baseline:
base = pd.read_parquet(f'../../data/04-analysis/{country}/baseline/baseline_fake_pilot.parquet')

# Import endline data
if country == 'KE':
    stage = pd.concat([pd.read_parquet(f'{agg}endline/december0.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/december0_abs.parquet.gzip')]).reset_index(drop = True)
    
else: 
    stage = pd.concat([pd.read_parquet(f'{agg}endline/december0_1.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/december0_2.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/december0_abs.parquet.gzip')]).reset_index(drop = True)
    

df_rt_agg, df_no_rt_agg = fake_aggregation_end(stage)

base = base.merge(df_rt_agg, on=['follower_id', 'username'], 
                  how='left')

base = base.merge(df_no_rt_agg, on=['follower_id', 'username'], 
                  how='left')


cols_base = ([col for col in base.columns if '_rt' in col] + 
             [col for col in base.columns if '_no_rt' in col])

for x in cols_base:
    base[x] = np.where(base[x].isnull(), 0, base[x])
    
base.to_parquet(f'../../data/04-analysis/{country}/stage1/endline_fake_pilot.parquet', 
                index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'].isnull(), np.nan, df['fake'])


In [10]:
base1 = f'../../../social-media-influencers-africa/data/03-experiment/{country}/'
agg = base1 + 'treatment/followers/01-preprocess/'

## Import baseline:
base = pd.read_parquet(f'../../data/04-analysis/{country}/baseline/baseline_fake_pilot.parquet')

# Import endline data
if country == 'KE':
    stage = pd.concat([pd.read_parquet(f'{agg}endline/december1.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/december1_abs.parquet.gzip')]).reset_index(drop = True)
    
else: 
    stage = pd.concat([pd.read_parquet(f'{agg}endline/december1_1.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/december1_2.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/december1_3.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/december1_4.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/december1_abs.parquet.gzip')]).reset_index(drop = True)
    
df_rt_agg, df_no_rt_agg = fake_aggregation_end(stage)

base = base.merge(df_rt_agg, on=['follower_id', 'username'], 
                  how='left')

base = base.merge(df_no_rt_agg, on=['follower_id', 'username'], 
                  how='left')

cols_base = ([col for col in base.columns if '_rt' in col] + 
             [col for col in base.columns if '_no_rt' in col])

for x in cols_base:
    base[x] = np.where(base[x].isnull(), 0, base[x])
    
base.to_parquet(f'../../data/04-analysis/{country}/stage2/endline_fake_pilot.parquet', 
                index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'].isnull(), np.nan, df['fake'])


In [11]:
base1 = f'../../../social-media-influencers-africa/data/03-experiment/{country}/'
agg = base1 + 'treatment/followers/01-preprocess/'

## Import baseline:
base = pd.read_parquet(f'../../data/04-analysis/{country}/baseline/baseline_fake_pilot.parquet')

# Import endline data
if country == 'KE':
    stage = pd.concat([pd.read_parquet(f'{agg}endline/january0.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/january0_abs.parquet.gzip')]).reset_index(drop = True)
    
else: 
    stage = pd.concat([pd.read_parquet(f'{agg}endline/january0_1.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/january0_2.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/january0_abs.parquet.gzip')]).reset_index(drop = True)
    
df_rt_agg, df_no_rt_agg = fake_aggregation_end(stage)

base = base.merge(df_rt_agg, on=['follower_id', 'username'], 
                  how='left')

base = base.merge(df_no_rt_agg, on=['follower_id', 'username'], 
                  how='left')

cols_base = ([col for col in base.columns if '_rt' in col] + 
             [col for col in base.columns if '_no_rt' in col])

for x in cols_base:
    base[x] = np.where(base[x].isnull(), 0, base[x])
    
base.to_parquet(f'../../data/04-analysis/{country}/stage3/endline_fake_pilot.parquet', 
                index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'].isnull(), np.nan, df['fake'])


In [12]:
base1 = f'../../../social-media-influencers-africa/data/03-experiment/{country}/'
agg = base1 + 'treatment/followers/01-preprocess/'

## Import baseline:
base = pd.read_parquet(f'../../data/04-analysis/{country}/baseline/baseline_fake_pilot.parquet')

# Import endline data
if country == 'KE':
    stage = pd.concat([pd.read_parquet(f'{agg}endline/january1.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/january1_abs.parquet.gzip')]).reset_index(drop = True)
    
else: 
    stage = pd.concat([pd.read_parquet(f'{agg}endline/january1_1.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/january1_2.parquet.gzip'),
                       pd.read_parquet(f'{agg}endline/january1_abs.parquet.gzip')]).reset_index(drop = True)
    
df_rt_agg, df_no_rt_agg = fake_aggregation_end(stage)

base = base.merge(df_rt_agg, on=['follower_id', 'username'], 
                  how='left')

base = base.merge(df_no_rt_agg, on=['follower_id', 'username'], 
                  how='left')

cols_base = ([col for col in base.columns if '_rt' in col] + 
             [col for col in base.columns if '_no_rt' in col])

for x in cols_base:
    base[x] = np.where(base[x].isnull(), 0, base[x])
    
base.to_parquet(f'../../data/04-analysis/{country}/stage4/endline_fake_pilot.parquet', 
                index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'].isnull(), np.nan, df['fake'])


In [14]:
base

Unnamed: 0,username,follower_id,ads_treatment,strat_block1,strat_block2,c_t_strong_total,c_t_weak_total,c_t_neither_total,t_strong,t_weak,t_neither,fake_rt_base,fake_no_rt_base,fake_rt,fake_no_rt
0,lethoria,1270082106719379457,0.0,9314114123,25214114123,0,1,0,0.0,1.0,0.0,9.0,0.0,0.0,0.0
1,NyachakiDaniel1,952039980788060161,0.0,3514114123,12414114123,0,1,0,0.0,1.0,0.0,4.0,0.0,0.0,0.0
2,PsyRuzHybrid,847242128,0.0,1341141144,1041141144,1,0,0,1.0,0.0,0.0,0.0,2.0,0.0,1.0
3,ryangosha,128179177,1.0,17614114144,17214114144,0,1,0,0.0,1.0,0.0,30.0,67.0,0.0,0.0
4,MoMoremi,352603364,0.0,14114143,14114143,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99376,Armand59406087,1159527757866512384,0.0,210311111121,245711111121,0,0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0
99377,NnaPalesaEntle,1200638749744521217,0.0,12433433423,50633433423,0,0,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0
99378,Fezie_Ngidi,1402834642626039808,1.0,17044344314,16044344314,0,0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0
99379,PrinsEverol,1458730723003744257,1.0,26222222212,206022222212,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
