In [1]:
import pandas as pd
import os
import glob
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
from tqdm import tqdm

def get_path(country, week = 'march'):
    base = f'../../data/03-experiment/{country}/'
    path_tw = base + f'treatment/followers/00-raw/tweets/{week}/'
    rand = f'../../data/02-randomize/{country}/04-stratification/integrate/followers_randomized.parquet'
    baseline = base + 'baseline/00-raw/followers/tweets/'
    agg = base + f'treatment/followers/01-preprocess/'
    agg_base = base + 'baseline/01-preprocess/followers/'
    return path_tw, base, rand, baseline, agg, agg_base

def fake_aggregation(df):
    df.rename(columns = {'handle': 'username', 
                         'author_id':'follower_id'}, inplace = True)

    df = df[['follower_id', 'username', 'text', 'total_comments', 
                    'verifiability', 'true']]
    
    df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
    df['fake'] = np.where(df['true'] == 1, 0, 1)
    df['fake'] = np.where(df['true'].isnull(), np.nan, df['fake'])
    
    df_RT = df[(df['text'].str.contains('RT @', case=True, regex=False)) & 
                 (df['total_comments'] == 0)].reset_index(drop=True)

    df_rt_agg = df_RT[['username', 'follower_id', 
                       'fake', 'verifiability', 'true']].groupby(['username', 
                                             'follower_id']).sum().reset_index()
    
    df_rt_count = df_RT[['username', 'follower_id']].groupby(['follower_id']).count()
    df_rt_count.rename({'username': 'n_posts'}, axis=1, inplace=True)
    
    df_rt_agg = df_rt_agg.merge(df_rt_count, on = 'follower_id', how = 'left')
    
    df_rt_agg.rename(columns = {col: col + '_rt_base' for col
                                in df_rt_agg.columns if col in ['fake', 'n_posts', 
                                                                'verifiability', 'true']}, 
                     inplace=True)
    
    df_no_rt = df[(~df['text'].str.contains('RT @', 
                    case=True, regex=False)) | 
                    (df['total_comments'] > 0)].reset_index(drop=True)
    
    df_no_rt_agg = df_no_rt[['username', 'follower_id', 
                             'fake', 'verifiability', 'true']].groupby(['username', 
                                                'follower_id']).sum().reset_index()
    
    df_no_rt_count = df_no_rt[['username', 'follower_id']].groupby(['follower_id']).count()
    df_no_rt_count.rename({'username': 'n_posts'}, axis=1, inplace=True)
    
    df_no_rt_agg = df_no_rt_agg.merge(df_no_rt_count, on = 'follower_id', how = 'left')
    
    df_no_rt_agg.rename(columns = {col: col + '_no_rt_base' for col 
                                   in df_no_rt_agg.columns if col in ['fake', 'verifiability',
                                                                      'true', 'n_posts']}, 
                        inplace=True)
    
    return df_rt_agg, df_no_rt_agg


In [2]:
country = 'KE'
path_tw, base, rand, baseline, agg, agg_base = get_path(country, 'march')

if country == 'KE':
    n = 84
else:
    n = 74
    
df_final = pd.DataFrame()
for i in tqdm(range(0, n)):
    df = pd.read_parquet(f'{agg_base}predicted/baseline_{i}.parquet.gzip')
    df_final = pd.concat([df_final, df]).reset_index(drop=True)

first_month = df_final[(df_final['created_at'] > '2023-02-10')]
second_month = df_final[(df_final['created_at'] > '2023-01-10')]

base = pd.read_parquet(f'../../data/04-analysis/{country}/baseline_features.parquet')

base = base[['follower_id', 'username']]

df_rt_agg, df_no_rt_agg = fake_aggregation(first_month)

base = base.merge(df_rt_agg, on=['follower_id', 'username'], 
                  how='left')
    
base = base.merge(df_no_rt_agg, on=['follower_id', 'username'], 
                  how='left')
    
cols_base = [col for col in base.columns if '_base' in col]

for x in cols_base:
    base[x] = np.where(base[x].isnull(), 0, base[x])
    
base.rename(columns = {col: col + '_1_month' for col 
                                   in base.columns if col in cols_base}, 
                        inplace=True)

base1 = pd.read_parquet(f'../../data/04-analysis/{country}/baseline_features.parquet')

base1 = base1[['follower_id', 'username']]

df_rt_agg, df_no_rt_agg = fake_aggregation(second_month)

base1 = base1.merge(df_rt_agg, on=['follower_id', 'username'], 
                  how='left')
    
base1 = base1.merge(df_no_rt_agg, on=['follower_id', 'username'], 
                  how='left')
    
cols_base = [col for col in base1.columns if '_base' in col]

for x in cols_base:
    base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
base1.rename(columns = {col: col + '_2_month' for col 
                                   in base1.columns if col in cols_base}, 
                        inplace=True)

base = base.merge(base1, on = ['follower_id', 'username'],
                  how = 'left')
base.to_parquet(f'../../data/04-analysis/{country}/baseline_months.parquet')

100%|██████████████████████████████████████████████████████████████████████████████████| 84/84 [13:00<00:00,  9.30s/it]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {'handle': 'username',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {'handle': 'username',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

In [10]:
base

Unnamed: 0,follower_id,username,fake_rt_base_1_month,verifiability_rt_base_1_month,true_rt_base_1_month,n_posts_rt_base_1_month,fake_no_rt_base_1_month,verifiability_no_rt_base_1_month,true_no_rt_base_1_month,n_posts_no_rt_base_1_month
0,1000382380618801154,SibuM15,0.0,1.0,1.0,11.0,0.0,0.0,0.0,1.0
1,1000462799351570432,aaron_lebea,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1000469765381472256,MusheerHasan1,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
3,1001016845976133632,KO_Tsomele,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1001138483216437248,MagalaleSebati,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
44196,730441406,RomeoMasiya,1.0,1.0,0.0,4.0,2.0,3.0,1.0,109.0
44197,765123456,Queertified,5.0,8.0,3.0,47.0,1.0,1.0,0.0,11.0
44198,788950034,Lubabalo_K,13.0,15.0,2.0,56.0,4.0,4.0,0.0,52.0
44199,803124849514336256,petrohlee,1.0,1.0,0.0,1.0,0.0,0.0,0.0,7.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {'handle': 'username',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['true'] = np.where(df['verifiability'] == 0, np.nan, df['true'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fake'] = np.where(df['true'] == 1, 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

Unnamed: 0,follower_id,username,fake_rt_base_2_month,verifiability_rt_base_2_month,true_rt_base_2_month,n_posts_rt_base_2_month,fake_no_rt_base_2_month,verifiability_no_rt_base_2_month,true_no_rt_base_2_month,n_posts_no_rt_base_2_month
0,1000382380618801154,SibuM15,0.0,1.0,1.0,17.0,0.0,0.0,0.0,1.0
1,1000462799351570432,aaron_lebea,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0
2,1000469765381472256,MusheerHasan1,0.0,7.0,7.0,8.0,0.0,2.0,2.0,6.0
3,1001016845976133632,KO_Tsomele,1.0,2.0,1.0,23.0,0.0,0.0,0.0,5.0
4,1001138483216437248,MagalaleSebati,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
44196,730441406,RomeoMasiya,5.0,5.0,0.0,13.0,9.0,11.0,2.0,272.0
44197,765123456,Queertified,8.0,12.0,4.0,75.0,1.0,1.0,0.0,14.0
44198,788950034,Lubabalo_K,23.0,33.0,10.0,102.0,6.0,6.0,0.0,94.0
44199,803124849514336256,petrohlee,1.0,1.0,0.0,3.0,0.0,0.0,0.0,11.0


Unnamed: 0,follower_id,username,fake_rt_base_1_month,verifiability_rt_base_1_month,true_rt_base_1_month,n_posts_rt_base_1_month,fake_no_rt_base_1_month,verifiability_no_rt_base_1_month,true_no_rt_base_1_month,n_posts_no_rt_base_1_month,fake_rt_base_2_month,verifiability_rt_base_2_month,true_rt_base_2_month,n_posts_rt_base_2_month,fake_no_rt_base_2_month,verifiability_no_rt_base_2_month,true_no_rt_base_2_month,n_posts_no_rt_base_2_month
0,1000382380618801154,SibuM15,0.0,1.0,1.0,11.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,17.0,0.0,0.0,0.0,1.0
1,1000462799351570432,aaron_lebea,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0
2,1000469765381472256,MusheerHasan1,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,7.0,7.0,8.0,0.0,2.0,2.0,6.0
3,1001016845976133632,KO_Tsomele,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,23.0,0.0,0.0,0.0,5.0
4,1001138483216437248,MagalaleSebati,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44196,730441406,RomeoMasiya,1.0,1.0,0.0,4.0,2.0,3.0,1.0,109.0,5.0,5.0,0.0,13.0,9.0,11.0,2.0,272.0
44197,765123456,Queertified,5.0,8.0,3.0,47.0,1.0,1.0,0.0,11.0,8.0,12.0,4.0,75.0,1.0,1.0,0.0,14.0
44198,788950034,Lubabalo_K,13.0,15.0,2.0,56.0,4.0,4.0,0.0,52.0,23.0,33.0,10.0,102.0,6.0,6.0,0.0,94.0
44199,803124849514336256,petrohlee,1.0,1.0,0.0,1.0,0.0,0.0,0.0,7.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,11.0
