In [1]:
import pandas as pd
import os
import glob
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)
from tqdm import tqdm

def get_path(country, week = 'march'):
    base = f'../../data/03-experiment/{country}/'
    path_tw = base + f'treatment/followers/00-raw/tweets/{week}/'
    rand = f'../../data/02-randomize/{country}/04-stratification/integrate/followers_randomized.parquet'
    baseline = base + 'baseline/00-raw/followers/tweets/'
    agg = base + f'treatment/followers/01-preprocess/'
    agg_base = base + 'baseline/01-preprocess/followers/'
    return path_tw, base, rand, baseline, agg, agg_base


def summ_followers2(df):
    metrics = [col for col in df.columns if 'total_' in col]
    cols = metrics + ['verifiability', 'true']

    df_agg = df[['handle', 'author_id'] + 
        cols].groupby(['handle', 'author_id']).sum().reset_index()

    df_count = df[['handle', 'author_id']].groupby(['author_id']).count()
    df_count.rename({'handle': 'n_posts'}, axis=1, inplace=True)

    df_agg = df_agg.merge(df_count, on=['author_id'], how='left')

    return df_agg

def divide_and_conquer(df_final):
    df_RT = df_final[(df_final['text'].str.contains('RT @', case=True, regex=False)) & 
                 (df_final['total_comments'] == 0)]
    df_RT = df_RT.reset_index(drop=True)

    df_no_rt = df_final[(~df_final['text'].str.contains('RT @', 
                    case=True, regex=False)) | 
                    (df_final['total_comments'] > 0)].reset_index(drop=True)
    
    df_rt_agg = summ_followers2(df_RT).reset_index(drop=True)
    df_no_rt_agg = summ_followers2(df_no_rt).reset_index(drop=True)

    cols = [col for col in df_rt_agg.columns if 'total_' in col] + ['verifiability', 'true', 'n_posts']
    df_rt_agg.rename(columns = 
        {col: col + '_rt' for col in df_rt_agg.columns if col in cols}, 
        inplace=True)

    df_no_rt_agg.rename(columns = 
        {col: col + '_no_rt' for col in df_no_rt_agg.columns if col in cols}, 
        inplace=True)

    df_rt_agg.rename(columns = {'handle': 'username', 
                       'author_id':'follower_id'}, inplace = True)
    df_no_rt_agg.rename(columns = {'handle': 'username', 
                       'author_id':'follower_id'}, inplace = True)
    
    return df_rt_agg, df_no_rt_agg


# Change Country here
country = 'KE'
base1 = pd.read_parquet(f'../../data/04-analysis/{country}/baseline_batch2.parquet')

if country == 'KE':
    n_end = 9
    n_base = 14
else:
    n_end = 7

In [2]:
### Baseline:

path_tw, base, rand, baseline, agg, agg_base = get_path(country, 'march')

df_final = pd.DataFrame()
for i in tqdm(range(0, n_base)):
    df = pd.read_parquet(f'{agg_base}predicted/baseline_batch2_0{i}.parquet.gzip')
    df_final = pd.concat([df_final, df])

df_final1 = pd.DataFrame()
for i in tqdm(range(0, n_base)):
    df1 = pd.read_parquet(f'{agg_base}predicted/baseline2_batch2_{i}.parquet.gzip')
    df_final1 = pd.concat([df_final1, df1])

df_final = pd.concat([df_final, df_final1]).reset_index(drop=True)

100%|██████████| 14/14 [00:20<00:00,  1.44s/it]
100%|██████████| 14/14 [00:25<00:00,  1.82s/it]


In [3]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18464919 entries, 0 to 18464918
Data columns (total 17 columns):
 #   Column            Dtype  
---  ------            -----  
 0   author_id         object 
 1   id                object 
 2   conversation_id   object 
 3   created_at        object 
 4   text              object 
 5   lang              object 
 6   total_shares      int32  
 7   reply_count       int32  
 8   like_count        int32  
 9   quote_count       int32  
 10  impression_count  int32  
 11  has_text          bool   
 12  has_words         bool   
 13  total_reactions   int64  
 14  total_comments    int64  
 15  verifiability     float64
 16  true              float64
dtypes: bool(2), float64(2), int32(5), int64(2), object(6)
memory usage: 1.8+ GB


In [6]:
df_final['handle'] = df_final['author_id']

df_RT = df_final[(df_final['text'].str.contains('RT @', case=True, regex=False)) & 
                 (df_final['total_comments'] == 0)]
df_RT = df_RT.reset_index(drop=True)

df_no_rt = df_final[(~df_final['text'].str.contains('RT @', 
                    case=True, regex=False)) | 
                    (df_final['total_comments'] > 0)].reset_index(drop=True)

In [5]:
len(df_RT) + len(df_no_rt) == len(df_final)

True

In [7]:
df_agg = summ_followers2(df_final).reset_index(drop=True)
df_rt_agg = summ_followers2(df_RT).reset_index(drop=True)
df_no_rt_agg = summ_followers2(df_no_rt).reset_index(drop=True)

In [11]:
cols = [col for col in df_rt_agg.columns if 'total_' in col] + ['verifiability', 'true', 'n_posts']
df_agg.rename(columns = 
        {col: col + '_base' for col in df_agg.columns if col in cols}, 
        inplace=True)

df_rt_agg.rename(columns = 
        {col: col + '_rt_base' for col in df_rt_agg.columns if col in cols}, 
        inplace=True)

df_no_rt_agg.rename(columns = 
        {col: col + '_no_rt_base' for col in df_no_rt_agg.columns if col in cols}, 
        inplace=True)

df_agg.rename(columns = {'author_id':'follower_id'}, inplace = True)
df_rt_agg.rename(columns = {'author_id':'follower_id'}, inplace = True)
df_no_rt_agg.rename(columns = {'author_id':'follower_id'}, inplace = True)

df_agg = df_agg.drop(['handle'], axis=1)
df_rt_agg = df_rt_agg.drop(['handle'], axis=1)
df_no_rt_agg = df_no_rt_agg.drop(['handle'], axis=1)

In [13]:
base1 = pd.read_parquet(f'../../data/04-analysis/{country}/baseline_batch2.parquet')

In [14]:
base1 = base1.merge(df_rt_agg, on=['follower_id'], 
                  how='left')

base1 = base1.merge(df_agg, on=['follower_id'], 
                  how='left')

base1 = base1.merge(df_no_rt_agg, on=['follower_id'], 
                  how='left')

In [16]:
cols_base = [col for col in base1.columns if '_base' in col]
for x in cols_base:
    base1[x] = np.where(base1[x].isnull(), 0, base1[x])

In [18]:
base1.to_parquet(f'../../data/04-analysis/{country}/baseline/baseline_batch2.parquet')

### Endlines:

In [19]:
base1 = pd.read_parquet(f'../../data/04-analysis/{country}/baseline/baseline_batch2.parquet')

In [20]:
path_tw, base, rand, baseline, agg, agg_base = get_path(country, 'march')

df_final = pd.DataFrame()
for i in range(0, n_end):
    df = pd.read_parquet(f'{agg}predicted/may_batch2{i}.parquet.gzip')
    df_final = pd.concat([df_final, df])

df_final = df_final.reset_index(drop=True)
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8192943 entries, 0 to 8192942
Data columns (total 15 columns):
 #   Column            Dtype  
---  ------            -----  
 0   author_id         object 
 1   id                object 
 2   conversation_id   object 
 3   created_at        object 
 4   text              object 
 5   total_shares      int32  
 6   reply_count       int32  
 7   like_count        int32  
 8   quote_count       int32  
 9   impression_count  int32  
 10  lang2             object 
 11  total_reactions   int64  
 12  total_comments    int64  
 13  verifiability     float64
 14  true              float64
dtypes: float64(2), int32(5), int64(2), object(6)
memory usage: 781.3+ MB


In [21]:
df_final['stage'] = np.where((df_final['created_at'] > '2023-04-31') & (df_final['created_at'] < '2023-05-15'), 
                             1, 2)
metrics = [col for col in df_final.columns if 'total_' in col]
cols = metrics + ['verifiability', 'true']

In [22]:
stage1 = df_final[df_final['stage'] == 1]
stage1['handle'] = stage1['author_id']

stage1_agg = summ_followers2(stage1)
stage1_agg.rename(columns = {'handle':'username', 
                             'author_id':'follower_id'}, inplace = True)
df_rt_agg, df_no_rt_agg = divide_and_conquer(stage1)

stage1_agg = stage1_agg.drop(['username'], axis=1)
df_rt_agg = df_rt_agg.drop(['username'], axis=1)
df_no_rt_agg = df_no_rt_agg.drop(['username'], axis=1)

base1 = base1.merge(df_rt_agg, on=['follower_id'], 
                  how='left')

base1 = base1.merge(df_no_rt_agg, on=['follower_id'], 
                  how='left')

base1 = base1.merge(stage1_agg, on=['follower_id'], 
                  how='left')

cols_base = ([col for col in base1.columns if '_rt' in col] + 
             [col for col in base1.columns if '_no_rt' in col] + 
             cols + ['n_posts'])

for x in cols_base:
    base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
base1.to_parquet(f'{agg}aggregated/stage1_rt_batch2.parquet', 
                index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stage1['handle'] = stage1['author_id']


In [25]:
base1 = pd.read_parquet(f'../../data/04-analysis/{country}/baseline/baseline_batch2.parquet')

In [26]:
stage2 = df_final[df_final['stage'] == 2]
stage2['handle'] = stage2['author_id']

stage2_agg = summ_followers2(stage2)
stage2_agg.rename(columns = {'handle':'username', 
                             'author_id':'follower_id'}, inplace = True)
df_rt_agg, df_no_rt_agg = divide_and_conquer(stage2)

stage2_agg = stage2_agg.drop(['username'], axis=1)
df_rt_agg = df_rt_agg.drop(['username'], axis=1)
df_no_rt_agg = df_no_rt_agg.drop(['username'], axis=1)

base1 = base1.merge(df_rt_agg, on=['follower_id'], 
                  how='left')

base1 = base1.merge(df_no_rt_agg, on=['follower_id'], 
                  how='left')

base1 = base1.merge(stage2_agg, on=['follower_id'], 
                  how='left')

cols_base = ([col for col in base1.columns if '_rt' in col] + 
             [col for col in base1.columns if '_no_rt' in col] + 
             cols + ['n_posts'])

for x in cols_base:
    base1[x] = np.where(base1[x].isnull(), 0, base1[x])
    
base1.to_parquet(f'{agg}aggregated/stage2_rt_batch2.parquet', 
                index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stage2['handle'] = stage2['author_id']
