In [1]:
import pandas as pd
import os
import glob
import numpy as np
import re

def convert_to_numeric(characters):
    """Parse an abbreviated count string such as ``'1,234'``, ``'3.5K'`` or ``'2M'``.

    Commas are stripped, the suffix markers ``K``, ``M`` and ``mil`` are
    removed, and the remaining text is parsed with ``pd.to_numeric``. The
    parsed value is then scaled according to the marker found in the
    original input (case-sensitive, checked in this order):

    * ``K`` or ``mil`` -> multiplied by 1,000
    * ``M``            -> multiplied by 1,000,000

    Parameters
    ----------
    characters : str
        Raw count text.

    Returns
    -------
    numeric scalar
        The parsed and scaled value, or ``0`` when the input cannot be
        parsed (for example a non-string input or non-numeric text).
    """
    try:
        numeric_values = re.sub(',', '', characters)
        numeric_values = pd.to_numeric(re.sub("K|M|mil", "", numeric_values))
        # 'K' and 'mil' share the same x1000 scale; 'M' is only considered
        # when neither of them is present.
        if 'K' in characters or 'mil' in characters:
            numeric_values *= 1000
        elif 'M' in characters:
            numeric_values *= 1000000
    except Exception:
        # Best-effort parsing: anything unparseable counts as 0. Catching
        # Exception (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate, so a long ``.apply`` can be interrupted.
        numeric_values = 0
    return numeric_values

# For each country: load the predicted follower posts, normalise column names
# and engagement counts, drop the filtered rows and write one parquet file.
for country in ['SA', 'KE']:
    all_files = glob.glob(os.path.join(f'../../../data/03-experiment/{country}/treatment/followers/01-preprocess/predicted_correct/', "*.parquet.gzip"))
    if not all_files:
        # pd.concat would raise an opaque "No objects to concatenate"
        # ValueError; fail with the same exception type but say which
        # country has no input files.
        raise ValueError(f"No '*.parquet.gzip' files found in predicted_correct/ for country {country!r}")

    # ignore_index=True already yields a fresh 0..n-1 index.
    df_f = pd.concat((pd.read_parquet(f) for f in all_files), ignore_index=True)
    df_f = df_f.drop(['ordinal', 'equal', 'dumdum'], axis = 1)

    # Note the swap: the original 'username' column becomes 'reposted' and
    # 'follower_handle' becomes the new 'username'.
    df_f = df_f.rename(columns = {'TimeStamp': 'created_at', 'retweets':'total_shares', 
                      'likes':'like_count', 'replies':'total_comments', 'false':'fake', 
                       'lang2':'lang', 'username':'reposted', 
                       'follower_handle':'username'})

    # 1 where verifiability is exactly 0 (and not missing), else 0.
    df_f['non_ver'] = np.where((df_f['verifiability'] == 0) & (~df_f['verifiability'].isnull()),
                                1, 0)

    # Engagement counts are parsed from their string form ('1,234', '3.5K',
    # ...) by convert_to_numeric; any remaining nulls are treated as 0.
    for count_col in ['total_shares', 'total_comments', 'like_count']:
        df_f[count_col] = df_f[count_col].astype(str).apply(convert_to_numeric)
        df_f[count_col] = np.where(df_f[count_col].isnull(), 0, df_f[count_col])

    df_f['total_reactions'] = df_f['like_count'] + df_f['total_shares'] + df_f['total_comments']
    df_f['total_reactions'] = np.where(df_f['total_reactions'].isnull(), 0, df_f['total_reactions'])

    # The first 10 characters of the timestamp string are parsed as the date.
    df_f['date'] = pd.to_datetime(df_f['created_at'].astype(str).str[:10])

    # filter == 1 marks rows whose 'username' differs from 'reposted' while
    # 'type' is missing; those rows are dropped below.
    df_f['filter'] = np.where((df_f['username'] != df_f['reposted']) & 
                          (df_f['type'].isnull()), 1, 0)
    df_f['id'] = df_f['username'].astype(str) + df_f['created_at'].astype(str)
    df_f = df_f[df_f['filter'] == 0].reset_index(drop = True).drop(['filter'], axis = 1)

    df_f.to_parquet(f'../../../data/03-experiment/{country}/treatment/followers/01-preprocess/correct_cases_final.parquet.gzip', 
              index = False,
              compression = 'gzip')

In [2]:
# Display the frame left by the loop above. df_f is rebound on every
# iteration, so when the cells run in order this is the last country ('KE').
df_f

Unnamed: 0,reposted,created_at,text,total_comments,total_shares,like_count,type,username,id,has_text,has_words,lang,verifiability,true,fake,non_ver,total_reactions,date
0,HonMoses_Kuria,2023-07-15 16:38:16+00:00,Atakufa Wednesday 19th azikiwe Saturday 29th. ...,6300.0,3500.0,10000.0,gideon kipngeno reposted,gideonk50866791,gideonk508667912023-07-15 16:38:16+00:00,True,True,en,1.0,0.0,1.0,0,19800.0,2023-07-15
1,TrupyWupy_,2023-06-21 06:11:59+00:00,How a titanic train runs on a pencil thin rail...,1.0,13.0,86.0,gideon kipngeno reposted,gideonk50866791,gideonk508667912023-06-21 06:11:59+00:00,True,True,en,1.0,0.0,1.0,0,100.0,2023-06-21
2,gideonk50866791,2023-02-12 05:27:49+00:00,Meru county ultimately scores there for the ra...,0.0,0.0,0.0,,gideonk50866791,gideonk508667912023-02-12 05:27:49+00:00,True,True,en,1.0,1.0,0.0,0,0.0,2023-02-12
3,LarryMadowo,2010-12-22 13:23:32+00:00,JUST IN: The MPs under investigation for drug ...,241.0,12000.0,15000.0,Edwin Okoti reposted,edokot,edokot2010-12-22 13:23:32+00:00,True,True,en,1.0,0.0,1.0,0,27241.0,2010-12-22
4,CSakwah,2024-07-24 10:40:51+00:00,I can't wait for that morning TV political deb...,91.0,2800.0,6500.0,Edwin Okoti reposted,edokot,edokot2024-07-24 10:40:51+00:00,True,True,en,1.0,1.0,0.0,0,9391.0,2024-07-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7361378,Angie_angeline7,2023-02-12 03:49:22+00:00,Good morning people,16.0,16.0,43.0,kamau wanjiku reposted,kamau_wanjiqu,kamau_wanjiqu2023-02-12 03:49:22+00:00,True,True,en,0.0,,,1,75.0,2023-02-12
7361379,brfootball,2023-02-11 17:09:48+00:00,Next up for Arsenal: Manchester City,236.0,2024.0,31300.0,kamau wanjiku reposted,kamau_wanjiqu,kamau_wanjiqu2023-02-11 17:09:48+00:00,True,True,en,0.0,,,1,33560.0,2023-02-11
7361380,kamau_wanjiqu,2023-02-11 20:03:59+00:00,Follow for immediate following back,0.0,0.0,0.0,,kamau_wanjiqu,kamau_wanjiqu2023-02-11 20:03:59+00:00,True,True,en,0.0,,,1,0.0,2023-02-11
7361381,bhadgurlrih,2023-02-10 15:20:22+00:00,"Money aside, what else do you need right now?",110.0,66.0,365.0,kamau wanjiku reposted,kamau_wanjiqu,kamau_wanjiqu2023-02-10 15:20:22+00:00,True,True,en,0.0,,,1,541.0,2023-02-10
