# In [1]:
import pandas as pd
import os
import glob
import numpy as np
import re

def convert_to_numeric(characters):
    """Parse an abbreviated count string such as ``'1,234'`` or ``'1.5K'``.

    Commas are removed, every ``K`` / ``M`` / ``mil`` marker is stripped, the
    remaining text is parsed with ``pd.to_numeric`` and the result is scaled
    according to the marker found in the original text.

    Args:
        characters: Text to parse, e.g. ``'987'``, ``'1,234'``, ``'1.5K'``,
            ``'2M'`` or ``'3mil'``.

    Returns:
        The parsed (and, if a marker is present, scaled) number, or ``0`` when
        parsing raises ``TypeError`` or ``ValueError`` (for example a
        non-string argument or non-numeric text).
    """
    try:
        without_commas = re.sub(',', '', characters)
        numeric_values = pd.to_numeric(re.sub(r"K|M|mil", "", without_commas))
    except (TypeError, ValueError):
        # Deliberate best-effort: unparseable input counts as zero. Only the
        # parsing errors are caught so that interrupts and unrelated failures
        # are no longer silently swallowed.
        return 0

    # Markers are tested in a fixed precedence order; the first one present in
    # the original text decides the scale.
    # NOTE(review): 'mil' is scaled by 1,000 (same as 'K'), presumably the
    # Spanish/Portuguese word for "thousand" -- confirm against the data.
    for marker, factor in (('K', 1000), ('mil', 1000), ('M', 1000000)):
        if marker in characters:
            return numeric_values * factor
    return numeric_values

# For each country: load every predicted follower-post parquet file, normalise
# column names and engagement counts, drop the rows flagged by the filter
# below, and write the combined table to a single gzip-compressed parquet file.
for country in ['SA', 'KE']:
    all_files = glob.glob(
        os.path.join(
            f'../../../data/03-experiment/{country}/treatment/followers/01-preprocess/predicted_correct/',
            "*.parquet.gzip",
        )
    )
    # ignore_index=True already yields a fresh 0..n-1 index for the stacked frame.
    df_f = pd.concat([pd.read_parquet(path) for path in all_files], ignore_index=True)

    # Discard helper columns and switch to the naming used downstream. The
    # rename is applied in one call, so the original 'username' becomes
    # 'reposted' while 'follower_handle' takes over the 'username' name.
    df_f = (
        df_f
        .drop(columns=['ordinal', 'equal', 'dumdum'])
        .rename(columns={
            'TimeStamp': 'created_at',
            'retweets': 'total_shares',
            'likes': 'like_count',
            'replies': 'total_comments',
            'false': 'fake',
            'lang2': 'lang',
            'username': 'reposted',
            'follower_handle': 'username',
        })
    )

    # 1 where verifiability is present and equal to 0, otherwise 0.
    df_f['non_ver'] = np.where(
        df_f['verifiability'].notnull() & (df_f['verifiability'] == 0), 1, 0
    )

    # Engagement counts are parsed from their string form (e.g. '1.2K') via
    # convert_to_numeric; anything still null afterwards is set to 0.
    for count_column in ('total_shares', 'total_comments', 'like_count'):
        parsed_counts = df_f[count_column].astype(str).apply(convert_to_numeric)
        df_f[count_column] = np.where(parsed_counts.isnull(), 0, parsed_counts)

    reaction_sum = df_f['like_count'] + df_f['total_shares'] + df_f['total_comments']
    df_f['total_reactions'] = np.where(reaction_sum.isnull(), 0, reaction_sum)

    # The date is taken from the first 10 characters of the timestamp text.
    df_f['date'] = pd.to_datetime(df_f['created_at'].astype(str).str.slice(stop=10))

    # Flag rows whose 'username' differs from 'reposted' and whose 'type' is
    # null; flagged rows are removed once 'id' has been built.
    flagged = (df_f['username'] != df_f['reposted']) & df_f['type'].isnull()
    df_f['filter'] = np.where(flagged, 1, 0)

    df_f['id'] = df_f['username'].astype(str) + df_f['created_at'].astype(str)

    df_f = (
        df_f.loc[df_f['filter'] == 0]
        .reset_index(drop=True)
        .drop(columns=['filter'])
    )

    df_f.to_parquet(
        f'../../../data/03-experiment/{country}/treatment/followers/01-preprocess/correct_cases_final.parquet.gzip',
        index=False,
        compression='gzip',
    )