In [1]:
import pyarrow
import pandas as pd
import os
import glob
import yaml
import time
import numpy as np

from os import listdir
from os.path import isfile, join

import sys
sys.path.insert(0, '../../src/utils')
from funcs import *
from tqdm import tqdm
sys.path.insert(0, '../../src/utils')
from general import *

def unique_replies(tweets, df, path_save):
    """Get unique users replying to Tweets
    shared by influencers
    """
    col_a = 'replier_user_id'
    col_b = 'tweet_replied'
    col_c = 'influencer_id'

    df = df.drop_duplicates(['author_id', 'id', 'in_reply_to_user_id'])

    df = df.rename(
        {'author_id': col_a, 'id': col_b, 'in_reply_to_user_id': col_c},
        axis=1
    )
    df = df.reset_index(drop=True)
    df = df[[col_a, col_b, col_c]]
    df.to_parquet(f'{path_save}replies_batch2.parquet')

    return df

def unique_likes(tweets, df, path_save):
    """Get unique users `liking users` of Tweets
    shared by influencers
    """
    col_a = 'liking_user_username'
    col_b = 'tweet_liked'
    col_c = 'influencer_id'
    col_d = 'liking_user_id'
    df = df.drop_duplicates(['id', col_b])
    df = df.rename(
        {'id': col_d, 'username': col_a},
        axis=1
    )
    df = df.reset_index(drop=True)
    tweets = tweets[['author_id', 'id']]
    tweets = tweets.rename(
        {'author_id': col_c, 'id': col_b},
        axis=1
    )
    df = df.merge(tweets, on=col_b, how='left')
    df = df[
        [col_d, col_a, col_b, col_c]
    ]
    #df.to_parquet(f'{path_save}likes_batch2.parquet')

    return df

def scrape_followers(participants_tw, country, path_save):
    b_path = f'../../data/01-characterize/followers/{country}/00-raw/'
    file = 'integrate/followers_batch2.parquet.gzip'
    a_followers = pd.read_parquet(f'{b_path}{file}')
    a_followers = a_followers.rename(
        {'id': 'follower_id', 'author_id_following': 'influencer_id'},
        axis=1
    )
    participants_tw = participants_tw.rename(
        {'author_id': 'influencer_id'},
        axis=1
    )
    followers = a_followers[
        a_followers.influencer_id.isin(participants_tw.influencer_id.unique())
    ]
    followers = followers.reset_index(drop=True)
    followers = followers[['username','follower_id', 'influencer_id']]
    followers.to_parquet(f'{path_save}followers_batch2.parquet')
    
    return followers

def get_path(country):
    base = f'../../data/03-experiment/{country}/baseline/'
    path_tw = base + '00-raw/influencers/tweets/'
    path_fb = base + '00-raw/influencers/posts/'
    path_likes = f'{base}00-raw/influencers/likes/'
    path_followers = f'{base}00-raw/influencers/followers/'
    path_converstations = f'{base}00-raw/influencers/conversations/'
    path_retweets = f'{base}00-raw/influencers/retweets/'
    path_save =  f'{base}01-preprocess/influencers/'
    return path_tw, path_fb, base, path_likes, path_followers, path_converstations, path_retweets, path_save

country = 'KE'
path_tw, path_fb, base, path_likes, path_followers, path_converstations, path_retweets, path_save = get_path(country)

tweets = pd.read_parquet(f"{base}01-preprocess/influencers/tweets_batch2.parquet")

In [2]:
# Replies
df1 = tweets.drop_duplicates(subset='id').reset_index(drop=True)

df1 = df1[df1.type != 'reply'] 
df1 = df1[df1['public_metrics.reply_count'] > 0]
df1 = df1.sort_values(by='public_metrics.reply_count', ascending=False)

conversation_ids = list(df1['id'].unique())

onlyfiles = [f for f in listdir(path_converstations) if isfile(join(path_converstations, f))]
onlyfiles = [f.replace('.parquet', '') for f in onlyfiles]

ids_final = list(set(conversation_ids).intersection(onlyfiles))
len(ids_final)

9854

In [3]:
df = pd.DataFrame()
for ids in ids_final:
    df_int = pd.read_parquet(f'{path_converstations}{ids}.parquet')
    df = pd.concat([df, df_int]).reset_index(drop=True)
    
unique_replies(tweets, df, path_save)

Unnamed: 0,replier_user_id,tweet_replied,influencer_id
0,82643815,1601561706081419264,1415647994
1,1415647994,1601480628465446912,82643815
2,1551299223630581760,1618173387221921792,723119586564919297
3,1521819285131251712,1617858750546837505,723119586564919297
4,884086378128183296,1581258609522257920,3198730404
...,...,...,...
87444,1450448144655949828,1561277371994591232,1391988889955545089
87445,1515702085253279749,1561277017055793152,1391988889955545089
87446,1525879841102385152,1561276773349957633,1525879841102385152
87447,1525879841102385152,1561276711223926786,1391988889955545089


In [2]:
df1 = tweets.drop_duplicates(subset='id').reset_index(drop=True)

df1 = df1[df1['public_metrics.like_count'] > 0]
df1 = df1.reset_index(drop=True)
ids_tweets = df1.sort_values(
    ['public_metrics.like_count'], 
    ascending=False)['id']
ids_tweets = list(ids_tweets)

onlyfiles = [f for f in listdir(path_likes) if isfile(join(path_likes, f))]
onlyfiles = [f.replace('.parquet', '') for f in onlyfiles]

ids_final = list(set(ids_tweets).intersection(onlyfiles))
len(ids_final)

50276

In [None]:
for i in range(0, 51):
    df = pd.DataFrame()
    a = 1000*i
    b = 1000*(i+1)
    for ids in tqdm(ids_final[a:b]):
        df_int = pd.read_parquet(f'{path_likes}{ids}.parquet')
        df = pd.concat([df, df_int]).reset_index(drop=True)
    likes = unique_likes(tweets, df, path_save)
    likes.to_parquet(f'{path_save}intermediate/likes_batch2{i}.parquet')


In [5]:
df_final = pd.DataFrame()
for i in range(0,51):
    df_int = pd.read_parquet(f'{path_save}intermediate/likes_batch2{i}.parquet')
    df_final = pd.concat([df_final, df_int]).reset_index(drop=True)
df_final.to_parquet(f'{path_save}likes_batch2.parquet')

In [8]:
participants_tw = pd.read_parquet(f'../../data/01-characterize/influencers/{country}/confirmed_influencers_batch2.parquet')
participants_tw['author_id'] = participants_tw['id'].astype(str)
participants_tw = participants_tw.drop(['id'], axis = 1)
usernames_tw = list(participants_tw['handle'])
ids_tw = list(participants_tw['author_id'].astype(str))

In [9]:
# Followers:

follow = scrape_followers(participants_tw, country, path_save)

In [98]:
glob.glob(path_converstations + "/*.parquet")[8160] #posiciones: 709, 8160 
# archivos: 1604709183168839680.parquet, 1605907529652903936.parquet, 1605960786920312832.parquet

'../../data/03-experiment/SA/baseline/00-raw/influencers/conversations/1605960786920312832.parquet'

In [3]:
replies = unique_replies(tweets, path_converstations, path_save)

In [26]:
participants_tw

Unnamed: 0,handle,author_id
0,PhumiekaLiyo,1495782642
1,KAmuses,238478300
2,saaleha,13714272
3,SISONKE_MD,453721930
4,MlamuliSA,1267959115
5,maseogane,184017607
6,innomagagulajr,1006061273417699328
7,Malatjie_,32943963
8,C_liveDj,16517479
9,TSJ_Letlapa,1169197110081851392


In [4]:
# Likes
likes = unique_likes(tweets, path_likes, path_save)

NameError: name 'likes' is not defined

In [None]:
# Conversations
replies = read_files(path_converstations)
replies.to_parquet(f'{path_save}replies.parquet')