In [1]:
import tweetple

from tweetple import TweetPle
import pyarrow
import pandas as pd
import os
import glob
import yaml
import time
import numpy as np

import sys
sys.path.insert(0, '../../src/utils')
from funcs import *
from tqdm import tqdm
sys.path.insert(0, '../../src/utils')
from general import *

def unique_replies(tweets, path_read, path_save):
    """Get unique users replying to Tweets shared by influencers.

    Loads the conversation files under ``path_read`` through ``read_files``,
    keeps one row per (author_id, id, in_reply_to_user_id) combination and
    exports those three columns under new names.

    Parameters
    ----------
    tweets :
        Not used by this function; kept so the signature matches
        ``unique_likes``.
    path_read :
        Location handed to ``read_files``.
    path_save : str
        Output prefix. The file name is appended directly, so it is
        expected to end with a path separator.

    Returns
    -------
    DataFrame with columns ``replier_user_id``, ``tweet_replied`` and
    ``influencer_id`` (renamed from ``author_id``, ``id`` and
    ``in_reply_to_user_id``). The same frame is written to
    ``{path_save}replies.parquet``.
    """
    # Source column -> exported column; dict order drives both the
    # de-duplication key and the final column order.
    renames = {
        'author_id': 'replier_user_id',
        'id': 'tweet_replied',
        'in_reply_to_user_id': 'influencer_id',
    }
    replies = (
        read_files(path_read)
        .drop_duplicates(list(renames.keys()))
        .rename(columns=renames)
        .reset_index(drop=True)
    )
    replies = replies[list(renames.values())]
    replies.to_parquet(f'{path_save}replies.parquet')

    return replies

def unique_likes(tweets, path_read, path_save):
    """Get unique `liking users` of Tweets shared by influencers.

    Loads the likes files under ``path_read`` through ``read_files``, keeps
    one row per (liking user, liked tweet) pair, attaches the author of each
    liked tweet as ``influencer_id`` and writes the result to
    ``{path_save}likes.parquet``.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Needs the columns ``author_id`` and ``id``; used as a
        tweet id -> author lookup.
    path_read :
        Location handed to ``read_files``. The loaded frame must provide the
        columns ``id``, ``username`` and ``tweet_liked``.
    path_save : str
        Output prefix. The file name is appended directly, so it is
        expected to end with a path separator.

    Returns
    -------
    DataFrame with columns ``liking_user_id``, ``liking_user_username``,
    ``tweet_liked`` and ``influencer_id``. Likes whose tweet is missing from
    ``tweets`` are kept with a null ``influencer_id`` (left merge).
    """
    col_a = 'liking_user_username'
    col_b = 'tweet_liked'
    col_c = 'influencer_id'
    col_d = 'liking_user_id'

    df = read_files(path_read)
    df = df.drop_duplicates(['id', col_b])
    df = df.rename(
        {'id': col_d, 'username': col_a},
        axis=1
    )
    df = df.reset_index(drop=True)

    tweets = tweets[['author_id', 'id']]
    tweets = tweets.rename(
        {'author_id': col_c, 'id': col_b},
        axis=1
    )
    # The lookup must hold one row per tweet: a tweet id repeated in `tweets`
    # would otherwise fan out every like of that tweet in the left merge and
    # undo the de-duplication done above.
    tweets = tweets.drop_duplicates(subset=col_b)

    df = df.merge(tweets, on=col_b, how='left')
    df = df[
        [col_d, col_a, col_b, col_c]
    ]
    df.to_parquet(f'{path_save}likes.parquet')

    return df

def scrape_followers(participants_tw, country, path_save):
    """Select the followers of the given influencers from the integrated file.

    Despite the name, nothing is scraped here: the function reads the
    followers parquet file stored for ``country``, keeps the rows whose
    followed account is one of the influencers in ``participants_tw`` and
    writes the selection to ``{path_save}followers.parquet``.

    Parameters
    ----------
    participants_tw : pandas.DataFrame
        Influencer table; its ``author_id`` column is treated as the
        influencer id.
    country : str
        Value interpolated into the path of the source followers file.
    path_save : str
        Output prefix. The file name is appended directly, so it is
        expected to end with a path separator.

    Returns
    -------
    DataFrame with columns ``username``, ``follower_id`` and
    ``influencer_id`` and a fresh 0..n-1 index.
    """
    source = (
        f'../../data/01-characterize/followers/{country}/00-raw/'
        'integrate/followers.parquet.gzip'
    )
    all_followers = pd.read_parquet(source).rename(
        columns={'id': 'follower_id', 'author_id_following': 'influencer_id'}
    )

    # Influencer ids of interest, taken from the participants table.
    influencer_ids = participants_tw.rename(
        columns={'author_id': 'influencer_id'}
    )['influencer_id'].unique()

    is_followed = all_followers['influencer_id'].isin(influencer_ids)
    followers = all_followers[is_followed].reset_index(drop=True)
    followers = followers[['username', 'follower_id', 'influencer_id']]
    followers.to_parquet(f'{path_save}followers.parquet')

    return followers

def get_path(country):
    """Build the baseline-experiment directory paths for one country.

    Parameters
    ----------
    country : str
        Value interpolated into the data path (the notebook uses ``'SA'``).

    Returns
    -------
    tuple of str
        ``(path_tw, path_fb, base, path_likes, path_followers,
        path_conversations, path_retweets, path_save)``. Every entry ends
        with a trailing slash.
    """
    base = f'../../data/03-experiment/{country}/baseline/'
    raw_influencers = f'{base}00-raw/influencers/'

    return (
        f'{raw_influencers}tweets/',
        f'{raw_influencers}posts/',
        base,
        f'{raw_influencers}likes/',
        f'{raw_influencers}followers/',
        f'{raw_influencers}conversations/',
        f'{raw_influencers}retweets/',
        f'{base}01-preprocess/influencers/',
    )

# Country under analysis and the directory layout derived from it.
country = 'SA'
(
    path_tw,
    path_fb,
    base,
    path_likes,
    path_followers,
    path_converstations,
    path_retweets,
    path_save,
) = get_path(country)

In [2]:
# Confirmed influencers: expose the account id as a string `author_id`
# column (replacing `id`) and keep handles / ids as plain lists.
participants_tw = (
    pd.read_parquet(f'../../data/01-characterize/influencers/{country}/confirmed_influencers.parquet')
    .assign(author_id=lambda frame: frame['id'].astype(str))
    .drop(columns=['id'])
)
usernames_tw = list(participants_tw['handle'])
ids_tw = list(participants_tw['author_id'].astype(str))

In [2]:
# Preprocessed influencer tweets, read from the same directory that
# `path_save` points to; passed to unique_replies / unique_likes below.
tweets = pd.read_parquet(f"{base}01-preprocess/influencers/tweets.parquet")

In [98]:
# Debug lookup: show the conversation file found at one position of the glob listing.
# NOTE(review): glob does not guarantee a stable ordering, so this index may point
# to a different file on another machine or run — confirm before relying on it.
# NOTE(review): presumably these positions/files needed manual inspection — TODO confirm.
glob.glob(path_converstations + "/*.parquet")[8160] # positions: 709, 8160
# files: 1604709183168839680.parquet, 1605907529652903936.parquet, 1605960786920312832.parquet

'../../data/03-experiment/SA/baseline/00-raw/influencers/conversations/1605960786920312832.parquet'

In [3]:
# Replies: de-duplicated (replier, tweet, influencer) rows built from the
# conversation files; also written to `{path_save}replies.parquet`.
replies = unique_replies(tweets, path_converstations, path_save)

In [3]:
# Followers: rows of the integrated followers file whose followed account is
# one of the confirmed influencers; also written to `{path_save}followers.parquet`.
follow = scrape_followers(participants_tw, country, path_save)

In [4]:
# Likes: one row per (liking user, liked tweet) with the tweet author attached
# as `influencer_id`; also written to `{path_save}likes.parquet`.
likes = unique_likes(tweets, path_likes, path_save)

In [None]:
# Conversations: load every conversation file as-is and save the combined frame.
# NOTE(review): this rebinds `replies` and writes to the same
# `{path_save}replies.parquet` that unique_replies() produces, replacing the
# de-duplicated, renamed table with the raw one on a full run — confirm which
# output is intended before executing this cell.
replies = read_files(path_converstations)
replies.to_parquet(f'{path_save}replies.parquet')