In [2]:
"""Retrieve Content Shared By Influencers During Baseline

KE & SA
-- start date: '2022-08-21T00:00:00Z'
-- end date: '2023-02-21T00:00:00Z'
"""

import pandas as pd
import os
import glob
import yaml
import time
import numpy as np

#from tweetple import TweetPle
import sys
sys.path.insert(0, '../../src/utils')
from funcs import *
from tqdm import tqdm
sys.path.insert(0, '../../src/utils')
from general import *


def scrape_tweets(accounts, bearer_token, path, start_date, end_date):
    """Scrape Tweets"""
    TweetPle.TweetStreamer(
        accounts,
        bearer_token,
        path,
        start_date,
        end_date
    ).main()

    return print('Content scraped')


def scrape_posts(path, accounts, start_date, end_date, token):
    """Scrape Posts"""
    for account in tqdm(accounts):
        time.sleep(10)
        try:
            posts = GetPosts(
                account,
                start_date,
                end_date,
                token).main()
            posts.to_parquet(f'{path}/{account}.parquet')
        except ValueError:
            print(f"Oops!  Not content for {account}.")

    return print('Content scraped')


def get_path(country):
    base = f'../../data/03-experiment/{country}/baseline/'
    path_tw = base + '00-raw/influencers/tweets/'
    path_fb = base + '00-raw/influencers/posts/'
    path_likes = f'{base}00-raw/influencers/likes/'
    path_followers = f'{base}00-raw/influencers/followers/'
    path_converstations = f'{base}00-raw/influencers/conversations/'
    path_retweets = f'{base}00-raw/influencers/retweets/'
    return path_tw, path_fb, base, path_likes, path_followers, path_converstations, path_retweets


def unique_tweets(path_read):
    """Get unique tweets"""
    df = read_files(path_read)
    df = df.drop_duplicates(subset='id').reset_index(drop=True)
    df = expand_column(df, 'entities.urls')

    return df


def unique_posts(path_read, path_save):
    """Get unique posts"""
    df = read_files(path_read)
    df = df.drop_duplicates(subset='postUrl')
    df = df.reset_index(drop=True)
    df.to_parquet(f'{path_save}posts.parquet')
    return df


def tweets_w_likes(path_read):
    """Get Tweets with at least one like"""
    df = read_files(path_read)
    df = df.drop_duplicates(subset='id').reset_index(drop=True)
    df['author_id'] = df['author_id'].astype(str) ### change
    #df = df[df['author_id']=='937779284'] ### change
    df = df[df['public_metrics.like_count'] > 0] 
    df = df.reset_index(drop=True)
    ids = df.sort_values(
        ['public_metrics.like_count'], ascending=False
    )['id']
    ids = list(ids)
    return ids


def tweets_w_retweets(path_read):
    """Get Tweets with at least one like"""
    df = read_files(path_read)
    df = df.drop_duplicates(subset='id').reset_index(drop=True)
    df = df[df['public_metrics.retweet_count'] > 0]
    df = df.reset_index(drop=True)
    ids = df.sort_values(
        ['public_metrics.retweet_count'], ascending=False
    )['id']
    ids = list(ids)
    return ids


def unique_likes(tweets, path_read, path_save):
    """Get unique users `liking users` of Tweets
    shared by influencers
    """
    col_a = 'liking_user_username'
    col_b = 'tweet_liked'
    col_c = 'influencer_id'
    col_d = 'liking_user_id'
    df = read_files(path_read)
    df = df.drop_duplicates(['id', col_b])
    df = df.rename(
        {'id': col_d, 'username': col_a},
        axis=1
    )
    df = df.reset_index(drop=True)
    tweets = tweets[['author_id', 'id']]
    tweets = tweets.rename(
        {'author_id': col_c, 'id': col_b},
        axis=1
    )
    df = df.merge(tweets, on=col_b, how='left')
    df = df[
        [col_d, col_a, col_b, col_c]
    ]
    df.to_parquet(f'{path_save}likes.parquet')

    return df


def unique_replies(tweets, path_read, path_save):
    """Get unique users replying to Tweets
    shared by influencers
    """
    col_a = 'replier_user_id'
    col_b = 'tweet_replied'
    col_c = 'influencer_id'
    df = read_files(path_read)
    df = df.drop_duplicates(['author_id', 'id', 'in_reply_to_user_id'])

    df = df.rename(
        {'author_id': col_a, 'id': col_b, 'in_reply_to_user_id': col_c},
        axis=1
    )
    df = df.reset_index(drop=True)
    df = df[[col_a, col_b, col_c]]
    df.to_parquet(f'{path_save}replies.parquet')

    return df


def scrape_followers(participants_tw, country, path_save):
    b_path = f'../../data/01-characterize/followers/{country}/00-raw/'
    file = 'integrate/followers.gzip'
    a_followers = pd.read_parquet(f'{b_path}{file}')
    a_followers = a_followers.rename(
        {'id': 'follower_id', 'author_id_following': 'influencer_id'},
        axis=1
    )
    participants_tw = participants_tw.rename(
        {'author_id': 'influencer_id'},
        axis=1
    )
    followers = a_followers[
        a_followers.influencer_id.isin(participants_tw.influencer_id.unique())
    ]
    followers = followers.reset_index(drop=True)
    followers.to_parquet(f'{path_save}followers.parquet')
    
    return followers


def tweet_types(df, path_save):
    df0 = df[~df.referenced_tweets.isna()]
    m = pd.DataFrame(df0['referenced_tweets'].tolist())
    m = m[0].apply(pd.Series)
    m.replace({'retweeted': 'retweet', 'replied_to': 'reply',
               'quoted': 'quote'}, inplace=True)
    df0 = pd.concat([df0, m['type']], axis=1)
    df1 = df[df.referenced_tweets.isna()]
    df1['type'] = 'tweet'
    df = df0.append(df1).reset_index(drop=True)
    df = df[~df.id.isna()].reset_index(drop=True)
    create_folder(path_save)
    df.to_parquet(f'{path_save}tweets.parquet')
    return df


def assign_followers_treatment(participants, followers):

    participants = participants[['author_id', 'treatment']]
    participants.rename({'author_id': 'influencer_id'}, axis=1, inplace=True)
    participants['influencer_id'] = participants['influencer_id'].astype(str)
    followers = followers.merge(participants, on='influencer_id', how='left')
    m = pd.DataFrame(followers.groupby('follower_id')[
        'treatment'].apply(list)).reset_index()
    m['set_col'] = np.where(m.treatment == '', '', m.treatment.map(set))
    m['count'] = m.set_col.str.len()
    z = m[m['count'] == 1]
    valid_followers = list(z['follower_id'].unique())
    df0 = followers[followers['follower_id'].isin(valid_followers)]
    df1 = followers[~followers['follower_id'].isin(valid_followers)].drop([
        'treatment'], axis=1)
    df = df0.append(df1).reset_index(drop=True)
    df.rename({'treatment': 'assignment'}, axis=1, inplace=True)
    df['assignment'].replace(
        {1: 'treatment', 0: 'control', np.nan: 'both'}, inplace=True)

    return df


def get_followers_quality(sample, followers, likes, replies, path_save):
    sample['author_id'] = sample['author_id'].astype(str)
    participants = list(sample['author_id'])
    followers['influencer_id'] = followers['influencer_id'].astype(str)

    quality_likes = pd.DataFrame()
    for participant in participants:
        followersp = followers[followers['influencer_id'] == participant]
        likesp = likes[likes['influencer_id'] == participant]
        list_to_look = list(followersp['follower_id'].unique())
        quality_likes = quality_likes.append(
            likesp[likesp['liking_user_id'].isin(list_to_look)]
        )
    quality_likes = quality_likes.rename(
        {'liking_user_username': 'follower_username'},
        axis=1
    )
    quality_likes = quality_likes[[
        'follower_username', 'liking_user_id', 'influencer_id']]
    quality_likes = quality_likes.drop_duplicates()
    quality_likes = quality_likes.rename(
        {'liking_user_id': 'follower_id'}, axis=1)
    quality_likes = quality_likes.reset_index(drop=True)
    quality_likes['liked'] = 1

    quality_replies = pd.DataFrame()
    for participant in participants:
        followersp = followers[followers['influencer_id'] == participant]
        repliesp = replies[replies['influencer_id'] == participant]
        list_to_look = list(followersp['follower_id'].unique())
        quality_replies = quality_replies.append(
            repliesp[repliesp['replier_user_id'].isin(list_to_look)]
        )
    quality_replies = quality_replies[[
        'replier_user_id', 'influencer_id']]
    quality_replies = quality_replies.drop_duplicates()
    quality_replies = quality_replies.rename(
        {'replier_user_id': 'follower_id'}, axis=1)
    quality_replies = quality_replies.reset_index(drop=True)
    quality_replies['replied'] = 1

    followers.rename({'username': 'follower_username'}, axis=1, inplace=True)
    cols_keep = ['follower_username', 'follower_id', 'influencer_id']
    followers = followers[cols_keep]
    followers['influencer_id'] = followers['influencer_id'].astype(str)
    followers = followers.merge(quality_likes, on=cols_keep, how='left')
    followers = followers.merge(
        quality_replies,
        on=['follower_id', 'influencer_id'],
        how='left'
    )
    followers[['replied', 'liked']] = followers[['replied', 'liked']].fillna(0)
    followers['strong'] = np.where(
        (followers.liked == 1) & (followers.replied == 1), 1, 0)
    followers['weak'] = np.where(
        (followers.liked == 1) | (followers.replied == 1), 1, 0)
    followers['weak'] = np.where(followers['strong']== 1, 0, followers['weak'])
    #followers = assign_followers_treatment(sample, followers)
    followers.to_parquet(f'{path_save}followers_ties_batch2.parquet')

    return followers

country = 'KE'
path_tw, path_fb, base, path_likes, path_followers, path_converstations, path_retweets = get_path(country)
path_save = f'{base}01-preprocess/influencers/'

In [3]:
# Read the data:
# Tweets:
tweets = pd.read_parquet(f"{base}01-preprocess/influencers/tweets_batch2.parquet")

# Followers 
followers = pd.read_parquet(f"{base}01-preprocess/influencers/followers_batch2.parquet")

# Likes
likes = pd.read_parquet(f"{base}01-preprocess/influencers/likes_batch2.parquet")

# Replies
replies = pd.read_parquet(f"{base}01-preprocess/influencers/replies_batch2.parquet")

# Participants TW
participants_tw = pd.read_parquet(f'../../data/01-characterize/influencers/{country}/confirmed_influencers_batch2.parquet')
participants_tw['author_id'] = participants_tw['id'].astype(str)
participants_tw = participants_tw.drop(['id'], axis = 1)
usernames_tw = list(participants_tw['handle'])
ids_tw = list(participants_tw['author_id'].astype(str))

In [4]:
get_followers_quality(
        participants_tw,
        followers,
        likes,
        replies,
        path_save
    )

  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(
  quality_likes = quality_likes.append(


Unnamed: 0,follower_username,follower_id,influencer_id,liked,replied,strong,weak
0,martin42639543,1616129113802121217,1382201755,0.0,0.0,0,0
1,mwikya_charles,867711370225176576,1382201755,0.0,0.0,0,0
2,Langat385Langat,1623147872404271106,1382201755,0.0,0.0,0,0
3,WachilongaAllan,1582291034578583553,1382201755,0.0,0.0,0,0
4,MarambaJohn,1420966910,1382201755,0.0,0.0,0,0
...,...,...,...,...,...,...,...
728547,cutthroatbilll,1203186163021942784,1298994593988403206,1.0,0.0,0,1
728548,feeranga_free,1237408812245626880,1298994593988403206,1.0,0.0,0,1
728549,SoloLevi,1213946294009393152,1298994593988403206,1.0,0.0,0,1
728550,EvansMu39861599,1276578573650968576,1298994593988403206,1.0,0.0,0,1
