Stage 1: Data crawler. Massively parallel functions crawl data and store it in Cloud Object Storage. They obtain information from web pages or tweets and create a dataset of text data. The FaaS backend in Lithops is used to launch the crawling process over serverless functions.

In [None]:
import ipywidgets as widgets

# Text field for the storage folder under which the crawled data is saved.
analysis_folder_wg = widgets.Text(value='analysis1', placeholder='')

# Six boxes for the strings/hashtags to crawl and three boxes for extra
# 'parameter:value' search parameters.
items = [
    widgets.Text(placeholder='String/hashtag #{}'.format(box_idx))
    for box_idx in range(6)
]
parameterItems = [
    widgets.Text(placeholder='parameter:value #{}'.format(box_idx))
    for box_idx in range(3)
]

# One accordion section per input group; titles are assigned by position.
acc = widgets.Accordion(children=[
    analysis_folder_wg,
    widgets.HBox(items),
    widgets.HBox(parameterItems),
])
section_titles = ('Folder name in storage to save data', 'Tags to crawl', 'Parameters')
for section_idx, section_title in enumerate(section_titles):
    acc.set_title(section_idx, section_title)

# Keep the accordion as the last expression of the cell so it is displayed.
acc


In [None]:
# You can add more hashtags if the don't fit the boxes manually here:
hashtags = []
parameters = []
for i in items:
    if i.value != '':
        hashtags.append(i.value)
for i in parameterItems:
    if i.value != '':
        parameters.append(i.value)

analysis_folder_name = analysis_folder_wg.value

print('Hashtags:', hashtags)
print('Parameters:',  parameters)
print('Folder:',  analysis_folder_name)

# Save them on jupyter kernel so we can retrieve the hashtags on Stage 2 notebook
%store analysis_folder_name

In [None]:
from custom_snscrape import TwitterSearchScraper

def scrapSearch(str_to_search, storage):
    """Crawl the tweets matching a search query and store them as one object.

    At most 8000 tweets are read from ``TwitterSearchScraper`` and written as
    newline-delimited JSON (one ``tweet.to_json()`` per line) into
    ``storage.bucket``.

    The object key is ``<analysis_folder_name>/<first query term>``. When the
    query carries ``since:`` / ``until:`` operators, their values are appended
    to the key (``..._<since>_<until>``) so that several date-range queries
    for the same hashtag do not overwrite each other's object.

    Parameters
    ----------
    str_to_search : str
        Search query. Its first whitespace-separated term names the object.
    storage :
        Object exposing ``bucket`` and ``put_object(bucket=, key=, body=)``;
        presumably the storage client Lithops passes to mapped functions
        (TODO confirm).

    Returns
    -------
    str
        The literal ``"OK"`` once the object has been written.
    """
    max_tweets = 8000  # upper bound on the tweets collected per query

    terms = str_to_search.split()

    # The key is only the sought hashtag, without the excluded ones, plus the
    # date window (if any) to keep keys unique across date-range queries.
    date_suffix = ''.join(
        '_' + term.split(':', 1)[1]
        for term in terms
        if term.startswith(('since:', 'until:'))
    )
    object_key = analysis_folder_name + '/' + terms[0] + date_suffix

    # Collect the lines in a list and join once, instead of growing a string
    # with repeated concatenation.
    lines = []
    scraped = TwitterSearchScraper(str_to_search).get_items()
    for count, tweet in enumerate(scraped, start=1):
        lines.append(tweet.to_json() + '\n')
        if count == max_tweets:
            break

    storage.put_object(bucket=storage.bucket,
                       key=object_key,
                       body=''.join(lines))

    return "OK"

In [None]:
# Each query will include the current hashtag,
# and exclude the hashtags to the right of itself in the array.
# This way, we can parallelise the searches, while also ensuring that
# if a tweet contains multiple hashtags, exactly one of the queries matches it.
#
# In this case, the queries will be:
# #covid -#sars -#coronavirus, #sars -#coronavirus, #coronavirus
#hashtags = ['#covid', '#sars', '#coronavirus']

# The extra search parameters are appended, space-separated, to every query.
param_suffix = ''.join(' ' + p for p in parameters)

# Slice rather than pop: `hashtags` is left untouched, so this cell can be
# re-run and still produce the same queries.
queries = [
    ' -'.join(hashtags[start:]) + param_suffix
    for start in range(len(hashtags))
]
print(queries)

In [None]:
import datetime

# Find tweets since this date
range_start = datetime.date(2021, 2, 1)

# Interval of time that each function will cover
interval = datetime.timedelta(days=15)

# How many intervals in total
# The search will cover from range_start to range_start + n_intervals*interval
n_intervals = 4

# Consecutive interval boundaries: n_intervals + 1 dates, one `interval` apart.
boundaries = [range_start + step * interval for step in range(n_intervals + 1)]

# One 'since:<start> until:<end>' filter per pair of neighbouring boundaries.
ranges = [
    'since:{} until:{}'.format(start, end)
    for start, end in zip(boundaries, boundaries[1:])
]

# Every query is combined with every date range.
queries_w_date_ranges = [
    '{} {}'.format(query, date_range)
    for query in queries
    for date_range in ranges
]

print(queries_w_date_ranges)

In [None]:
import lithops

# Map scrapSearch over the dated queries (one call per query) and print
# what the calls returned.
with lithops.FunctionExecutor(log_level='debug') as fexec:
    all_invocations = fexec.map(scrapSearch, queries_w_date_ranges)
    crawl_results = fexec.get_result()
    print(crawl_results)