Stage 1: Data crawler. Massively parallel functions crawl data and store it in Cloud Object Storage. They obtain information from web pages or tweets and create a dataset of text data. The FaaS backend in Lithops is used to launch the crawling process over serverless functions.

In [None]:
import lithops

In [None]:
from custom_snscrape.twitter import TwitterSearchScraper

def scrapSearch(str_to_search, storage):
    """Scrape tweets matching a search query and save them to object storage.

    Each tweet is written as JSON under the key ``<first query token>/<tweet id>``
    in ``storage.bucket``.

    Args:
        str_to_search: Search query string. Its first whitespace-separated
            token (the included hashtag) is used as the key prefix.
        storage: Object exposing a ``bucket`` attribute and a
            ``put_object(bucket=..., key=..., body=...)`` method.
            NOTE(review): presumably the storage client that Lithops injects
            for functions declaring a ``storage`` parameter -- confirm.

    Returns:
        The string ``"OK"``.
    """
    scraper = TwitterSearchScraper(str_to_search)

    for tweet in scraper.get_items():
        # Only the included hashtag (the first token of the query) names the
        # folder; the excluded "-#tag" terms are left out of the key.
        included_hashtag = str_to_search.split()[0]
        object_key = '/'.join((included_hashtag, str(tweet.id)))

        storage.put_object(
            bucket=storage.bucket,
            key=object_key,
            body=tweet.to_json(),
        )

        # testing: only 1 result -- stop after the first tweet has been stored
        break

    return "OK"

In [None]:
import ipywidgets as widgets
# Build five empty text boxes, one per hashtag the user may want to search.
items = []
for box_index in range(5):
    items.append(widgets.Text(placeholder='Hashtag #{}'.format(box_index)))

# Left as the last expression of the cell so the notebook displays the boxes.
widgets.HBox(items)

In [None]:
# You can add more hashtags if the don't fit the boxes manually here:
hashtags = []
for i in items:
    if i.value != '':
        hashtags.append(i.value)


# Save them on jupyter kernel so we can retrieve the hashtags on Stage 2 notebook
%store hashtags

print(hashtags)

In [None]:
# Each query includes the current hashtag and excludes every hashtag to its
# right in the list.
# This way the searches can run in parallel, and a tweet containing several
# of the hashtags is matched by exactly one query (the one for the rightmost
# hashtag it contains), so it is not crawled more than once.
#
# Example: for hashtags = ['#covid', '#sars', '#coronavirus'] the queries are:
#   '#covid -#sars -#coronavirus', '#sars -#coronavirus', '#coronavirus'
#hashtags = ['#covid', '#sars', '#coronavirus']
#
# The queries are built from slices rather than by popping items, so
# `hashtags` is left intact: the cell can be re-run and the list is still
# available to later cells.
queries = [' -'.join(hashtags[start:]) for start in range(len(hashtags))]

In [None]:
# Launch scrapSearch once per query through the Lithops executor and print
# what the invocations returned.
with lithops.FunctionExecutor() as fexec:
    # Keep the value returned by map() in a notebook-level name so it can be
    # inspected after the cell has run.
    all_invocations = fexec.map(scrapSearch, queries)

    crawl_results = fexec.get_result()
    print(crawl_results)

In [None]:
Tweets: 
1) Pedro Sánchez
https://twitter.com/sanchezcastejon/status/1388184494235127814
https://twitter.com/sanchezcastejon/status/1376241393396543492
https://twitter.com/sanchezcastejon/status/1371013139987365888

2) Boris Johnson
https://twitter.com/BorisJohnson/status/1391413912889921542
https://twitter.com/BorisJohnson/status/1363552116133023744
https://twitter.com/BorisJohnson/status/1354147835369959435

3) Joe Biden
https://twitter.com/JoeBiden/status/1386426190861197313
https://twitter.com/POTUS/status/1373037818277167112
https://twitter.com/POTUS/status/1384136220284915712
https://twitter.com/POTUS/status/1368993395172139014

4) Elon Musk
https://twitter.com/elonmusk/status/1256240766600409088


