Stage 2: Data preprocessing stage that produces structured data in CSV format, which is also stored in Cloud Object Storage. As columns in the CSV file we suggest date, geographic location, URL, and sentiment analysis.

In [None]:
import lithops

In [None]:
from transformers import pipeline
from custom_snscrape.twitter import Tweet

def map_preprocess(obj) -> dict:
    """Turn one stored tweet into a flat row annotated with its sentiment.

    The object's data stream is read in full, decoded as UTF-8 and parsed
    with ``Tweet.from_json``. The tweet text is then classified with a
    multilingual sentiment-analysis pipeline.

    Args:
        obj: object exposing a readable ``data_stream`` (presumably the
            storage object handed over by lithops -- confirm) that holds a
            single JSON-serialized tweet.

    Returns:
        A dict with a selection of the tweet's attributes plus a
        ``'sentiment'`` entry holding the label of the first prediction
        returned by the classifier.
    """
    raw_json = obj.data_stream.read().decode('utf-8')
    tweet = Tweet.from_json(raw_json)

    # Idea: if tweet.lang is not a supported language, translate it first:
    # https://huggingface.co/transformers/main_classes/pipelines.html#transformers.TranslationPipeline
    #
    # Idea: named-entity recognition via pipeline("ner"):
    # https://huggingface.co/transformers/task_summary.html#named-entity-recognition

    # This model gave the best results; the other available ones that were
    # tried performed noticeably worse.
    # (return_all_scores=True might help to draw better charts.)
    sentiment_pipeline = pipeline(
        'sentiment-analysis',
        model="nlptown/bert-base-multilingual-uncased-sentiment",
    )
    predictions = sentiment_pipeline(tweet.content)

    # Idea: if the tweet has an attached image, classify it as well
    # (or try text-generation?):
    # https://huggingface.co/transformers/main_classes/pipelines.html#transformers.ImageClassificationPipeline
    # Preview URL would be:
    # tweet.media.previewUrl if tweet.media and tweet.media.type == 'photo' else None

    # TODO: Think about returning a list of results (storing the scraped list
    # of tweets inside a single bucket entry) so that a single function runs
    # inference on multiple tweets.

    # TODO: Perhaps cast everything to string so the library does not have to
    # be imported inside reduce_to_csv.

    # Order matters here: it determines the column order of the final CSV.
    exported_fields = (
        'id',
        'user',
        'content',
        'lang',
        'coordinates',
        'retweetCount',
        'likeCount',
        'quoteCount',
        'replyCount',
        'mentionedUsers',
        'outlinks',
    )
    row = {field: getattr(tweet, field) for field in exported_fields}
    row['sentiment'] = predictions[0]['label']
    return row


In [None]:
import csv
from io import StringIO


def reduce_to_csv(results, storage):
    """Merge the rows produced by the map phase into a single CSV object.

    The keys of the first row are used as the CSV header, every row is
    written below it, and the resulting text is uploaded under the key
    ``test.csv`` to ``storage.bucket``.

    Args:
        results: list of dicts, one per processed tweet. All rows are
            expected to share the keys of the first one.
        storage: storage client exposing ``bucket`` and ``put_object``
            (presumably the lithops Storage instance -- confirm).

    Returns:
        A status string naming the destination object, or a notice that
        nothing was stored when ``results`` is empty.
    """
    # NOTE: the map() results contain dataclasses that come from a pickled
    # module, so something from the library defining them has to be imported
    # here or lithops will not find it. Do not remove this import even though
    # the name looks unused.
    from custom_snscrape.twitter import Tweet

    if not results:
        # Without rows there is no header to derive; skip the upload instead
        # of failing with an opaque IndexError on results[0].
        return "no results to reduce, nothing stored in cos://{}/test.csv".format(storage.bucket)

    column_names = list(results[0].keys())

    # Single buffer managed by the context manager, so it is always closed.
    with StringIO() as csv_buffer:
        dict_writer = csv.DictWriter(csv_buffer, column_names)
        dict_writer.writeheader()
        dict_writer.writerows(results)

        storage.put_object(bucket=storage.bucket,
                           key='test.csv',
                           body=csv_buffer.getvalue())

    return "reduced result to cos://{}/test.csv output and stored it".format(storage.bucket)

In [None]:
hashtags = []

# Retrieve hashtags from Stage 1 notebook
%store -r hashtags

if len(hashtags) == 0:
    print('You have not defined any hashtags yet (use Stage 1 notebook or the \'hashtags\' variable directly)')

# Prepend bucket name and append / to the hashtags
bucket_name = lithops.Storage().bucket
data_location = [bucket_name+'/'+htag+'/' for htag in hashtags]

In [None]:
# Run the preprocessing over every data location and print the outcome of
# the reduce step. Pass log_level='debug' to the executor for verbose logs.
with lithops.FunctionExecutor(
    runtime='gilarasa/lithops-cloudbutton-challenge-py3.9:0.5',
    runtime_memory=2048,
) as fexec:
    # TODO: aggregate the tweets in chunks so that a single function can
    # preprocess multiple tweets.

    # obj_chunk_number divides the input string in chunks, so it should not
    # be used unless more than one tweet (a list of tweets) is stored in
    # each object.
    fexec.map_reduce(map_preprocess, data_location, reduce_to_csv)

    stage_result = fexec.get_result()
    print(stage_result)