In [1]:
import os
import json
import pickle 
import time

import pandas as pd
import numpy as np

from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, pipeline    
)

pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [2]:
CATEGORIES = [
    'social distancing', 'wearing mask', 'breaking rule', 'discrimination', 'vaccination', 'economic impact', 'covid-19 outbreak', 
    'panic buying', 'government and health authorities', 'being compassionate and helpful', 'school reopening', 
    'teaching children at home', 'alcohol or drug abuse', 'domestic violence'
]

In [3]:
def token_replace(text, page):
    new_text = []
    text = text.replace('\n', ' ').replace("\'s", "'s").replace(page, '').strip()
    refs = True

    for t in text.split(" "):
        if (t.startswith('@') or t.startswith('http')) and refs:
            pass
        else:
            refs = False
            t = '[NAME]' if t.startswith('@') and len(t) > 1 else '' if t.startswith('http') else t
            new_text.append(t)
        
    return " ".join(new_text).strip()

def NER_replace(text):
    end_name = -2
    replace_names = []
    ner_result = xlm_roberta_nlp(text)
    for name in ner_result:
        if name['entity'] == 'I-PER':
            if name['start'] <= end_name+1:
                replace_names[-1] = ''.join([replace_names[-1], name['word'].replace('▁', ' ')])
                end_name = name['end']
            else:
                replace_names = replace_names + [name['word'].replace('▁', ' ')]
                end_name = name['end']
    replace_names = [name.strip() for name in replace_names]

    for name in replace_names:
        text = text.replace(name, '[NAME]')

def render_text(comment, page, NER=False):
    
    text = comment.content
    text = token_replace(text, page)
    if NER:
        text = NER_replace(text, page)        
    return text

def get_elegible(comment):
    
    text = comment.renderedContent    
    comment_criteria = len(text.split()) > 3
    words_criteria = (len(text) / (len(text.split())+1)) > 2
    lang_criteria = comment.lang == 'en'
    
    return comment_criteria and words_criteria and lang_criteria

def get_zero_shot_classification(tweet):
        
    if tweet.elegible == False:
        return ''
    
    text = tweet.renderedContent
    result = facebook_zsc(text, CATEGORIES, multi_label=True)
    tmp = pd.DataFrame(columns=result['labels'])
    tmp.loc[0] = result['scores']
    return [tweet.tweetId] + list(tmp[CATEGORIES].values[0])

In [4]:
# Get Facebook Zero Shot Classifier
facebook_zsc = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0)

In [None]:
news_tweets = pd.read_parquet('./../data/raw/news_tweets.parquet')
news_accounts = pd.read_parquet('./../data/raw/news_accounts.parquet')
news_tweets_zsc = pd.DataFrame([])
print('Start processing.')
for user in news_tweets.userId.unique():
    start = time.time()
    page = news_accounts[news_accounts.userId==user].username.values[0]
    tmp = news_tweets[news_tweets.userId==user]
    tmp['renderedContent'] = tmp.apply(lambda x: render_text(x, '@'+page), axis=1)
    tmp['elegible'] = tmp.apply(lambda x: get_elegible(x), axis=1)
    zsc = tmp.apply(lambda x: get_zero_shot_classification(x), axis=1)
    zsc = pd.DataFrame(list(zsc), columns=['tweetId'] + CATEGORIES)
    zsc = zsc[~(zsc.tweetId.isnull())]
    zsc.to_parquet(page + '_zsc.parquet')
    news_tweets_zsc = news_tweets_zsc.append(zsc)
    print('Page:', page, '| Time:', time.time()-start)

Start processing.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['renderedContent'] = tmp.apply(lambda x: render_text(x, '@'+page), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['elegible'] = tmp.apply(lambda x: get_elegible(x), axis=1)


Page: ABC | Time: 5592.094683647156


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['renderedContent'] = tmp.apply(lambda x: render_text(x, '@'+page), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['elegible'] = tmp.apply(lambda x: get_elegible(x), axis=1)
