# Tweepy - Crawler - TCC

In [1]:
import tweepy
import emoji
import pandas as pd
import numpy as np
import time

In [2]:
bearer_token = "put here your bearer token from your Twitter developer account"

### Emojis Functions

In [3]:
def detect_emojis(text):
    emoji_str = ''
    for c in text:
        if emoji.is_emoji(c) == True:
            emoji_str = emoji_str + c
    if len(emoji_str) > 0:
        return ''.join(sorted(set(emoji_str), key=emoji_str.index))  # to remove duplicated from list
    else:
        return 'no emojis'

def translate_emoji(emojis, language='pt'):
    translate_str = ''
    for emj in emojis:
        translate_str = translate_str + emoji.demojize(emj, language=language) + ', '
    return translate_str[:-2]

def remove_emojis(text):
    emojis_list = []

    for c in text:
        if emoji.is_emoji(c) == True:
            emojis_list.append(c)
            
    if len(emojis_list) > 0:
        for emj in emojis_list:
            text = text.replace(emj,'')    
    return text

### Getting Tweet counts (volume) for a search query

In [447]:
client = tweepy.Client(bearer_token)

# Replace with your own search query
QUERY_LULA = 'Lula -is:retweet -has:links -has:mentions lang:pt context:35.862070591737675776'
QUERY_BOZO = 'Bolsonaro -is:retweet -has:links -has:mentions lang:pt context:35.912697101083041792'
QUERY_CIRO = 'Ciro Gomes -is:retweet -has:links -has:mentions lang:pt context:35.912370288968458240'
QUERY_SIMONE = 'Simone Tebet -is:retweet -has:links -has:mentions lang:pt context:35.1091083297654886400'

QUERY = QUERY_LULA

START_TIME = '2022-10-29T00:00:00.000Z'
END_TIME = '2022-10-29T23:59:00.000Z'

counts = client.get_recent_tweets_count(query=QUERY, 
                                        granularity='day',
                                        start_time=START_TIME,
                                        end_time=END_TIME)

total = 0

for count in counts.data:
    print(count)
    total = total + count['tweet_count']
print('\nTotal:', total)

{'end': '2022-10-29T23:59:00.000Z', 'start': '2022-10-29T00:00:00.000Z', 'tweet_count': 273068}

Total: 273068


### Getting more than 100 tweets at a time using paginator
* Building queries for search tweets: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query#list

In [448]:
client = tweepy.Client(bearer_token)

id_list = []
created_at_list = []
text_list = []
context_list = []
place_list = []

# Replace with your own search query
# QUERY = 'Lula -is:retweet' #place_country:BR'
# Excluding retweets, tweets with links and with mentions

TWEET_FIELDS = ['context_annotations', 'created_at']
N_TWEETS = 1200

# Replace the limit with the maximum number of Tweets you want
for tweet in tweepy.Paginator(client.search_recent_tweets, 
                              query=QUERY,
                              start_time=START_TIME,
                              end_time=END_TIME,
                              tweet_fields=TWEET_FIELDS,
                              max_results=100).flatten(limit=N_TWEETS):
    id_list.append(tweet.id)
    created_at_list.append(tweet.created_at)
    text_list.append(tweet.text.replace('\n',''))
    
    if len(tweet.context_annotations) > 0:
        context_list.append(tweet.context_annotations)
    else:
        context_list.append('unavailable context annotations')
    
d = {'id':id_list, 'date':created_at_list, 'tweet':text_list, 'context_annotations':context_list} #, 'place':place_list}
df_tweets = pd.DataFrame(data=d)
df_tweets.shape

(1200, 4)

In [450]:
df_tweets['tweet_clean'] = df_tweets['tweet'].apply(lambda twt: remove_emojis(twt)) 
df_tweets['emojis'] = df_tweets['tweet'].apply(lambda twt: detect_emojis(twt))
df_tweets['emojis_translated'] = df_tweets['emojis'].apply(lambda emj: translate_emoji(emj) if emj != 'no emojis' else emj)
df_tweets.shape

(1200, 7)

In [451]:
df_tweets.sample(5)

Unnamed: 0,id,date,tweet,context_annotations,tweet_clean,emojis,emojis_translated
954,1586504716762370048,2022-10-29 23:46:14+00:00,"🔴⚫️ Esqueça tudo, menos de sair de casa pra vo...","[{'domain': {'id': '3', 'name': 'TV Shows', 'd...","️ Esqueça tudo, menos de sair de casa pra vota...",🔴⚫✅⭐,":círculo_vermelho:, :círculo_preto:, :marca_de..."
571,1586505990215639040,2022-10-29 23:51:18+00:00,agora só falta o lula amanhã pra completar,"[{'domain': {'id': '10', 'name': 'Person', 'de...",agora só falta o lula amanhã pra completar,no emojis,no emojis
770,1586505307089272832,2022-10-29 23:48:35+00:00,Lula terminou o primeiro turno com 48% deve te...,"[{'domain': {'id': '29', 'name': 'Events [Enti...",Lula terminou o primeiro turno com 48% deve te...,no emojis,no emojis
422,1586506522707873792,2022-10-29 23:53:25+00:00,Imagens lindas da paulista hoje 🤗Amanhã é LULA ❤️,"[{'domain': {'id': '10', 'name': 'Person', 'de...",Imagens lindas da paulista hoje Amanhã é LULA ️,🤗❤,":rosto_abraçando:, :coração_vermelho:"
81,1586507601504014337,2022-10-29 23:57:42+00:00,"Não pregamos ódio, nem fazemos apologia a arma...","[{'domain': {'id': '10', 'name': 'Person', 'de...","Não pregamos ódio, nem fazemos apologia a arma...",🔥🚩🙌🌈🏆❤,":fogo:, :bandeira_triangular:, :mãos_para_cima..."


In [452]:
df_tweets['date'].min(), df_tweets['date'].max()

(Timestamp('2022-10-29 23:43:17+0000', tz='UTC'),
 Timestamp('2022-10-29 23:58:58+0000', tz='UTC'))

In [453]:
# Save tweets...
file_name = 'lula-221029-221029-' + str(df_tweets.shape[0]) + '.csv'
df_tweets.to_csv(file_name)

In [13]:
# Number of registers with emojis
count_emojis = 0
register_emojis = []

for index,text in enumerate(df_tweets['emojis']):
    if text != 'no emojis':
        count_emojis = count_emojis + 1
        register_emojis.append(index)

print('Número total de registros:', df_tweets.shape[0])
print('Número de registros com emojis:', count_emojis)

Número total de registros: 43922
Número de registros com emojis: 6109
