# Development

Agreement and policy: https://developer.twitter.com/en/developer-terms/agreement-and-policy

In [1]:
import os
os.chdir('..')

import requests
import tweepy
import pandas as pd
from modules import TwitterAuthenticator

authenticator = TwitterAuthenticator()
client = tweepy.Client(bearer_token=authenticator.bearer_token)

¿En qué necesito pensar?

- Conectarme a RDS
- Esquema para extraer datos uniformemente durante los últimos 7 días
- Establecer qué políticos tomar en cuenta. Scrape the list from here: https://es.wikipedia.org/wiki/Elecciones_generales_de_Guatemala_de_2023
- ¿Guardar los datos en una base de datos SQL local?


## Get tweets

In [5]:
def get_tweets(query: str, start_time: str, end_time: str, max_results: int) -> requests.Response:
    query = query = f"{query} -is:retweet -is:reply"
    tweets = client.search_recent_tweets(
        query = query,
        start_time = start_time,
        end_time = end_time,
        max_results = max_results,
        tweet_fields = [
            "id", "author_id", "created_at", "text", 
            "public_metrics", "possibly_sensitive", "lang"
        ],
        user_fields = [
            "id", "username", "name", "location", "created_at", "description", 
            "profile_image_url", "verified", "public_metrics"
        ],
        expansions = [
            "author_id", "referenced_tweets.id"
        ]
    )
    return tweets

## Process them into DataFrames

In [3]:
class TwitterDataCleaner:
    def __init__(self, tweets: requests.Response):
        self.tweets = tweets
        self.df = None

    def clean(self) -> pd.DataFrame:
        self.extract_tweet_data()
        self.extract_user_data()

        tweets_df, users_df = self.segregate_data()
        return tweets_df, users_df

    def extract_tweet_data(self):
        tweet_data = []
        for tweet in self.tweets.data:
            tweet_dict = {key: getattr(tweet, key) for key in tweet.data.keys()}
            public_metrics = tweet_dict.pop('public_metrics')
            tweet_dict.update(public_metrics)
            tweet_data.append(tweet_dict)

        self.df = pd.DataFrame(tweet_data)

    def extract_user_data(self):
        users = {user.id: user for user in self.tweets.includes['users']}
        for key, user in users.items():
            user_data = {f"user_{key}": getattr(user, key) for key in user.data.keys()}
            public_metrics_user = user_data.pop('user_public_metrics')
            user_data.update({f"user_{k}": v for k, v in public_metrics_user.items()})
            users[key] = user_data

        self.df['user_data'] = self.df['author_id'].apply(lambda x: users[x])

        user_columns = pd.json_normalize(self.df['user_data']).columns
        for col in user_columns:
            self.df[col] = self.df['user_data'].apply(lambda x: x.get(col, None))

        self.df = self.df.drop(columns = ['user_data'])

    def segregate_data(self):
        tweets_df = self.df[[
            "id", "author_id", "created_at", "text", "possibly_sensitive", "retweet_count",
            "reply_count", "like_count", "quote_count", "impression_count", "lang"
        ]]

        users_df = self.df[[
            "user_id", "user_username", "user_name", "user_location", "user_created_at",
            "user_description", "user_profile_image_url", "user_verified",
            "user_followers_count", "user_following_count", "user_tweet_count", "user_listed_count"
        ]]

        return tweets_df, users_df

## Store them in a RDS

In [9]:
def ingest_tweets(tweets: pd.DataFrame, users: pd.DataFrame, path: str) -> None:
    return None

## Loop through multiple days and candidates

¿Debería hacer append a un dataframe o cargar los datos a una base de datos SQL?

In [11]:
# candidates = [
#     'JoeBiden', 'KamalaHarris', 'BernieSanders'
# ]

# for candidate in candidates:
#     query = candidate

#     for date in dates:
#         start_time = '2021-01-01T00:00:00Z'
#         end_time = '2021-01-02T00:00:00Z'
#         max_results = 10

#         tweets_call = get_tweets(query, start_time, end_time, max_results)
#         cleaner = TwitterDataCleaner(tweets_call)
#         tweets, users = cleaner.clean()
        
#         ingest_tweets(tweets, users, path)

## Example usage

In [6]:
query = 'jimmy morales'
start_time = "2023-04-25T00:00:00Z" 
end_time = "2023-04-28T00:00:00Z" 
max_results = 10

tweets_call = get_tweets(
    query=query,
    start_time=start_time,
    end_time=end_time,
    max_results=max_results,
)

data_cleaner = TwitterDataCleaner(tweets_call)
tweets, users = data_cleaner.clean()

In [7]:
tweets.head()

Unnamed: 0,id,author_id,created_at,text,possibly_sensitive,retweet_count,reply_count,like_count,quote_count,impression_count,lang
0,1651730016894369794,2167302383,2023-04-27 23:28:18+00:00,Candidato a diputado del partido Cabal @keagui...,False,74,3,118,0,2199,es
1,1651725155222138880,1207033185697226757,2023-04-27 23:08:59+00:00,"🥴 Carlos Pineda, ofreció disculpas al exmandat...",False,0,0,0,0,87,es
2,1651724993175142401,307054730,2023-04-27 23:08:20+00:00,Jimmy Morales fue un muerto de hambre y un com...,False,0,0,0,0,79,es
3,1651724182764306433,14083287,2023-04-27 23:05:07+00:00,Hasta ahora Jimmy Morales es el expresidente q...,False,5,1,24,1,2761,es
4,1651718789019361282,556554879,2023-04-27 22:43:41+00:00,Y todavía dice Jimmy Morales que le agradezcam...,False,3,0,6,0,457,es


In [8]:
users.head()

Unnamed: 0,user_id,user_username,user_name,user_location,user_created_at,user_description,user_profile_image_url,user_verified,user_followers_count,user_following_count,user_tweet_count,user_listed_count
0,2167302383,Fiscalnvicto,✺ƑísϲɑƖ Ӏղѵíϲեօ✺ ⚖️,,2013-11-04 02:17:40+00:00,,https://pbs.twimg.com/profile_images/163650153...,False,2484,461,7670,5
1,1207033185697226757,NuestraGente_GT,Nuestra Gente GT,Guatemala,2019-12-17 20:22:13+00:00,Agencia de Noticias y de información para toda...,https://pbs.twimg.com/profile_images/128092703...,False,281,61,1500,0
2,307054730,eloficial2023,Dr. PRESIDENCIABLE😜😜😜,Guatemala,2011-05-28 23:12:33+00:00,NUNCA CALLARÉ PORQUE CREO EN LA LIBERTAD DE EX...,https://pbs.twimg.com/profile_images/163606382...,False,1080,490,50628,10
3,14083287,julioserrano,Julio Serrano Echeverría,Guatemala,2008-03-05 14:41:37+00:00,"Poeta, autor de literatura infantil, documenta...",https://pbs.twimg.com/profile_images/164956958...,False,9207,2298,69134,70
4,556554879,CZardetto,carol zardetto,,2012-04-18 02:13:42+00:00,,https://abs.twimg.com/sticky/default_profile_i...,False,462,218,15220,2
