# Development

Twitter Developer Agreement and Policy: https://developer.twitter.com/en/developer-terms/agreement-and-policy

In [1]:
import os
os.chdir('..')

import pandas as pd
from typing import Tuple
from datetime import datetime, timedelta

from modules import (
    TwitterRequest,
    GPTFeatureExtractor
)

In [11]:
# Collection range handed to TwitterGPTDataProcessor (start_date up to end_date).
start_date = datetime(year=2023, month=4, day=27, hour=0, minute=0)
end_date = datetime(year=2023, month=4, day=29, hour=0, minute=0)

# One search query per candidate.
candidates = ['carlos pineda', 'sandra torres']

# Forwarded as ``max_results`` to every TwitterRequest.
max_results = 10

# Prefixes for the tweet and user column names (e.g. "<prefix>candidate").
tweets_prefix = 'tw_'
users_prefix = 'us_'

In [12]:
class TwitterGPTDataProcessor:
    """Collect tweets and users per candidate and per day, then enrich the tweets.

    The configured ``start_date``..``end_date`` range is split into windows of
    at most one day. For every candidate and every window, data is fetched
    through ``TwitterRequest`` and the tweets are run through
    ``GPTFeatureExtractor``; the per-window frames are finally concatenated.

    Attributes created by the methods:
        dates: List of ``(start, end)`` ISO-8601 string pairs, set by
            ``generate_dates``.
        tweets: Combined tweets frame, set by ``collect_data``.
        users: Combined users frame, set by ``collect_data``.
    """

    def __init__(
            self,
            candidates: list[str],
            start_date: datetime,
            end_date: datetime,
            max_results: int,
            tweets_prefix: str,
            users_prefix: str
        ) -> None:
        """Store the collection settings; no request is made here.

        Args:
            candidates: Query strings, one per candidate.
            start_date: Beginning of the collection range.
            end_date: End of the collection range.
            max_results: Forwarded as ``max_results`` to each ``TwitterRequest``.
            tweets_prefix: Prefix for tweet column names.
            users_prefix: Prefix for user column names.
        """
        self.candidates = candidates
        self.start_date = start_date
        self.end_date = end_date
        self.max_results = max_results
        self.tweets_prefix = tweets_prefix
        self.users_prefix = users_prefix

    def generate_dates(self) -> "TwitterGPTDataProcessor":
        """Split the configured range into consecutive windows of at most one day.

        The windows are stored in ``self.dates`` as ``(start, end)`` pairs of
        ISO-8601 strings with a trailing ``"Z"``. ``self.start_date`` and
        ``self.end_date`` are left untouched, so the method can be called
        repeatedly with the same result.

        Returns:
            ``self``, to allow method chaining.
        """
        step = timedelta(days=1)
        windows = []

        # Walk a local cursor rather than advancing self.start_date: mutating
        # the attribute would leave the range empty for any later call.
        window_start = self.start_date
        while window_start < self.end_date:
            # Clamp the final window so it never extends past end_date when
            # the range is not a whole number of days.
            window_end = min(window_start + step, self.end_date)
            # NOTE(review): "Z" is appended unconditionally, so the datetimes
            # are presumably naive values meant as UTC -- confirm.
            windows.append(
                (window_start.isoformat() + "Z", window_end.isoformat() + "Z")
            )
            window_start = window_end

        self.dates = windows
        return self

    def fetch_twitter_data(self, candidate: str, start_date: str, end_date: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Request and preprocess the data for one candidate and one window.

        Args:
            candidate: Query string passed to ``TwitterRequest``.
            start_date: Window start; ``collect_data`` passes the ISO-8601
                strings produced by ``generate_dates``.
            end_date: Window end, same format as ``start_date``.

        Returns:
            The ``(tweets, users)`` pair produced by the ``TwitterRequest``
            pipeline (presumably two DataFrames -- verify against
            ``modules.TwitterRequest``).
        """
        tweets, users = (
            TwitterRequest(
                query=candidate,
                start_time=start_date,
                end_time=end_date,
                max_results=self.max_results
            )
            .request()
            .extract_tweets()
            .extract_users()
            .segregate()
            .preprocess(
                tweets_prefix=self.tweets_prefix,
                users_prefix=self.users_prefix
            )
        )
        return tweets, users

    def process_data_with_gpt(self, tweets: pd.DataFrame, users: pd.DataFrame, candidate: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Add GPT-derived features to the tweets and tag both frames with the candidate.

        Args:
            tweets: Tweets frame handed to ``GPTFeatureExtractor``.
            users: Users frame; it is modified in place (a column is added).
            candidate: Value written to the ``<prefix>candidate`` columns.

        Returns:
            ``(tweets_with_gpt_features, users)``.
        """
        tweets_with_gpt_features = (
            GPTFeatureExtractor(tweets=tweets)
            .preprocess_text()
            .extract_features(prefix=self.tweets_prefix)
        )

        # Record which query produced each row so the combined frames can be
        # split by candidate later.
        tweets_with_gpt_features[f"{self.tweets_prefix}candidate"] = candidate
        users[f"{self.users_prefix}candidate"] = candidate

        return tweets_with_gpt_features, users

    def collect_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Fetch and enrich data for every candidate over every daily window.

        The results are stored in ``self.tweets`` and ``self.users`` and also
        returned. When there are no candidates or the date range is empty,
        two empty DataFrames are returned and no request is made.

        Returns:
            ``(tweets, users)`` with a fresh 0..n-1 index.
        """
        self.generate_dates()

        # pd.concat raises ValueError on an empty list, so short-circuit when
        # there is nothing to fetch.
        if not self.candidates or not self.dates:
            self.tweets = pd.DataFrame()
            self.users = pd.DataFrame()
            return self.tweets, self.users

        tweets_collector, users_collector = [], []

        for candidate in self.candidates:
            dates_tweets_collector, dates_users_collector = [], []

            for window_start, window_end in self.dates:
                tweets, users = self.fetch_twitter_data(candidate, window_start, window_end)
                tweets_with_gpt_features, users = self.process_data_with_gpt(tweets, users, candidate)

                dates_tweets_collector.append(tweets_with_gpt_features)
                dates_users_collector.append(users)

            tweets_collector.append(pd.concat(dates_tweets_collector))
            users_collector.append(pd.concat(dates_users_collector))

        self.tweets = pd.concat(tweets_collector, axis=0, ignore_index=True)
        self.users = pd.concat(users_collector, axis=0, ignore_index=True)

        return self.tweets, self.users

In [13]:
# Arguments are given positionally, in the order of
# TwitterGPTDataProcessor.__init__.
processor = TwitterGPTDataProcessor(
    candidates,
    start_date,
    end_date,
    max_results,
    tweets_prefix,
    users_prefix,
)

# Run the collection for every candidate; also stored on processor.tweets
# and processor.users.
tweets, users = processor.collect_data()

In [14]:
# Write both frames to CSV in the current working directory, without the
# index column.
tweets.to_csv(path_or_buf='tweets_processed.csv', index=False)
users.to_csv(path_or_buf='users_processed.csv', index=False)