<a href="https://colab.research.google.com/github/fromakim/2021Election_Analysis/blob/main/data_retrieve.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Retrieve

대선 주자들에 대한 Twitter Data 수집

## 0. Import Packages

In [None]:
import json
import pprint
from datetime import datetime, timedelta, timezone

import boto3
import requests

In [None]:
import numpy as np
import pandas as pd

## 1. Define Constants

In [None]:
# Presidential candidates whose tweets are collected; each name is used verbatim
# as the search query. Party affiliation, in list order:
#   이재명 - Democratic Party of Korea
#   윤석열 - People Power Party
#   심상정 - Justice Party
#   안철수 - presumably the People's Party (the original note was cut off; confirm)
candidates = ['이재명', '윤석열', '심상정', '안철수']

In [None]:
# boto3 S3 client; the collection loop later in this notebook uses it to upload
# each API response via put_object. No credentials are passed here.
s3 = boto3.client('s3')

## 2. Key Settings

In [None]:
# Load the Twitter API credentials from a local JSON file next to the notebook.
# The `with` block closes the file handle as soon as parsing finishes, instead of
# leaving it open for the lifetime of the kernel.
with open('./key.json') as f:
    keys = json.load(f)

In [None]:
# Pull the individual credentials out of the parsed key file.
# A missing entry raises KeyError right here rather than at request time.
api_key, api_secret_key = keys['api_key'], keys['api_secret_key']
bearer_token, app_id = keys['bearer_token'], keys['app_id']

## 3. Sample Test

In [None]:
# HTTP headers sent with the Twitter API requests in this notebook: the bearer
# token read from the key file, in the standard "Authorization: Bearer ..." form.
headers = {"Authorization" : f"Bearer {bearer_token}"}
# Disabled sample request (recent tweets from @TwitterDev), kept for reference only.
# url = "https://api.twitter.com/2/tweets/search/recent?max_results=100&query=from:TwitterDev"
# response = requests.request("GET", url, headers=headers).json()

In [None]:
# df = pd.DataFrame(response['data'])
# df.head()

Unnamed: 0,id,text
0,1451609611455242241,Join us on Friday October 29th at 2 pm ET for ...
1,1451533812022726666,Introducing the Twitter Developer Insider Prog...
2,1450948615413788677,SET YOUR REMINDER! Join @i_am_daniele on 10/2...
3,1449982379657875460,RT @chizom_: I need Devs and Mental Health exp...


## 4. Twitter API Call Config

In [None]:
# Twitter API v2 recent-search endpoint used for every request in this notebook.
endpoint = 'https://api.twitter.com/2/tweets/search/recent'

# The collection window is "yesterday". The timestamps built from this value are
# written with a trailing "Z" (UTC), so the date must be computed in UTC as well.
# With the local clock, a host ahead of UTC (e.g. KST) could produce a window for
# the wrong day, or one whose end lies in the future, which the API rejects.
yesterday = datetime.now(timezone.utc) - timedelta(days = 1)

In [None]:
# Field selections for the search request. Each list is later joined with commas
# and sent as the value of the request parameter it is named after.

# Related objects to expand in the response.
expansions = [
    'attachments.poll_ids',
    'attachments.media_keys',
    'author_id',
    'entities.mentions.username',
    'geo.place_id',
    'in_reply_to_user_id',
    'referenced_tweets.id',
    'referenced_tweets.id.author_id',
]

# Media object fields.
media_fields = [
    'duration_ms', 'height', 'media_key', 'preview_image_url', 'type',
    'url', 'width', 'public_metrics', 'alt_text',
]

# Place object fields.
place_fields = [
    'contained_within', 'country', 'country_code', 'full_name',
    'geo', 'id', 'name', 'place_type',
]

# Poll object fields.
poll_fields = [
    'duration_minutes', 'end_datetime', 'id', 'options', 'voting_status',
]

# Tweet object fields.
tweet_fields = [
    'attachments', 'author_id', 'context_annotations', 'conversation_id',
    'created_at', 'entities', 'geo', 'id', 'in_reply_to_user_id', 'lang',
    'public_metrics', 'possibly_sensitive', 'referenced_tweets',
    'reply_settings', 'source', 'text', 'withheld',
]

# User object fields.
user_fields = [
    'created_at', 'description', 'entities', 'id', 'location', 'name',
    'pinned_tweet_id', 'profile_image_url', 'protected', 'public_metrics',
    'url', 'username', 'verified', 'withheld',
]

In [None]:
# Query parameters shared by every search request. The search term ('query') is
# intentionally not part of this dict; it is supplied per candidate at request time.
# The time window spans the full day of `yesterday`, written as Z-suffixed timestamps.
params = {
    'start_time': yesterday.strftime("%Y-%m-%dT00:00:00Z"),
    'end_time': yesterday.strftime("%Y-%m-%dT23:59:59Z"),
    'max_results': '100',
}

# Each field selection is sent as one comma-separated string.
params.update({
    'expansions': ','.join(expansions),
    'media.fields': ','.join(media_fields),
    'place.fields': ','.join(place_fields),
    'poll.fields': ','.join(poll_fields),
    'tweet.fields': ','.join(tweet_fields),
    'user.fields': ','.join(user_fields),
})

In [None]:
# Flatten the parameter dict into a raw "key=value&key=value" query-string fragment.
# No percent-encoding is applied; every value is expected to already be a string.
options = '&'.join('='.join(pair) for pair in params.items())

{'end_time': '2021-10-23T23:59:59Z',
 'expansions': 'attachments.poll_ids,attachments.media_keys,author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id',
 'max_results': '10',
 'media.fields': 'duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,alt_text',
 'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type',
 'poll.fields': 'duration_minutes,end_datetime,id,options,voting_status',
 'start_time': '2021-10-23T00:00:00Z',
 'tweet.fields': 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld',
 'user.fields': 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld'}

## 5. Get Data Files

In [None]:
# Upper bound on the number of result pages fetched (and uploaded) per candidate.
MAX_PAGES_PER_CANDIDATE = 40

# Date component of the S3 object keys; identical for every upload in this run.
date_label = yesterday.strftime("%Y-%m-%d")

# For each candidate, page through the search results for the configured time
# window and store every raw response page in S3 under
# <candidate>/<YYYY-MM-DD>/dataNN.json.
for cand in candidates:
    print(f"Collecting {cand}'s tweets")

    # Work on a per-candidate copy of the shared parameters so that a pagination
    # token from one candidate can never leak into the next candidate's first request.
    query_params = dict(params, query=cand)
    # Defensive: drop a token that an earlier run may have left in the shared dict.
    query_params.pop('next_token', None)

    for index in range(MAX_PAGES_PER_CANDIDATE):
        # Let requests build and percent-encode the query string (the query is non-ASCII).
        response = requests.get(endpoint, params=query_params, headers=headers)
        # Fail loudly on HTTP errors (rate limit, bad request, ...) instead of
        # uploading an error payload and then crashing on a missing 'meta' key.
        response.raise_for_status()
        res = response.json()

        s3.put_object(
            Bucket = 'electiondatafile',
            Key = f'{cand}/{date_label}/data{index:02d}.json',
            Body = json.dumps(res),
        )

        # The API includes meta.next_token only while more pages are available.
        next_token = res.get('meta', {}).get('next_token')
        if next_token is None:
            break
        query_params['next_token'] = next_token