# Data Scraping

## Twitter Data

Here I scrape tweets posted by @nytimes between 2022-01-01 and mid-April 2022, using three different tools: the Twitter API, twint, and snscrape.

In [None]:
''' Getting tweets from @nytimes using the Twitter API v2.
    Unfortunately, I was only able to get 30 days of tweets using this method.
'''

import requests
import os
import time
import json

# Download the @nytimes timeline page by page from the Twitter API v2 and
# store every raw JSON response in its own file:
#   first page       -> nyt_.json
#   subsequent pages -> nyt_pagination_token=<token>.json
# Requires the BEARER_TOKEN environment variable to hold an API bearer token.
twitterAPI = 'https://api.twitter.com/2/users/'
nytUserID = '807095' # Twitter ID for the New York Times
startTime = '2022-01-01T00:00:00.000Z' # Beginning of the year
endTime = '2022-04-30T00:00:00.000Z' # End of April

# Endpoint without the query string; the query is passed separately through
# `params` so that requests handles the encoding and no stray '&&' is produced.
URL = twitterAPI + nytUserID + '/tweets'

# Query parameters shared by every page request.
# NOTE(review): media.fields usually only takes effect together with
# expansions=attachments.media_keys -- confirm against the API docs before
# relying on media data in the saved files.
baseParams = {
    'max_results': '100',
    'start_time': startTime,
    'end_time': endTime,
    'tweet.fields': 'id,created_at,text,author_id,in_reply_to_user_id,referenced_tweets,attachments,geo,entities,public_metrics,source,context_annotations,conversation_id',
    'media.fields': 'media_key,duration_ms,height,preview_image_url,type,url,width,public_metrics,alt_text',
}

# Read the token once, so a missing BEARER_TOKEN fails before any request is made.
authHeaders = {'Authorization': f'Bearer {os.environ["BEARER_TOKEN"]}'}

# NOTE(review): the v2 user-timeline endpoint is documented as exposing only a
# limited number of the most recent tweets, which presumably explains why the
# full date range cannot be retrieved this way -- confirm.
nextToken = '' # Empty for the first page
while True:
    pageParams = dict(baseParams)
    if nextToken:
        pageParams['pagination_token'] = nextToken

    req = requests.get(URL, headers=authHeaders, params=pageParams, timeout=30)
    # Fail loudly on HTTP errors (bad token, rate limit, ...) instead of
    # saving an error payload to disk as if it were a page of tweets.
    req.raise_for_status()
    resultingJSON = req.json()

    # Same file naming as before: the first page has an empty suffix.
    pagination_token = 'pagination_token=' + nextToken if nextToken else ''
    with open('nyt_' + pagination_token + '.json', 'w', encoding='utf-8') as file:
        json.dump(resultingJSON, file)

    # A missing or empty next_token means this was the last page.
    nextToken = resultingJSON.get('meta', {}).get('next_token', '')
    if not nextToken:
        break

    time.sleep(10) # To keep the number of requests well below the Twitter rate limit

In [None]:
''' Getting tweets from @nytimes using twint.
    Unfortunately, I was only able to get 10 days of tweets using this method.  Moreover, retweets are missing.
'''

import twint
# Search options applied to the twint configuration, in this order.
# NOTE(review): Retweets=True presumably asks twint to include retweets, yet
# this cell's header says they were still missing -- verify against twint docs.
twintSettings = {
    'Username': 'nytimes',             # account to search
    'Since': '2022-01-01',             # earliest date requested
    'Store_json': True,                # write results as JSON ...
    'Output': 'nyt_data_twint.json',   # ... to this file
    'Retweets': True,
}

config = twint.Config()
for optionName, optionValue in twintSettings.items():
    setattr(config, optionName, optionValue)

# Run the search with the configuration built above.
twint.run.Search(config)

In [None]:
''' Getting tweets from @nytimes using snscrape, on command line.
    This method seems to be able to get tweets from Jan 2022, unlike the Twitter API and twint.  However, retweets are missing.
'''

# snscrape --jsonl --progress --since 2022-01-01 twitter-search "from:nytimes until:2022-04-30" >> nyt_twitter_data.json