# Pulling Twitter Data

In [1]:
import tweepy

In [2]:
# For sending GET requests from the API
import requests

# For saving access tokens and for file management when creating and adding to the dataset
import os

# For dealing with json responses received from the API
import json

# For displaying and managing the data
import pandas as pd

# For saving the data into a CSV format
import csv

# For parsing the dates received from Twitter into readable formats
import datetime as dt
import dateutil.parser
import unicodedata

#To add wait time between requests
import time

## Authentication

Authenticate via `local-api-key.txt`, which is just a file in the root directory of my project that is ignored by git (see the `.gitignore` file).  The contents of this file are just:

```
API_Key: <text of the key>
API_Secret: <text of the secret>
Bearer: <text of the bearer>
```
It turns out that all we need is the Bearer text, so the other two lines are really optional.


In [3]:
# Parse the "Name: value" lines of the local key file into a dict.
creds = {}
with open("../../../local-api-key.txt") as f:
    for line in f:
        # Split on the FIRST colon only, so a value that itself contains ":"
        # is kept intact.
        name, sep, value = line.partition(":")
        if not sep:
            # Blank line (e.g. trailing newline at end of file) or a line
            # with no "Name: value" shape - nothing to record.
            continue
        creds[name.strip()] = value.strip()


In [4]:
# Build the Twitter API v2 client.  Only the bearer token from the key file
# is supplied; the key/secret pair is not used.
client = tweepy.Client(bearer_token=creds["Bearer"])

### Media Account Names and IDs

Create a dictionary mapping each media account's handle to its Twitter ID here.

In [5]:
# Maps account handle -> numeric Twitter user id; add one entry per account.
handle_ids = {
    "crooksandliars": 14513611,
}

## Identifying Candidate Followers

In [None]:
# Helper function to write the follower results incrementally to a file as they are retrieved 
def write_results(account,candidates):
    """Append candidate follower rows to ``./<account>.csv``.

    The header row is written only when the file does not exist yet (or
    exists but is empty), so the function can be called once per page of
    results and still produce a single well-formed CSV.

    account    -- handle used to build the output file name
    candidates -- list of dicts with exactly the keys id, tweet_count,
                  created_at, name, username, location (may be empty)
    """
    file = f"./{account}.csv"
    new = not os.path.exists(file) or os.path.getsize(file) == 0
    # newline="" is what the csv module asks for: without it, platforms that
    # translate "\n" produce blank rows and mangle newlines embedded in
    # quoted fields such as a free-text location.
    with open(file,"a",newline="") as f:
        out = csv.DictWriter(f,["id","tweet_count","created_at","name","username","location"])
        if new:
            out.writeheader()
        out.writerows(candidates)




# The following loop runs over the "handle_ids" dictionary and retrieves followers
# for each account, and then saves them to a file.

# I'm picking people that have been on the site since 2012,1,1 and have at least 500
# posts.  Ultimately, I'm going to screen out people with too many tweets as well - we
# can fill those in later

# Note that this function will give us more than min_candidates (200) followers.  Doing this
# in the hopes we'll find a subset of users that work for the study (i.e. relatively even 
# participation over the decade)

# Reference date for the tenure calculation: an account created
# `desired_tenure` years or more before this date qualifies.
start = dt.datetime(2022,1,1)
min_desired_posts = 500
max_desired_posts = 50000
desired_tenure = 10
min_candidates = 200

for account,account_id in handle_ids.items():
    done = False
    token = None
    candidate_count = 0
    call_count = 2

    # Keep paging through followers until we have enough candidates or the
    # API reports there are no more pages.
    while candidate_count < min_candidates and not done:
        result = client.get_users_followers(account_id,expansions=None,max_results=1000,pagination_token=token,
            user_fields = ["created_at","protected","public_metrics","location"])

        call_count+=1
        #print(f"Results meta: {result.meta}")
        if len(result.errors) > 0:
            #presume rate limit, sleep for 15 min (ugh) and retry the same page
            # NOTE(review): a persistent non-rate-limit error will retry
            # forever here - confirm whether a retry cap is wanted.
            call_count = 0
            print("Sleep for 15 minutes...")
            time.sleep(60 * 15)
            continue

        # The last page carries no "next_token" key, so use .get() rather
        # than indexing.
        token = result.meta.get("next_token")
        if not token:
            done = True

        candidates = []
        # result.data may be None when the page is empty.
        for u in result.data or []:
            if u.protected:
                continue
            tweet_count = u.public_metrics['tweet_count']
            created_at = u.created_at.replace(tzinfo=None)
            tenure_years = (start-created_at).days/365
            if min_desired_posts <= tweet_count <= max_desired_posts and tenure_years >= desired_tenure:
                candidates.append({
                    'id': u.id,
                    'tweet_count': tweet_count,
                    'created_at': created_at,
                    'name': u.name,
                    'username': u.username,
                    'location': u.location,
                })
        candidate_count += len(candidates)
        print(f"Call {call_count} added {len(candidates)} for {candidate_count} total")
        write_results(account,candidates)

        # Pause every 15th call AFTER saving the page just fetched, so a
        # successful response is never thrown away and re-requested.
        if call_count % 15 == 0:
            call_count = 0
            print("Sleep for 15 minutes...")
            time.sleep(60 * 15)
           
                     
                
            
        



# Getting the user's history


In [50]:
# This function takes the response from Twitter (which is split into a main response and an "includes" portion)
# and then merges them together.  It expects the two attributes of the response (data, includes) and returns
# the same list of tweets, with the expansions folded into each tweet's raw json (`tweet.data`).
# You don't need to call this, as it is being called by other routines below.
#
# Note that this is *destructive* (it modifies the original objects) so we'll want to be careful.
# When there is no data, a warning is printed and `data` is handed back unchanged (None or empty),
# so callers must be prepared for a falsy return value.  Still keep track of these cases (see below).

def merge_data(data,includes):
    """Fold the ``includes`` expansions into each tweet's ``.data`` dict.

    data     -- list of tweet objects exposing a raw json dict as ``.data``
                (or None / empty when the page had no tweets)
    includes -- mapping that may hold 'users' and/or 'tweets' lists of
                objects exposing ``.id`` and ``.data`` (or None / empty)

    For every referenced tweet or mentioned user that is present in
    ``includes``, its raw json is stored under an 'expanded' key.  For
    retweets, the text after the leading "RT @user: " prefix is replaced by
    the full text of the retweeted tweet.  Returns ``data`` itself.
    """
    if not data:
        print("Warning: no data to merge!!!")
        # Nothing to iterate over; returning here avoids looping over None.
        return data

    if not includes:
        return data

    user_map = {u.id:u.data for u in includes['users']} if 'users' in includes else {}
    tweet_map = {t.id:t.data for t in includes['tweets']} if 'tweets' in includes else {}

    for tweet in data:
        tweet_obj = tweet.data
        if tweet_map and 'referenced_tweets' in tweet_obj:
            full_text = None
            for ref in tweet_obj['referenced_tweets']:
                # ids are strings in the raw json but the map is keyed by int
                expanded = tweet_map.get(int(ref['id']))
                if expanded is None:
                    continue
                ref['expanded'] = expanded
                if ref['type']=="retweeted":
                    full_text = expanded['text']

            if full_text:
                # Keep the "RT @user" prefix, swap in the untruncated text.
                rt_part = tweet_obj['text'].split(": ")[0]
                tweet_obj['text'] = f"{rt_part}: {full_text}"

        if user_map and 'entities' in tweet_obj and 'mentions' in tweet_obj['entities']:
            for mention in tweet_obj['entities']['mentions']:
                expanded = user_map.get(int(mention['id']))
                if expanded is not None:
                    mention['expanded'] = expanded

    return data


The following is just for writing the user's tweets to a JSON file.  I do a small amount of editing to make sure we end up with a properly formatted file.  I like to keep the raw JSON files around in case we screw up the CSV conversion.

In [58]:
import json

def write_user_data(account,user_id,data,finalize = False):
    """Append one page of tweets to ``./<user_id>_<account>.json``.

    The file is built up incrementally as a single JSON array: the opening
    "[" is written when the file is created, each call appends its tweets,
    and the call with ``finalize=True`` writes the closing "]".

    account  -- handle used in the file name
    user_id  -- user id used in the file name
    data     -- list of tweet objects exposing raw json as ``.data``;
                may be None or empty
    finalize -- pass True on the last page to close the array

    The separating comma is written BEFORE a non-empty page rather than
    after it, so an empty page (in the middle or at the end) no longer leaves
    a dangling or doubled comma.  Note this function does not detect a file
    that was already finalized; appending to one will still corrupt it.
    """
    file = f"./{user_id}_{account}.json"
    existing_size = os.path.getsize(file) if os.path.exists(file) else 0
    with open(file,"a") as f:
        if existing_size == 0:
            f.write("[")
        if data:
            # More than the lone "[" on disk means earlier tweets are already
            # in the array and need a separator before this page.
            if existing_size > 1:
                f.write(",")
            f.write(",".join(json.dumps(tweet.data) for tweet in data))
        if finalize:
            f.write("]")

## Collecting the user data

The following is the loop to collect data from the actual users.  It opens the users file we created before and pulls the ids out of it.  You will want to filter that file to make sure we only have English-speaking folks before we pull their entire history.  So, your first step is to build that logic.  You can base that on the code below.

In [33]:
import math

# Base (in seconds) of the exponential backoff used after a failed call:
# the waits are backoff**1, backoff**2, backoff**3 seconds.
backoff = 10
start = dt.datetime.fromisoformat("2006-03-21 00:00:00") #earliest possible tweet

# These are referencing the indices of the user file; useful to control things if we error out
# and I like to work in batches anyway just to keep an eye on things.  If I were more confident everything
# was working, I'd just let it go.  Perhaps as you grow more confident, you will choose to do that.
# Both bounds are inclusive: users start_idx..end_idx are processed, so the
# next batch should begin at end_idx + 1.
start_idx = 0
end_idx = 50

# This is the 50k cutoff I chose above
max_tweets = 50000

for account in handle_ids:
    users_file =  f"./{account}.csv"
    with open(users_file) as fin:
        users = list(csv.DictReader(fin))

    # Note that we could do a lot more here to check and see which user we left off with (in case something
    # went wrong) but for now we're just going to hope that things run smoothly enough that we can manage
    # it by hand using the start and end index
    for i,u in enumerate(users):
        if i<start_idx:
            continue
        # Check the upper bound BEFORE doing any work, so the batch stops at
        # end_idx even when the users around the boundary are skipped.
        if i>end_idx:
            print("Bailing...")
            break
        if int(u['tweet_count']) > max_tweets:
            print(f"Skipping {u['username']} because they have {u['tweet_count']} tweets")
            continue
        print(f"Processing {i} of {len(users)} : {u['username']} (about {u['tweet_count']} posts)")
        # Twitter API rate limits us at 300 calls per 15 min, which works out to
        # 3 seconds per call; so we'll just sleep for that length of time and catch errors
        # with a backoff

        # Pagination state is per user: a token left over from a user we
        # bailed out of must never be sent with the next user's query.
        token = None
        done = False
        user_tweet_count = 0
        error_count = 0
        #This is just to provide some feedback for (roughly) every 10% of a users tweets recovered
        # (floored at 1 so a zero tweet_count cannot cause a division by zero)
        counter_increment = max(int(u['tweet_count']) / 10, 1)

        while not done:
            # Reset each attempt so a failed call never reuses the previous
            # page's response.
            result = None
            try:
                result = client.search_all_tweets(query=f"from:{u['id']}",max_results=500,next_token = token,
                    start_time = start,
                    place_fields="full_name,country_code",
                    expansions = "referenced_tweets.id,in_reply_to_user_id,entities.mentions.username,referenced_tweets.id.author_id",
                    user_fields = "description",
                    tweet_fields = "text,created_at,conversation_id,entities,in_reply_to_user_id,public_metrics,referenced_tweets")
            except Exception as e:
                # Deliberately broad: any failure of the call is treated as
                # retryable and handled by the backoff below.
                print(f"Error: {e}")

            failed = result is None or (len(result.errors) > 0 and result.meta['result_count'] == 0)
            if failed:
                if result is not None:
                    print(f"{result.errors[0]} and {len(result.errors)-1} others")

                #presume rate limit, retry three times with incremental backoff
                if error_count ==3:
                    print("Too many errors, bailing")
                    # NOTE: the user's json file is left without its closing
                    # "]" in this case and needs attention by hand.
                    break
                error_count+=1
                print(f"Sleep for {(backoff ** error_count)/60} minutes")
                time.sleep(backoff ** error_count)
            else:
                error_count = 0
                token = result.meta.get("next_token")
                if not token:
                    done = True
                data = merge_data(result.data,result.includes)
                if not data:
                    print(f"No data for {u['id']} : {result.meta})")
                write_user_data(account,u["id"],data, done)
                old_count = user_tweet_count
                user_tweet_count+=result.meta["result_count"]
                if math.floor(user_tweet_count/counter_increment) > math.floor(old_count/counter_increment):
                    print(".",end="")
            time.sleep(3)
        print()
    # Only the first account in handle_ids is processed per run.
    break

Processing 284 of 321 : ZachRarick (about 625 posts)
..
Skipping bebe57roy because they have 60753 tweets
Processing 286 of 321 : adacarey (about 7173 posts)
.........
Processing 287 of 321 : charleydeppner (about 6862 posts)
.........
Processing 288 of 321 : EricXWest (about 11763 posts)
.......
Processing 289 of 321 : MadonnaFigura (about 8055 posts)
.........
Skipping shanasshots because they have 137095 tweets
Processing 291 of 321 : CarmeninSD (about 24213 posts)

Processing 292 of 321 : Karenb1038 (about 1884 posts)
....
Processing 293 of 321 : tabbycat79 (about 5067 posts)
.........
Processing 294 of 321 : unabombershack (about 6392 posts)
.........
Processing 295 of 321 : katbanner (about 1796 posts)
....
Processing 296 of 321 : k_macC_ (about 1761 posts)
....
Processing 297 of 321 : LesleyKSmith (about 15145 posts)
.........
Processing 298 of 321 : lmlinflorida (about 1120 posts)
...
Skipping clc202000 because they have 200966 tweets
Skipping sbryt because they have 113947 twe

Once you are done, go ahead and upload the json files to the relevant directories on the [google drive](https://drive.google.com/drive/u/1/folders/18FX2b3edkKJUZU6IafrY2Dd1xzWprDGR).  

### Noting errors here

Keep track of any errors you encounter here, so we can go back and fix things up!

1.  User 25258457 - stopped at "2011-06-17T12:14:21.000Z" but there are another 500 tweets or so.  Need a special query
2.  User 89788214 - Missing the last frame because of an error in merge - last tweet: "2010-04-18T01:22:01.000Z".  Need a special query to get these.
3.  User 150768615 - stopped at "2010-08-22T23:26:13.000Z" - could be the very first tweet for this user?  Unclear how this could happen.
4.  User 65109417 - Screwed up and appended all of these posts again; need to search for the first tweet to repeat and truncate the file
5. Unclear error with 182446931 - no data?  Need to return to this one and try again.
6. Similarly, no data for 72795851
