# Purpose
This notebook was used to fetch the Twitter ID of US politicians using Wikidata, and then construct a data set of their tweets using the Twitter API.

# Fetch Twitter ID's

In [18]:
import requests

def query_party(party):
    
    # Query Wikidata public endpoint
    # Results need to be paged 10000 at a time
    
    party_dict = {
        'democrats' : 'wd:Q29552',
        'republicans' : 'wd:Q29468'
    }
    
    party_curi = party_dict[party]
    
    query = """
        SELECT DISTINCT ?tid WHERE {
          ?person wdt:P102 %s;
                  wdt:P2002 ?tid.
                  
        }""" % party_curi
    
    params = {
        'query' : query,
        'format' : 'json'
    }
    result = requests.get("https://query.wikidata.org/sparql", params = params)
    
    return result

In [16]:
democrat_request = query_party('democrats')
republican_request = query_party('republicans')

In [13]:
len(democrat_request.json()['results']['bindings']), len(republican_request.json()['results']['bindings'])

(834, 890)

In [15]:
democrat_tids = [row['tid']['value'] for row in democrat_request.json()['results']['bindings']]
republican_tids = [row['tid']['value'] for row in republican_request.json()['results']['bindings']]

In [16]:
democrat_tids[:10]

['AlecBaldwin',
 'GabbyGiffords',
 'fergie',
 'merlehaggard',
 'felicityhuffman',
 'TedTurnerIII',
 'carlysimonhq',
 'madeleine',
 'GRRMspeaking',
 'nancypelosi']

# Fetch Tweets

In [2]:
bearer = ''

In [72]:
# DOC: https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline.html
def request_user_timeline(twitter_id):
    params = {
        'screen_name' : twitter_id,
        'trim_user' : '1',
        'count' : '200',
        'exlucde_replies' : '1',
        'include_rts' : '0',
        'tweet_mode': 'extended'
    }
    headers = {
        'Authorization' : 'Bearer %s' % bearer
    }

    r = requests.get('https://api.twitter.com/1.1/statuses/user_timeline.json', headers=headers, params=params)
    r.raise_for_status()
    
    return [tweet['full_text'] for tweet in r.json()]
        

In [83]:
from tqdm import tqdm_notebook
def fetch_party_tweets(dictionary, tids):
    for i, tid in tqdm_notebook(enumerate(tids)):
        try:
            dictionary[tid] = request_user_timeline(tid)
        except requests.HTTPError as e:
            if e.response.status_code == 429:
                print("Rate Limit Exceeded at index %d" % i)
                raise(e)
            else:
                print("Status Code %d for user %s" % (e.response.status_code, tid))
                continue

In [77]:
democrat_tweets = {}
fetch_party_tweets(democrat_tweets, democrat_tids)

Status Code 404 for user nancypelosi
Status Code 401 for user foofightersdave
Status Code 401 for user RepMikeCapuano
Status Code 404 for user SenBennetCo
Status Code 401 for user RepGeneGreen
Status Code 404 for user whiphoyer
Status Code 404 for user Clyburn
Status Code 404 for user RepCorrineBrown
Status Code 404 for user RepSamFarr
Status Code 404 for user elizabethforma
Status Code 404 for user RepBrady
Status Code 401 for user AmbassadorPower
Status Code 404 for user RepSinema
Status Code 404 for user espyforsenate
Status Code 401 for user RepBecerra
Status Code 404 for user BrightforFCVT
Status Code 404 for user MayorBenMcAdams
Status Code 404 for user RepEsty
Status Code 404 for user repmurphyfl
Status Code 404 for user wfpertharin
Status Code 401 for user epmurphy
Status Code 401 for user Schneiderman
Status Code 401 for user RepEvanJenkins
Status Code 401 for user joemorelle
Status Code 401 for user LorettaLynch
Status Code 404 for user SenatorKelly
Status Code 401 for user A

In [79]:
len(democrat_tweets)

790

In [81]:
democrat_tweets[democrat_tids[600]]

['I had the opportunity to spend the morning with @GovEvers, and many other amazing leaders in education, on the fabulous Northside of Madison at Mendota Elementary School. Investing in our kids is the best investment we can make— hands down!  #PublicSchoolsWork https://t.co/oNxMeYcgk1',
 'Legislators can be mothers too. @EmergeAmerica @EmergeWI @emilyslist @SheShouldRun @RepSargent @ChrisTaylorWI #wisconsin https://t.co/lEvHICfONE',
 'It’s way past time and #Wisconsin should be joining in legalizing #marijuana all the way, for recreational purposes as well  @CapTimes @StevenElbow  https://t.co/6hHnCmeGgP',
 'Each of us can act today to achieve planetary health. As global citizens we must first take responsibility for the state and stewardship of our planet, both through our vote and through direct action. #ClimateChange is bigger than any political moment https://t.co/GniTkMvxKz',
 'I have been &amp; always will be a champion for fair &amp; transparent redistricting process. The peopl

In [84]:
republican_tweets = {}
fetch_party_tweets(republican_tweets, republican_tids)

A Jupyter Widget

Status Code 404 for user RepKevinYoder
Status Code 404 for user RepDaveCamp
Status Code 404 for user RepJoeHeck
Status Code 401 for user RepLynnJenkins
Status Code 401 for user DaveReichert
Status Code 401 for user governorgilmore
Status Code 404 for user RepLoBiondo
Status Code 404 for user CongCulberson
Status Code 401 for user RepWestmoreland
Status Code 404 for user RepErikPaulsen
Status Code 404 for user RepKristiNoem
Status Code 401 for user RepMikePompeo
Status Code 404 for user RepJoePitts
Status Code 401 for user RepJeffDenham
Status Code 401 for user RepBillShuster
Status Code 401 for user BrianSandoval
Status Code 404 for user ohncornyn
Status Code 401 for user govsandoval
Status Code 401 for user usreprodney
Status Code 401 for user USRepRodney
Status Code 401 for user senatorlugar
Status Code 401 for user Favre4Official
Status Code 401 for user reprohrabacher
Status Code 404 for user RepBoustany
Status Code 404 for user repmickmulvaney
Status Code 401 for user peterroskam


In [86]:
len(republican_tweets)

821

In [85]:
republican_tweets[republican_tids[500]]

['It was my honor to vote to re-elect Treasurer David Lillard and Comptroller Justin Wilson to continue their great work. Because of their leadership, Tennessee is one of the top three best financially managed states, and has the third-best funded state pension plan in the nation. https://t.co/ScxPxHNipu',
 'The 2019 legislative session is off to a great start! https://t.co/FEnnWMTDcU',
 'This afternoon, I took the oath of office as the 111th General Assembly was gaveled into session. I am grateful for the continued opportunity to serve in the State Senate and sincerely appreciate the confidence placed in me by the people of Shelby County. https://t.co/JxjdQ0wrwn',
 'I love West Tennessee, but I have to admit East Tennessee is beautiful, too! Amanda, Bella, and I had a great time. #Smokies #TrailDog @TravelGburg @GatlinburgTN https://t.co/LHvypj4ORv',
 'I’ll return to Nashville next week as the 111th General Assembly convenes on Tuesday. Here is a preview of my legislative priorities f

In [87]:
import functools

In [91]:
n_democrat_tweets = functools.reduce(lambda x, y: x + len(y), democrat_tweets.values(), 0)
n_republican_tweets = functools.reduce(lambda x, y: x + len(y), republican_tweets.values(), 0)

In [93]:
print("Democrat Tweets Scraped: %d, made by %d users" % (n_democrat_tweets, len(democrat_tweets)))
print("Republican Tweets Scraped: %d, made by %d users" %( n_republican_tweets, len(republican_tweets)))

Democrat Tweets Scraped: 110805, made by 790 users
Republican Tweets Scraped: 118425, made by 821 users


In [94]:
import pandas as pd

In [99]:
rows = [("democrat", tid, tweet) for tid, tweets in democrat_tweets.items() for tweet in tweets]
rows += [("republican", tid, tweet) for tid, tweets in republican_tweets.items() for tweet in tweets]

In [101]:
df = pd.DataFrame(columns=['party', 'twitter_id', 'tweet'], data=rows)

In [104]:
df.to_csv('data/us_tweets.csv')

# Data Limits
* Max 200 Tweets per user
* No Retweets or replies (Excluded after 200 tweet limit is imposed)

# Additional information
I'd like to separate people who are professional politicians from those that are just members of a political party.

In [110]:
def query_party_politicians(party):
    
    # Query Wikidata public endpoint
    # Results need to be paged 10000 at a time
    
    party_dict = {
        'democrats' : 'wd:Q29552',
        'republicans' : 'wd:Q29468'
    }
    
    party_curi = party_dict[party]
    
    query = """
        SELECT DISTINCT ?tid WHERE {
          ?person wdt:P102 %s;
                  wdt:P2002 ?tid;
                  wdt:P106 wd:Q82955.
                  
        }""" % party_curi
    
    params = {
        'query' : query,
        'format' : 'json'
    }
    result = requests.get("https://query.wikidata.org/sparql", params = params)
    
    return result

In [112]:
democrat_politicians = query_party_politicians('democrats')
republican_politicians = query_party_politicians('republicans')

In [121]:
democrat_politician_tids = set([row['tid']['value'] for row in democrat_politicians.json()['results']['bindings']])
republican_politicians_tid = set([row['tid']['value'] for row in republican_politicians.json()['results']['bindings']])

In [122]:
len(democrat_tids), len(democrat_politician_tids)

(834, 651)

In [123]:
len(republican_tids), len(republican_politicians_tid)

(890, 653)

# Separate Non-Politicians

In [137]:
df_annotated = df.copy()

In [138]:
df_annotated['politician'] = df['twitter_id'].apply(lambda x: x in republican_politicians_tid or x in democrat_politician_tids)

In [140]:
df_politicians = df_annotated[df_annotated['politician']]

df_politicians = df_politicians.drop('politician', 1)

df_politicians.to_csv('data/us_politician_tweets.csv')

# Comment for Future
Since I ended up only caring about professional politicians, it would have been much neater to use this query from the very begining: