# Measuring Jason Chaffetz' Legacy on Twitter

By [Sean McMinn](http://www.github.com/shmcminn)

As Utah Republican Rep. Jason Chaffetz prepared to retire on June 30, 2017, Roll Call used this script to parse his most recent tweets, up to the Twitter API cap, then analyze them. Roll Call published the [graphic story](http://www.rollcall.com/news/politics/measuring-chaffetzs-legacy-twitter) using this analysis on June 29.

In [6]:
#!/usr/bin/env python
# encoding: utf-8

import csv
import datetime
import os
import time
from collections import Counter

import requests
import tweepy #https://github.com/tweepy/tweepy
from bs4 import BeautifulSoup

# Twitter API credentials.
# Each value is read from an environment variable so real secrets do not have
# to be saved in the notebook; when the variable is unset the original
# "INSERT_HERE" placeholder is used, exactly as before.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "INSERT_HERE")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "INSERT_HERE")
access_key = os.environ.get("TWITTER_ACCESS_KEY", "INSERT_HERE")
access_secret = os.environ.get("TWITTER_ACCESS_SECRET", "INSERT_HERE")

In [7]:
#function from https://gist.github.com/yanofsky/5436496

def get_all_tweets(screen_name, out_file_name="jasoninthehouse_tweets.csv"):
    """Download a user's recent timeline and save it as a CSV file.

    Pages backwards through the timeline with ``api.user_timeline`` (200
    tweets per request, the maximum allowed count) until a request comes back
    empty, then writes one row per tweet with the columns
    ``id, created_at, text, RT, fav``.

    Twitter only exposes a user's most recent tweets through this method
    (about 3,240 according to the original comment), so older tweets are
    never returned.

    Args:
        screen_name: Twitter handle of the account to download.
        out_file_name: Path of the CSV file to write. Defaults to the file
            name the rest of the notebook reads.

    Returns:
        None. The result is the CSV file written to ``out_file_name``; an
        account with no tweets produces a file containing only the header.
    """
    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # initial request for the most recent tweets
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # list holding every tweepy Tweet collected so far
    alltweets = []
    alltweets.extend(new_tweets)

    # keep grabbing tweets until a request returns nothing; an empty first
    # page skips the loop, so alltweets[-1] is never read on an empty list
    while new_tweets:
        # id of the oldest tweet so far, less one: passing it as max_id makes
        # the next page start strictly before it and prevents duplicates
        oldest = alltweets[-1].id - 1
        print("getting tweets before %s" % (oldest))

        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        alltweets.extend(new_tweets)
        print("...%s tweets downloaded so far" % (len(alltweets)))

    # transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [
        [tweet.id_str, tweet.created_at, tweet.text, tweet.retweet_count, tweet.favorite_count]
        for tweet in alltweets
    ]

    # newline="" hands newline handling to the csv module, so rows are not
    # double-spaced on Windows and line breaks inside tweets are kept intact
    with open(out_file_name, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "text", "RT", "fav"])
        writer.writerows(outtweets)

# Get recent tweets

In [8]:
if __name__ == "__main__":
    # Download the timeline of the account being analyzed.
    get_all_tweets(screen_name="jasoninthehouse")

getting tweets before 836422433443807231
...400 tweets downloaded so far
getting tweets before 804657644606816260
...600 tweets downloaded so far
getting tweets before 769394871601926143
...799 tweets downloaded so far
getting tweets before 740978125165756415
...999 tweets downloaded so far
getting tweets before 715303205580750847
...1199 tweets downloaded so far
getting tweets before 691767066131566591
...1399 tweets downloaded so far
getting tweets before 670034126716559360
...1599 tweets downloaded so far
getting tweets before 643578898274385920
...1798 tweets downloaded so far
getting tweets before 620452746961879039
...1998 tweets downloaded so far
getting tweets before 596076664104189953
...2198 tweets downloaded so far
getting tweets before 570626673584021503
...2398 tweets downloaded so far
getting tweets before 548644361232453631
...2598 tweets downloaded so far
getting tweets before 516262623063457792
...2798 tweets downloaded so far
getting tweets before 489611165535059967
.

# Read local CSV of tweets downloaded

In [9]:
# Load the downloaded tweets from the local CSV as a list of plain dicts,
# one per row, keyed by the column names in the header.
# newline="" leaves newline handling to the csv module so that line breaks
# embedded in quoted tweet text are read back unchanged.

with open('jasoninthehouse_tweets.csv', newline="") as f:
    all_tweets = [dict(row) for row in csv.DictReader(f, skipinitialspace=True)]

# Parse and clean data

Note: for the "weekday" key, 1 = Monday and 7 = Sunday.

In [10]:
# Add derived date/time fields to every tweet and tidy up its text.

for twt in all_tweets:
    # NOTE(review): created_at is shifted back by a fixed 4 hours, which looks
    # like a UTC -> US Eastern daylight time conversion; if so, tweets sent
    # during standard time (UTC-5) come out one hour late. Confirm before
    # relying on the "hour" field.
    time_minus_four = (datetime.datetime.strptime(twt["created_at"], "%Y-%m-%d %H:%M:%S")
                       - datetime.timedelta(hours=4))
    twt["new_date"] = str(time_minus_four.date())
    twt["new_time"] = str(time_minus_four.time())
    twt["hour"] = twt["new_time"][0:2]
    # isoweekday() gives 1 = Monday ... 7 = Sunday directly, replacing the
    # weekday() + 1 arithmetic and its unreachable `is 7` branch.
    twt["weekday"] = time_minus_four.isoweekday()
    # Strip a bytes-literal marker only when it starts the text; removing
    # every occurrence would also eat the "b'" inside words such as "Bob's".
    if twt["text"].startswith(("b'", 'b"')):
        twt["text"] = twt["text"][2:]

In [11]:
# Write the enriched tweet data back over the downloaded CSV.

out_file_name = "jasoninthehouse_tweets.csv"

# Check for data BEFORE opening: mode "w" truncates the file immediately, so
# an empty list would otherwise wipe the CSV and then fail on all_tweets[0].
if all_tweets:
    # newline="" lets the csv module control line endings (no blank rows on
    # Windows, embedded line breaks in tweets preserved).
    with open(out_file_name, "w", newline="") as csvfile:
        # column order follows the keys of the first tweet
        fieldnames = list(all_tweets[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_tweets)

# Find most popular tweets

## Most retweets

In [18]:
# Ten most-retweeted tweets, highest retweet count first.
# Caution when analyzing: some of these are other accounts' tweets that
# Chaffetz retweeted. A filter dropping tweets whose text starts with "RT"
# could be used to remove them.

by_retweets = list(all_tweets)
by_retweets.sort(key=lambda tweet: int(tweet['RT']), reverse=True)
most_rts = by_retweets[:10]

most_rts

[{'RT': '41327',
  'created_at': '2017-06-15 02:00:34',
  'fav': '0',
  'hour': '22',
  'id': '875171141605785600',
  'new_date': '2017-06-14',
  'new_time': '22:00:34',
  'text': 'RT @realDonaldTrump: Just left hospital. Rep. Steve Scalise, one of the truly great people, is in very tough shape - but he is a real fight…',
  'weekday': 3},
 {'RT': '21829',
  'created_at': '2016-10-28 16:57:17',
  'fav': '29685',
  'hour': '12',
  'id': '792047597040971776',
  'new_date': '2016-10-28',
  'new_time': '12:57:17',
  'text': 'FBI Dir just informed me, "The FBI has learned of the existence of emails that appear to be pertinent to the investigation." Case reopened',
  'weekday': 5},
 {'RT': '11543',
  'created_at': '2016-11-06 20:15:36',
  'fav': '10758',
  'hour': '16',
  'id': '795358997083586560',
  'new_date': '2016-11-06',
  'new_time': '16:15:36',
  'text': 'FBI Dir just informed us "Based on our review, we have not changed our conclusions that we expressed in July with respect to Sec Cl

## Tweets with most likes

In [19]:
# Ten most-liked tweets, highest favorite count first.

by_favorites = list(all_tweets)
by_favorites.sort(key=lambda tweet: int(tweet['fav']), reverse=True)
most_favs = by_favorites[:10]

most_favs

[{'RT': '21829',
  'created_at': '2016-10-28 16:57:17',
  'fav': '29685',
  'hour': '12',
  'id': '792047597040971776',
  'new_date': '2016-10-28',
  'new_time': '12:57:17',
  'text': 'FBI Dir just informed me, "The FBI has learned of the existence of emails that appear to be pertinent to the investigation." Case reopened',
  'weekday': 5},
 {'RT': '6162',
  'created_at': '2017-05-16 22:58:45',
  'fav': '14279',
  'hour': '18',
  'id': '864616135466950656',
  'new_date': '2017-05-16',
  'new_time': '18:58:45',
  'text': '.@GOPoversight is going to get the Comey memo, if it exists. I need to see it sooner rather than later. I have my subpoena pen ready.',
  'weekday': 2},
 {'RT': '11543',
  'created_at': '2016-11-06 20:15:36',
  'fav': '10758',
  'hour': '16',
  'id': '795358997083586560',
  'new_date': '2016-11-06',
  'new_time': '16:15:36',
  'text': 'FBI Dir just informed us "Based on our review, we have not changed our conclusions that we expressed in July with respect to Sec Clinto

# Find out what he's tweeting about

In [21]:
# Word-frequency counts over the text of every tweet.

# Build the corpus with one join instead of concatenating inside a loop
# (which re-copies the growing string each time). Every text keeps its
# leading space, so the resulting string is identical.
all_words = "".join(" " + twt["text"] for twt in all_tweets)

# Every lower-cased word with its count, most frequent first. Colons are
# removed before splitting, so a token like "@user:" is counted as "@user".
words_freq = Counter(all_words.lower().replace(":", "").split()).most_common()

# The 200 most frequent words. Colons are NOT removed here, unlike words_freq.
most_used_words = Counter(all_words.lower().split()).most_common(200)

## Get most used hashtags

In [22]:
# get most used hashtags

# Every hashtag with its count, in the same most-frequent-first order as words_freq.
hashtags_freq = [word for word in words_freq if word[0].startswith("#")]

# Hashtags used more than 3 times.
most_used_hashtags = [word for word in hashtags_freq if word[1] > 3]

# fix utah: "#utah…" is "#utah" cut off by tweet truncation, so fold its count
# into "#utah". The count defaults to 0 when no truncated entry is present
# (previously the name was left undefined), and the entry is filtered out
# rather than popped from the list while that list is being iterated.
add_utah_num = sum(ht[1] for ht in most_used_hashtags if ht[0] == "#utah…")
most_used_hashtags = [ht for ht in most_used_hashtags if ht[0] != "#utah…"]

# "#utah" keeps its position in the list; only its count changes.
most_used_hashtags = [
    ("#utah", ht[1] + add_utah_num) if ht[0] == "#utah" else ht
    for ht in most_used_hashtags
]

for ht in most_used_hashtags:
    print(ht[0] + " - " + str(ht[1]))

#benghazi - 38
#tbt - 35
#utah - 34
#utpol - 29
#irs - 26
#tcot - 20
#fastandfurious - 14
#ut3rddistrict - 14
#irsfail - 11
#usa - 9
#secretservice - 8
#neverforget - 7
#breaking - 7
#emerycounty - 7
#nomidnightmonument - 6
#epa - 6
#nature - 5
#july4 - 5
#americanforkcanyon - 4
#taxday - 4
#oneluckyguy - 4
#plannedparenthood - 4
#byufootball - 4
#gopoversight - 4
#natgeo - 4
#sxsw - 4


## Get users he tweets @ the most

In [23]:
# Accounts mentioned most often.

# Every @-mention with its count; a bare "@" token is not a mention.
ats_freq = [
    entry for entry in words_freq
    if entry[0].startswith("@") and entry[0] != "@"
]

# Mentions that occur more than 14 times.
most_used_ats = [entry for entry in ats_freq if entry[1] > 14]

for handle, count in most_used_ats:
    print(handle + " - " + str(count))

@gopoversight - 505
@foxnews - 166
@youtube - 51
@jasoninthehouse - 49
@politico - 41
@kslnewsradio - 38
@cnn - 37
@sltrib - 31
@speakerryan - 30
@usatoday - 27
@wsj - 25
@washingtonpost - 24
@greta - 23
@abc - 22
@kslcom - 21
@dailycaller - 21
@dougwrightshow - 19
@tgowdysc - 16
@wolfblitzer - 15
@speakerboehner - 15


# Get Twitter data on other members who entered House at same time as Chaffetz ('09)

We used this to see if Chaffetz was the most active among his classmates on Twitter. He was.

Twitter usernames compiled by [congress-legislators](https://github.com/unitedstates/congress-legislators)

In [24]:
# scrape twitter vitals for chaffetz' class in House
# For each member, fetch the public profile page and collect the stats found
# in its "ProfileNav-stat" links into one dict per member.


member_uns = ["JasonInTheHouse",
"RepMikeCoffman",
"RepGuthrie",
"GreggHarper",
"Rep_Hunter",
"RepLynnJenkins",
"RepLanceNJ7",
"RepBlainePress",
"RepMcClintock",
"RepPeteOlson",
"RepErikPaulsen",
"CongBillPosey",
"DrPhilRoe",
"RepRooney",
"CongressmanGT"]

class_data = []

url_prefix = "http://www.twitter.com/"

for un in member_uns:
    # A timeout keeps a single unresponsive request from hanging the whole run.
    resp = requests.get(url_prefix + un, timeout=30)
    html = resp.text
    soup = BeautifulSoup(html,"html.parser")
    vitals = soup.find_all("a", {"class":"ProfileNav-stat"})
    d = {}
    for vital in vitals:
        spans = vital.find_all("span")
        # The first span is used as the stat's label and the third as its
        # value; a link with fewer spans has nothing to read, so skip it
        # instead of failing with an IndexError.
        if len(spans) < 3:
            continue
        d[spans[0].getText()] = spans[2].getText()
    d["un"] = un
    class_data.append(d)
    print("finished " + un)
    time.sleep(2) #seconds between requests
    
# Clean the scraped stats and write them to a CSV, one row per member.

# [index, key count] of the member dict with the most keys
longest_vital = [0,0]

for ind, member in enumerate(class_data):
    # A profile that yielded no stats has no "Tweets" key; leave it as is
    # rather than failing with a KeyError.
    if "Tweets" in member:
        member["Tweets"] = member["Tweets"].replace("\n","").strip()
    if len(member.keys()) > longest_vital[1]:
        longest_vital[0] = ind
        longest_vital[1] = len(member.keys())


out_file_name = "class_09_twitter_vitals.csv"

# Header columns: the keys of the member with the most stats come first (the
# original column order), followed by any key that only other members have.
# DictWriter raises ValueError for a row key missing from fieldnames.
fieldnames = list(class_data[longest_vital[0]].keys()) if class_data else []
for member in class_data:
    for key in member:
        if key not in fieldnames:
            fieldnames.append(key)

# newline="" lets the csv module control line endings.
with open(out_file_name, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames = fieldnames)
    writer.writeheader()
    writer.writerows(class_data)

finished JasonInTheHouse
finished RepMikeCoffman
finished RepGuthrie
finished GreggHarper
finished Rep_Hunter
finished RepLynnJenkins
finished RepLanceNJ7
finished RepBlainePress
finished RepMcClintock
finished RepPeteOlson
finished RepErikPaulsen
finished CongBillPosey
finished DrPhilRoe
finished RepRooney
finished CongressmanGT
