# Extracting tweets

## Twitter API

In [1]:
from twitter.oauth import write_token_file, read_token_file
from twitter.oauth_dance import oauth_dance
import os
import twitter
from credentials import *

In [2]:
# Cache the OAuth access token on disk so the interactive browser/PIN flow only
# runs the first time; later runs (e.g. Restart & Run All) reuse the saved token.
# Delete TOKEN_FILE to force a fresh authorization.
TOKEN_FILE = os.path.expanduser('~/.twitter_oauth_token')
if not os.path.exists(TOKEN_FILE):
    # When given a filename, oauth_dance stores the token pair in that file.
    oauth_dance(APP_NAME, CONSUMER_KEY, CONSUMER_SECRET, TOKEN_FILE)
token, token_secret = read_token_file(TOKEN_FILE)

# API client for the v1.1 endpoints, authenticated with the user token and
# the application's consumer credentials.
app = twitter.Twitter(domain='api.twitter.com', api_version='1.1',
                      auth=twitter.oauth.OAuth(token, token_secret,
                                               CONSUMER_KEY, CONSUMER_SECRET))

Hi there! We're gonna get you all set up to use VoterLitmus.

In the web browser window that opens please choose to Allow
access. Copy the PIN number that appears on the next page and paste or
type it here:

Opening: https://api.twitter.com/oauth/authorize?oauth_token=eXjdWAAAAAAA0TOZAAABW-sD3XA

Please enter the PIN: 1444421


In [3]:
# Search for English tweets matching "Trump", asking for 100 results.
# Alternative keyword filters tried earlier (kept for reference):
#   since="2016-11-15"
#   max_id=856600000000000000
search_params = dict(q="Trump", count=100, lang='en')
search_results = app.search.tweets(**search_params)
statuses = search_results['statuses']
print(len(statuses))

100


In [4]:
# Display the 'user' entry of the first returned status.
statuses[0]['user']

{u'contributors_enabled': False,
 u'created_at': u'Fri Jan 18 01:59:17 +0000 2013',
 u'default_profile': True,
 u'default_profile_image': False,
 u'description': u'',
 u'entities': {u'description': {u'urls': []}},
 u'favourites_count': 39063,
 u'follow_request_sent': False,
 u'followers_count': 596,
 u'following': False,
 u'friends_count': 1001,
 u'geo_enabled': True,
 u'has_extended_profile': False,
 u'id': 1099771423,
 u'id_str': u'1099771423',
 u'is_translation_enabled': False,
 u'is_translator': False,
 u'lang': u'en',
 u'listed_count': 46,
 u'location': u'',
 u'name': u'illegalPresidentDT!!',
 u'notifications': False,
 u'profile_background_color': u'C0DEED',
 u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme1/bg.png',
 u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme1/bg.png',
 u'profile_background_tile': False,
 u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/1099771423/1475349037',
 u'profile_image_url': 

## API-less way

In [5]:
from bs4 import BeautifulSoup
import requests
import json

In [6]:
# HTTP headers sent with each scraping request; the user-agent string
# identifies as Firefox 54 on macOS 10.12. benchmark_params reads this
# module-level name directly.
headers = {
    'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0"
}

# Module-level location ("City, State") and tweet ID.
# NOTE(review): max_id is reassigned in later cells; geo does not appear to be
# read at module level anywhere visible -- confirm whether it is still needed.
geo = "New York, NY"
max_id = 300252000000000000

def make_url(geo, max_id, opt_query=""):
    """Build the timeline-search URL for tweets near a location.

    geo: location as "City, State"; placed inside a near:"..." search term
    max_id: integer tweet ID used to build the max_position cursor
        (TWEET-<max_id>-<max_id + 1>-<scroll token>)
    opt_query: optional extra search terms such as since/until; when
        non-empty it is appended after the near term, separated by a space

    Returns the URL as a string. The parts are concatenated as-is; no
    percent-encoding is done here.
    """
    # Fixed cursor suffix used to simulate infinite scrolling.
    scroll_token = "BD1UO2FFu9QAAAAAAAAETAAAAA"

    extra_terms = " " + opt_query if opt_query else opt_query

    query_part = "&q=near:\"{}\"{}".format(geo, extra_terms)
    position_part = "&max_position=TWEET-{}-{}-{}".format(
        max_id, max_id + 1, scroll_token)

    return "".join([
        "https://twitter.com/i/search/timeline?f=realtime",
        query_part,
        position_part,
        "&src=typd&include_entities=1",
    ])

In [7]:
from __future__ import division
from collections import Counter

In [8]:
# content = requests.get(url, headers=headers).json()

# ID_STEPSIZE = 200000000000
# incr = 50000000000
# id_steps = [ID_STEPSIZE + incr * i for i in range(4)]
# max_id = 300252000000000000
# city = "New York, NY"


def benchmark_params(query, max_id, scale_factors=(1,5), nsteps=4, nsets=4):
    """Report how many duplicate tweets are fetched for a range of ID step sizes.

    For every candidate step size, up to `nsets` timeline pages are requested.
    After each page the cursor is moved to the largest tweet ID on that page
    plus the step size. One summary line per step size is printed with the
    number and percentage of repeated tweet IDs.

    query: a location string ("City, State"), or a (location, optional_query)
        pair; both parts are passed on to make_url
    max_id: tweet ID used for the first request. The cursor is carried over
        from one step size to the next; it is not reset between steps.
    scale_factors: (f1, f2); the first step size is 100000000000 * f1 and each
        following one grows by 10000000000 * f2
    nsteps: number of step sizes to try
    nsets: maximum number of pages requested per step size

    Returns the list of tweet <div> elements collected for the LAST step size
    only (an empty list if nothing was fetched or nsteps is 0).
    """
    # Anything that is not already a (location, optional_query) pair is treated
    # as a bare location; this also covers Python 2 unicode strings.
    if not isinstance(query, (tuple, list)):
        query = (query, "")
    location, opt_query = query[0], query[1]

    f1, f2 = scale_factors
    id_range = int(100000000000 * f1)
    increment = int(10000000000 * f2)

    id_steps = [id_range + increment * i for i in range(nsteps)]

    # Defined up front so the return below works even when nsteps == 0.
    all_tweets = []

    for si, step in enumerate(id_steps):

        all_tweets = []
        all_tweet_ids = []

        for _ in range(nsets):
            url = make_url(geo=location, max_id=max_id, opt_query=opt_query)

            req = requests.get(url, headers=headers)
            json_text = json.loads(req.text)
            soup = BeautifulSoup(json_text['items_html'], "html5lib")

            tweets = soup.find_all('div', attrs={'class': 'tweet'})
            if not tweets:
                # An empty page gives no ID to advance the cursor from, so
                # repeating the request would fetch the same empty page again.
                break

            # process tweets and insert data into database here and remove all_tweets
            all_tweets.extend(tweets)

            tweet_ids = [tw['data-tweet-id'] for tw in tweets]
            all_tweet_ids.extend(tweet_ids)

            # Next page starts `step` above the largest ID seen on this page.
            max_id = max(map(int, tweet_ids)) + step

        n_all = len(all_tweet_ids)
        n_overlaps = n_all - len(set(all_tweet_ids))
        # Float arithmetic keeps the percentage correct even without
        # `from __future__ import division`; guard against an empty result.
        pct_overlaps = 100.0 * n_overlaps / n_all if n_all else 0.0

        if si == 0:
            print("City: " + location)

        print("==> Step size: {}: {} overlaps out of {} tweets ({:.2f}%)".format(
            step, n_overlaps, n_all, pct_overlaps))

    return all_tweets

## Some experiments

`data-tweet-id` is a globally unique identifier for a tweet. The more populated a city is, the more tweets it generates. For less populated cities we therefore need a big enough ID range to avoid too many overlaps.

### Identifying location
* Read this paper: [Home Location Identification of Twitter Users](https://arxiv.org/abs/1403.2345)

### To-do
* Plot Tweet ID vs. timestamp so we (approximately) know which id we should be looking at for the desired time range.
* Consider varying lower and upper ID bounds depending on the population and time of the day.

In [9]:
# Benchmark Columbus, OH with five step sizes (600000000000 upward in
# increments of 50000000000) and five pages per step size.
max_id = 274000000000000000  # 2012-11
twts = benchmark_params("Columbus, OH", max_id,
                        scale_factors=(6, 5), nsteps=5, nsets=5)

City: Columbus, OH
==> Step size: 600000000000: 13 overlaps out of 94 tweets (13.83%)
==> Step size: 650000000000: 3 overlaps out of 87 tweets (3.45%)
==> Step size: 700000000000: 0 overlaps out of 94 tweets (0.00%)
==> Step size: 750000000000: 0 overlaps out of 94 tweets (0.00%)
==> Step size: 800000000000: 0 overlaps out of 96 tweets (0.00%)


In [10]:
# NOTE(review): the query_tweets calls in the next cell pass literal IDs, so
# this value appears unused there. The IDs printed by those calls
# (26305... to 26309...) are dated 29 Oct 2012, so this ID is probably late
# Oct 2012 rather than 2012-11 -- confirm.
max_id = 263110000000000000  # 2012-11

def query_tweets(city, max_id, nbatch):
    """Fetch tweets near `city` starting at `max_id` and print the ID span covered.

    city: location string ("City, State") handed to benchmark_params
    max_id: tweet ID the first request starts from
    nbatch: number of pages to request

    Prints the (tweet ID, timestamp title) pairs for the lowest and the
    highest tweet ID fetched (nothing is printed when no tweets came back),
    then returns the tweet elements produced by benchmark_params.
    """
    # One fixed step size of 1000000000000 (scale factors (10, 0), a single
    # step); nbatch controls how many pages are requested at that step size.
    twts = benchmark_params(city, max_id, (10, 0), 1, nbatch)
    id_time_tups = [(int(t['data-tweet-id']),
                     t.find('a', attrs={'class': 'tweet-timestamp'})['title'])
                    for t in twts]
    sorted_tups = sorted(id_time_tups, key=lambda x: x[0])
    if sorted_tups:
        print(sorted_tups[0])
        print(sorted_tups[-1])
    return twts

In [11]:
# Query Columbus, OH from four starting IDs spaced 10000000000000 apart.
# Each entry of twts_cum is the list returned by one query_tweets call.
start_ids = (263060000000000000, 263070000000000000,
             263080000000000000, 263090000000000000)
twts_cum = [query_tweets("Columbus, OH", start_id, 10) for start_id in start_ids]

City: Columbus, OH
==> Step size: 1000000000000: 16 overlaps out of 199 tweets (8.04%)
(263058610525974530, u'4:23 PM - 29 Oct 2012')
(263068399150710784, u'5:02 PM - 29 Oct 2012')
City: Columbus, OH
==> Step size: 1000000000000: 33 overlaps out of 181 tweets (18.23%)
(263069109284126720, u'5:05 PM - 29 Oct 2012')
(263078516386570240, u'5:43 PM - 29 Oct 2012')
City: Columbus, OH
==> Step size: 1000000000000: 2 overlaps out of 182 tweets (1.10%)
(263079190230880256, u'5:45 PM - 29 Oct 2012')
(263088597874397184, u'6:23 PM - 29 Oct 2012')
City: Columbus, OH
==> Step size: 1000000000000: 2 overlaps out of 177 tweets (1.13%)
(263089387032674304, u'6:26 PM - 29 Oct 2012')
(263098644004216833, u'7:02 PM - 29 Oct 2012')


In [12]:
# Pair every tweet element with its integer ID, across all collected batches.
flattened = [(int(tweet['data-tweet-id']), tweet)
             for batch in twts_cum
             for tweet in batch]

# Keep only the first occurrence of each tweet ID, preserving encounter order.
seen = set()
unique = []
for tweet_id, tweet in flattened:
    if tweet_id in seen:
        continue
    seen.add(tweet_id)
    unique.append((tweet_id, tweet))

# Order the unique tweets by ascending ID and show the earliest one.
srted = list(unique)
srted.sort(key=lambda pair: pair[0])
srted[0]

(263058610525974530,
 <div class="tweet js-stream-tweet js-actionable-tweet js-profile-popup-actionable dismissible-content original-tweet js-original-tweet " data-component-context="tweet" data-conversation-id="263058610525974530" data-disclosure-type="" data-follows-you="false" data-item-id="263058610525974530" data-name="Ryan Sefcik" data-permalink-path="/RyanSefcik/status/263058610525974530" data-reply-to-users-json='[{"id_str":"355165726","screen_name":"RyanSefcik","name":"Ryan Sefcik","emojified_name":{"text":"Ryan Sefcik","emojified_text_as_html":"Ryan Sefcik"}}]' data-screen-name="RyanSefcik" data-tweet-id="263058610525974530" data-tweet-nonce="263058610525974530-8deda78b-baaf-4745-87ae-4e8173a71b30" data-tweet-stat-initialized="true" data-user-id="355165726" data-you-block="false" data-you-follow="false">\n\n    <div class="context">\n      \n      \n    </div>\n\n    <div class="content">\n      \n\n      \n\n      \n      <div class="stream-item-header">\n          <a class=

In [None]:
# regenerate tweet timeline
# Write the prettified HTML of each entry in srted to test.html, one after
# another separated by newlines. The file is opened in binary mode because
# every chunk is UTF-8 encoded bytes; joining bytes with a text separator
# (or writing them to a text-mode file) fails on Python 3.
with open('test.html', 'wb') as f:
    f.write(b'\n'.join(x[1].prettify().encode('utf-8') for x in srted))

In [None]:
# benchmark_params("Los Angeles, CA", max_id, (3, 2))
# # test optional arguments
# twts = benchmark_params(("Los Angeles, CA", "until:\"2013-01-23\""), max_id, (1, 1), 1, 1)
# print('\n'.join([t.find('a', attrs={'class': 'tweet-timestamp'})['title'] for t in twts]))