## Scraping Twitter API

In [36]:
%matplotlib inline

import oauth2
from twython import Twython
import simplejson
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import sys

import unittest, time, re

## API Method

The API method works well, but it only gives us very recent Twitter data. Below is an example of the type of code we would use to interact with the API.

In [8]:
# Example only -- everything in this cell is intentionally commented out.
# SECURITY: credentials must never be hard-coded in the notebook. The literal
# keys/tokens that used to live here were removed; any credential that was ever
# committed should be revoked and regenerated. Read them from the environment.
#import os
#APP_KEY = os.environ["TWITTER_APP_KEY"]
#APP_SECRET = os.environ["TWITTER_APP_SECRET"]

#access_token = os.environ["TWITTER_ACCESS_TOKEN"]
#access_secret = os.environ["TWITTER_ACCESS_SECRET"]

#twitter = Twython(APP_KEY, APP_SECRET, access_token, access_secret)
# (The continuation line below is commented out as well: a trailing backslash
#  inside a comment does not continue the comment, so leaving it bare raised
#  an IndentationError when the cell was run.)
#search_results = oauth_req('https://api.twitter.com/1.1/statuses/home_timeline.json', \
#                          access_token, access_secret)

#for tweet in search_results["statuses"]:
#    print tweet["text"]

#Define Twitter GET function using OAUTH2
#Function from https://dev.twitter.com/oauth/overview/single-user
#def oauth_req(url, key, secret, http_method="GET", post_body="", http_headers=None):
#    consumer = oauth2.Consumer(key=APP_KEY, secret=APP_SECRET)
#    token = oauth2.Token(key=key, secret=secret)
#    client = oauth2.Client(consumer, token)
#    resp, content = client.request( url, method=http_method, body=post_body, headers=http_headers )
#    return content

## Manual Web Scrape of Twitter

Manual Scraping of Twitter presents two challenges:

1) Twitter uses JavaScript for interactive webpage scrolling. If a search produces multiple results, once a reader gets to the end of a page, instead of being prompted with a "Next Page" link, Twitter automatically queries its JSON backend and dynamically loads the page.

To work around this issue, we use the package Selenium, which mimics "scrolling" the webpage for us. After scrolling through a set number of pages, we extract the HTML from the page, as sufficient XHR requests have been made by Twitter.

2) Manual page data is not in nice JSON format, so we must use html parsing to get at the data.

Since we are interested in the positive/negative vibes of a tweet, we use Twitter's sentiment analysis in our search queries for a particular contestant. Then all we need to do is count the number of tweet tags that we scraped.


### Step 1: Create Function to Scrape Twitter

In [43]:
#We borrow heavily from http://stackoverflow.com/questions/12519074/scrape-websites-with-infinite-scrolling
def scrape_page(since, until, is_happy, contestant,
                base_url="https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor",
                pages_to_scroll=3,
                driver_path='/Users/dcusworth/chrome_driver/chromedriver',
                scroll_pause=4):
    """Load a Twitter search page in Chrome, scroll it, and return its HTML.

    The search URL is ``base_url`` followed by the contestant name, a
    sentiment emoticon (":)" or ":("), the ``since:``/``until:`` date
    filters and a fixed ``&src=typd`` suffix. The URL is printed before the
    browser navigates to it.

    Parameters
    ----------
    since, until : str or date-like
        Bounds of the search window; each is passed through ``str()`` and
        appended after ``since:`` / ``until:`` respectively.
    is_happy : bool
        If true, search with the ":)" emoticon, otherwise with ":(".
    contestant : str
        Name appended to the query as-is (it is not URL-escaped here).
    base_url : str, optional
        Search URL prefix, including the hashtag part of the query.
    pages_to_scroll : int, optional
        The page is scrolled to the bottom ``pages_to_scroll - 1`` times;
        the initial page load is treated as the first "page".
    driver_path : str, optional
        Location of the ChromeDriver executable. Download it from
        https://sites.google.com/a/chromium.org/chromedriver/downloads
    scroll_pause : int or float, optional
        Seconds to sleep after each scroll so newly requested results can
        load.

    Returns
    -------
    The page source after scrolling, encoded as UTF-8.
    """
    #Create URL that will get the text
    sentiment = "%20%3A)" if is_happy else "%20%3A("
    final_url = (base_url
                 + "%20" + contestant
                 + sentiment
                 + "%20since%3A" + str(since)
                 + "%20until%3A" + str(until)
                 + "&src=typd")
    print(final_url)

    #### Initiate Chrome Browser #######
    driver = webdriver.Chrome(driver_path)
    try:
        driver.implicitly_wait(30)

        #Jump onto the webpage and scroll down
        driver.get(final_url)
        for _ in range(1, pages_to_scroll):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)

        #After scrolling enough times, get the text of the page
        html_source = driver.page_source
    finally:
        # Always shut the browser down, even if navigation or scrolling
        # raised; otherwise every failed scrape leaves a Chrome process behind.
        driver.quit()

    return html_source.encode('utf-8')



### Step 2: Retrieve Data

We load in scraped Wikipedia Data that gives us a contestant's name and dates they appeared on the Bachelor. For each season/contestant pair, we create a dataframe of episode date, positive tweets, and negative tweets.

In [44]:
def count_tweets(html, css_class="TweetTextSize"):
    """Return the number of <p> tags carrying ``css_class`` in ``html``.

    ``html`` is the page source returned by ``scrape_page``; it is parsed
    with BeautifulSoup's built-in "html.parser".
    """
    soup = BeautifulSoup(html, "html.parser")
    return len(soup.find_all("p", attrs={"class": css_class}))


#Start with a test - Season 16, contestant Emily
contestant = "Emily"
episode_dates = ['2012-01-02', '2012-01-09', '2012-01-16', '2012-01-23', '2012-01-30', '2012-02-06']

dats = [datetime.datetime.strptime(idate, '%Y-%m-%d') for idate in episode_dates]

# Maps episode date string -> {contestant: {"happy": count, "sad": count}}
result_dict = {}
for run_date in dats:
    #Make time range: from the day before the episode to two days after it
    since = (run_date + datetime.timedelta(days=-1)).strftime('%Y-%m-%d')
    until = (run_date + datetime.timedelta(days=2)).strftime('%Y-%m-%d')

    #Find all positive tweets, then all sad tweets, over the same window
    happy_tweets = count_tweets(
        scrape_page(since=since, until=until, is_happy=True, contestant=contestant))
    sad_tweets = count_tweets(
        scrape_page(since=since, until=until, is_happy=False, contestant=contestant))

    #Save the results to a dictionary
    result_dict[run_date.strftime('%Y-%m-%d')] = {
        contestant: {"happy": happy_tweets, "sad": sad_tweets}
    }

print(result_dict)

https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Emily%20%3A)%20since%3A2012-01-01%20until%3A2012-01-04&src=typd
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Emily%20%3A(%20since%3A2012-01-01%20until%3A2012-01-04&src=typd
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Emily%20%3A)%20since%3A2012-01-08%20until%3A2012-01-11&src=typd
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Emily%20%3A(%20since%3A2012-01-08%20until%3A2012-01-11&src=typd
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Emily%20%3A)%20since%3A2012-01-15%20until%3A2012-01-18&src=typd
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Emily%20%3A(%20since%3A2012-01-15%20until%3A2012-01-18&src=typd
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Emily%20%3A)%20since%3A2012-01-22%20until%3A2012-01-25&src=typd
https://twitter.com/search?f=tweets&vertical=def