## Scraping Twitter 

In [1]:
%matplotlib inline

import oauth2
from twython import Twython
import simplejson
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests
import datetime
import json


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import sys

import unittest, time, re

## API Method

The API method is good, but it only gives us very recent Twitter data. Below is an example of the type of code we would use to interact with the API.

In [8]:
#NOTE(review): the API credentials that were hard-coded in this cell have been redacted.
#Secrets should never be stored in a notebook: treat the original values as leaked, revoke
#them, and load replacements from the environment (e.g. os.environ["TWITTER_APP_KEY"]).
#APP_KEY = "<REDACTED>"
#APP_SECRET = "<REDACTED>"

#access_token = '<REDACTED>'
#access_secret = '<REDACTED>'

#twitter = Twython(APP_KEY, APP_SECRET, access_token, access_secret)
#search_results = oauth_req('https://api.twitter.com/1.1/statuses/home_timeline.json', \
#                          access_token, access_secret)

#for tweet in search_results["statuses"]:
#    print tweet["text"]

#Define Twitter GET function using OAUTH2
#Function from https://dev.twitter.com/oauth/overview/single-user
#NOTE(review): if this example is re-enabled, oauth_req must be defined before the call above,
#and it returns the raw response content, which would need JSON-decoding before indexing.
#def oauth_req(url, key, secret, http_method="GET", post_body="", http_headers=None):
#    consumer = oauth2.Consumer(key=APP_KEY, secret=APP_SECRET)
#    token = oauth2.Token(key=key, secret=secret)
#    client = oauth2.Client(consumer, token)
#    resp, content = client.request( url, method=http_method, body=post_body, headers=http_headers )
#    return content

## Manual Web Scrape of Twitter

Manual Scraping of Twitter presents two challenges:

1) Twitter uses JavaScript for interactive webpage scrolling. If a search produces multiple results, once a reader gets to the end of a page, instead of being prompted with a "Next Page" link, Twitter automatically queries its JSON backend and dynamically loads more results into the page.

To work around this issue, we use the package Selenium, which mimics "scrolling" the webpage for us. After scrolling through a set number of pages, we extract the HTML from the page, as sufficient XHR requests have been made by Twitter.

2) Manual page data is not in nice JSON format, so we must use html parsing to get at the data.

Since we are interested in the positive/negative vibes of a tweet, we use Twitter's sentiment analysis in our search queries for a particular contestant. Then all we need to do is count the number of tweet tags that we scraped.


### Step 1: Create Function to Scrape Twitter

In [2]:
#We borrow heavily from http://stackoverflow.com/questions/12519074/scrape-websites-with-infinite-scrolling
def scrape_page(since, until, is_happy, contestant, \
                base_url="https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor", \
                pages_to_scroll=3, \
                driver_path='/Users/dcusworth/chrome_driver/chromedriver', \
                scroll_pause=4):
    """Load a Twitter search page in Chrome, scroll it, and return its HTML.

    The search URL is base_url followed by the contestant name, a smiley
    (":)" when is_happy is true, ":(" otherwise), the since/until date
    filters and a fixed "&src=typd" suffix. The URL is printed before the
    browser is opened.

    Parameters
    ----------
    since, until : values whose str() form is placed after "since:" and
        "until:" in the query (callers in this notebook pass 'YYYY-MM-DD').
    is_happy : truthy to search for ":)", falsy to search for ":(".
    contestant : text appended to the query as the search term.
    base_url : search URL prefix the query pieces are appended to.
    pages_to_scroll : the page is scrolled to the bottom
        pages_to_scroll - 1 times after the initial load.
    driver_path : location of the ChromeDriver executable, downloadable from
        https://sites.google.com/a/chromium.org/chromedriver/downloads
    scroll_pause : seconds to sleep after each scroll so new results can load.

    Returns
    -------
    The page source after scrolling, encoded as UTF-8.
    """
    #Create URL that will get the text
    ender = "&src=typd"
    sentiment = "%20%3A)" if is_happy else "%20%3A("
    since_time = "%20since%3A" + str(since)
    until_time = "%20until%3A" + str(until)
    contestant_name = "%20" + contestant

    final_url = base_url + contestant_name + sentiment + since_time + until_time + ender
    print(final_url)

    #### Initiate Chrome Browser #######
    driver = webdriver.Chrome(driver_path)
    try:
        driver.implicitly_wait(30)

        #Jump onto the webpage and scroll down
        driver.get(final_url)
        for _ in range(1, pages_to_scroll):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)

        #After scrolling enough times, get the text of the page
        data = driver.page_source.encode('utf-8')
    finally:
        #Always close the browser, even when loading or scrolling fails,
        #so a failed scrape does not leave a Chrome process behind
        driver.quit()

    return data



### Step 2: Retrieve Data

We load in scraped Wikipedia Data that gives us a contestant's name and dates they appeared on the Bachelor. For each season/contestant pair, we create a dataframe of episode date, positive tweets, and negative tweets.

In [3]:
#Read the contestant records written by the wiki scrape and decode them from JSON
with open("tempdata/seasonsDict.json") as json_file:
    wiki_data = json.loads(json_file.read())


In [4]:
#Download the episode listing used to look up each episode's airdate
#Source: http://epguides.com/Bachelor/
sdat = requests.get("http://epguides.com/Bachelor/")

#Parse the listing with Beautiful Soup
ssoup = BeautifulSoup(sdat.text, "html.parser")

#Episode rows are taken from the first <pre> element of the page
row_text = ssoup.find_all("pre")[0]

#Keep only links whose text mentions "Week": collect their targets and titles
week_links = [r for r in row_text.find_all("a") if "Week" in r.get_text()]
uurls = [link.get("href") for link in week_links]
ep_nam = [link.get_text() for link in week_links]

#Fix Season 19 episode problems: every title from position 140 onward
#gets a " (S19)" season tag appended
ep_nam[140:] = [ee + " (S19)" for ee in ep_nam[140:]]


In [5]:
#Visit every episode page and record its airdate as a 'YYYY-MM-DD' string
good_dates = []

for uurl in uurls:
    #Pause between requests so the site is not hit in a tight loop
    time.sleep(1)
    #Open up subpages
    subpage = requests.get(uurl)
    soup2 = BeautifulSoup(subpage.text, "html.parser")

    #Find box with date in it
    pars = soup2.find_all("br")
    pp = pars[0].get_text().split()
    pind = ["Airdate" in d for d in pp]

    #Position of the first token mentioning "Airdate"; a plain integer is used
    #because list slices cannot take NumPy arrays as bounds, and taking the
    #first match keeps this working when the word appears more than once.
    #Raises ValueError when the page has no "Airdate" token.
    airdate_pos = pind.index(True)

    #Convert date from page into usable date: the three tokens after "Airdate"
    #are joined as e.g. "Jan-5-2009" once commas are removed
    date_string = "-".join(pp[airdate_pos + 1: airdate_pos + 4])
    date_string = re.sub(",", "", date_string)
    date_object = datetime.datetime.strptime(date_string, "%b-%d-%Y")
    good_dates.append(date_object.strftime("%Y-%m-%d"))



In [6]:
#Extract the Season Number
#The number is read from between "(S" and the final character of the title
#(assumes every title ends in a "(S<number>)" tag). The last "(" is used so
#an earlier parenthesis in a title does not break the parse.
season_num = []
for ee in ep_nam:
    start_string = ee.rfind("(")
    season_num.append(int(ee[(start_string+2):(len(ee)-1)]))

#Count up Episode Numbers
#The counter restarts at 1 whenever the season differs from the previous
#episode's season, so it does not depend on seasons starting at 1 or being
#consecutive, and it also handles an empty list and a final episode that
#opens a new season.
ep_num = []
start_val = 0
prev_season = None
for season in season_num:
    if season == prev_season:
        start_val += 1
    else:
        start_val = 1
        prev_season = season
    ep_num.append(start_val)

#Put Season / Episodes / Dates into a Pandas Dataframe
date_guide = pd.concat([pd.Series(season_num, name="Season"), pd.Series(ep_num, name="Episode"), \
                        pd.Series(good_dates,name="Date")], axis=1)


In [74]:
#Use Date Guide + Wiki info to set up inputs to scrape_page
#For a given Season, get all contestant names
#For each contestant find how many episodes they were on (minus their elimination episode)
#For each episode, count positive / negative tweets they received
#Output a dictionary keyed by contestant, holding each contestant's pos/neg splits per episode

def _contestant_search_term(cnam):
    #Reduce a raw wiki "name" field to the single word used as the search term
    pieces = cnam.split(">")
    if len(pieces) > 1:
        #Leftover markup: keep what follows the first ">"
        cnam = pieces[1]
    contestant = cnam.encode("utf-8").split(" ")[0]
    for ch in ["[", "]", "u\"", "<", ">"]:
        contestant = contestant.replace(ch, "")
    return contestant


def _episodes_to_scrape(elim, n_episodes):
    #Number of leading episodes to scrape for a contestant, or None when the
    #"eliminated" text gives no week number
    if ("Win" in elim) or ("Run" in elim):
        #Winner / runner-up: every episode of the season except the last
        return n_episodes - 1
    #Read the whole trailing number, so "week 10" is 10 rather than 0
    week = re.search(r"(\d+)\s*$", elim)
    if week is None:
        return None
    #Leave out the episode in which the contestant was eliminated
    return max(int(week.group(1)) - 1, 0)


def _count_tweets(html):
    #Count the tweet-text paragraphs in a scraped search page
    soup = BeautifulSoup(html, "html.parser")
    return len(soup.find_all("p", attrs={"class": "TweetTextSize"}))


def scrape_season_tweets(season):
    """Count happy and sad tweets per contestant and episode for one season.

    Reads the contestant records in wiki_data[str(season)] and the episode
    dates in date_guide. For every episode a contestant appeared in before
    elimination, scrape_page is called twice (happy and sad) over the window
    from one day before the airdate to two days after it, and the tweet
    paragraphs in the returned HTML are counted. Progress is printed as it
    goes.

    Returns a dictionary mapping each contestant's search term to a list of
    {'YYYY-MM-DD': {"happy": count, "sad": count}} entries, one per episode.
    The value is None when there are no episodes to scrape, when the search
    term still contains "href", or when no elimination week can be read.

    NOTE: the key is only the first word of the name, so two contestants
    sharing a first name in one season overwrite each other's entry.
    """
    season_dat = wiki_data[str(season)]
    all_eps = date_guide[date_guide.Season == season]
    result_dict = {}

    for sd in season_dat:
        #Get contestant's name
        contestant = _contestant_search_term(sd["name"])

        #Find week they are eliminated, and then select weeks to run scraper
        elim_week = _episodes_to_scrape(sd['eliminated'], all_eps.shape[0])
        if elim_week is None:
            use_dats = []
        else:
            use_dats = all_eps.iloc[0:elim_week]["Date"]

        dats = [datetime.datetime.strptime(idate, '%Y-%m-%d') for idate in use_dats]

        #Skip contestants with nothing to scrape or with an unusable name.
        #The original "len(dats)==0 | (...)" compared len(dats) against the
        #result of the bitwise-or; "or" gives the intended test.
        if (len(dats) == 0) or ("href" in contestant):
            result_dict[contestant] = None
            continue

        #For each date, run scraper, save in dictionary
        ep_dict = []
        for run_date in dats:
            #Make time range
            since = (run_date + datetime.timedelta(days=-1)).strftime('%Y-%m-%d')
            until = (run_date + datetime.timedelta(days=2)).strftime('%Y-%m-%d')
            day = run_date.strftime('%Y-%m-%d')

            #Find all positive tweets
            happy_tweets = _count_tweets(scrape_page(since=since, until=until, \
                                                     is_happy=True, contestant=contestant))

            #Find all sad tweets
            sad_tweets = _count_tweets(scrape_page(since=since, until=until, \
                                                   is_happy=False, contestant=contestant))

            print(contestant)
            print(day)
            print(happy_tweets)
            print(sad_tweets)
            #Save the results to a dictionary
            ep_dict.append({day: {"happy": happy_tweets, "sad": sad_tweets}})
        result_dict[contestant] = ep_dict

    return result_dict

## Run scraping code individually for each season

In [40]:
#Scrape happy/sad tweet counts for every Season 13 contestant (two scrape_page calls per episode)
tweets13 = scrape_season_tweets(13)

https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20a%20%3A)%20since%3A2009-01-04%20until%3A2009-01-07&src=typd
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20a%20%3A(%20since%3A2009-01-04%20until%3A2009-01-07&src=typd
a
2009-01-05
0
0
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20a%20%3A)%20since%3A2009-01-11%20until%3A2009-01-14&src=typd
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20a%20%3A(%20since%3A2009-01-11%20until%3A2009-01-14&src=typd
a
2009-01-12
0
0
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20a%20%3A)%20since%3A2009-01-18%20until%3A2009-01-21&src=typd
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20a%20%3A(%20since%3A2009-01-18%20until%3A2009-01-21&src=typd
a
2009-01-19
0
0
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20a%20%3A)%20since%3A2009-01-25%20until%3A2009-01-28&src=typd
https://twitter.com/searc

In [75]:
#Scrape happy/sad tweet counts for every Season 14 contestant (two scrape_page calls per episode)
tweets14 = scrape_season_tweets(14)

https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Vienna%20%3A)%20since%3A2010-01-03%20until%3A2010-01-06&src=typd
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Vienna%20%3A(%20since%3A2010-01-03%20until%3A2010-01-06&src=typd
Vienna
2010-01-04
0
0
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Vienna%20%3A)%20since%3A2010-01-10%20until%3A2010-01-13&src=typd
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Vienna%20%3A(%20since%3A2010-01-10%20until%3A2010-01-13&src=typd
Vienna
2010-01-11
0
0
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Vienna%20%3A)%20since%3A2010-01-17%20until%3A2010-01-20&src=typd
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Vienna%20%3A(%20since%3A2010-01-17%20until%3A2010-01-20&src=typd
Vienna
2010-01-18
5
0
https://twitter.com/search?f=tweets&vertical=default&q=%23thebachelor%20Vienna%20%3A)%20since%3A2010-01-24%20unt

In [None]:
#Write `data` to data.json as JSON
#NOTE(review): `data` is not assigned in any cell visible above, so on a fresh kernel this
#cell would raise NameError unless it is defined elsewhere -- presumably it was meant to hold
#the scraped tweet dictionaries (e.g. tweets13 / tweets14); confirm before running.
#NOTE(review): json is already imported in the first cell, so this import is redundant.
import json
with open('data.json', 'w') as fp:
    json.dump(data, fp)

In [70]:
#Preview the search term derived from each Season 19 contestant's name
season_dat = wiki_data['19']

for sd in season_dat:
    cnam = sd["name"]

    #When the field contains ">", keep the text after the first one
    pieces = cnam.split(">")
    raw_name = pieces[1] if len(pieces) > 1 else cnam

    #The search term is the first space-separated word, minus stray markup characters
    contestant = raw_name.encode("utf-8").split(" ")[0]
    for ch in ["[", "]", "u\"","<",">"]:
        contestant = contestant.replace(ch, "")

    print(contestant)

Whitney
Becca
Kaitlyn
Jade
Carly
Britt
Megan
Kelsey
Ashley
Mackenzie
Samantha
Ashley
Juelia
Nikki
Jillian
Amber
Tracy
Trina
Alissa
Jordan
Kimberly
Tandra
Tara
Amanda
Bo
Brittany
Kara
Michelle
Nicole
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jehan
Jeh

In [71]:
#Display the raw contestant records loaded for Season 19
wiki_data['19']

[{u'age': u'29',
  u'eliminated': u'Winner',
  u'hometown': u'Louisville, Kentucky',
  u'name': u'Whitney Bischoff</b',
  u'occupation': u'Fertility Nurse',
  u'season': 19},
 {u'age': u'26',
  u'eliminated': u'Runner-up',
  u'hometown': u'Shreveport, Louisiana',
  u'name': u'Becca Tilley',
  u'occupation': u'Chiropractic Assistant',
  u'season': 19},
 {u'age': u'29',
  u'eliminated': u'Eliminated in week 9',
  u'hometown': u'Leduc, Alberta',
  u'name': u'Kaitlyn Bristowe',
  u'occupation': u'Dance Instructor',
  u'season': 19},
 {u'age': u'28',
  u'eliminated': u'Eliminated in week 8',
  u'hometown': u'Gering, Nebraska',
  u'name': u'Jade Roper',
  u'occupation': u'Cosmetics Developer',
  u'season': 19},
 {u'age': u'29',
  u'eliminated': u'Eliminated in week 7',
  u'hometown': u'Arlington, Texas',
  u'name': u'Carly Waddell',
  u'occupation': u'Cruise Ship Singer',
  u'season': 19},
 {u'age': u'27',
  u'eliminated': u'Eliminated in week 7',
  u'hometown': u'Hollywood, California',
  u