# Scraping Twitter Advanced Search using Selenium and BeautifulSoup

In [1]:
from bs4 import BeautifulSoup
import urllib
import urllib.request
import requests
import re
from IPython import embed
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import StaleElementReferenceException

## 1. Create functions for scraping tasks

In [2]:
#keyword scraping approach, combined use of selenium and beautiful soup

#twit_url function will return correct url to target

#notes on expected format of inputs, strings
#date format: YYYY-MM-DD
#loc: Manila, Cebu, Davao
#radius: 150mi or 100mi
#the keyword is always searched as a hashtag: the function adds the leading '#' itself (out of respect for Twitter's robots.txt file)

def twit_url(keyword,beg_date,end_date,loc,radius):
    """Build the Twitter live-search URL for a hashtag near a location.

    Parameters (all strings):
        keyword:  hashtag text without the leading '#'; the '#' is added here.
        beg_date: start of the date range, YYYY-MM-DD (sent as ``since:``).
        end_date: end of the date range, YYYY-MM-DD (sent as ``until:``).
        loc:      place name sent as ``near:``, e.g. 'Manila'.
        radius:   search radius sent as ``within:``, e.g. '150mi'.

    Returns:
        The search URL as a string.

    Every value is percent-encoded before being placed in the query so that
    characters such as spaces, '&' or '#' cannot break the URL. Plain
    alphanumeric values and dates produce the same URL as plain concatenation.
    """
    #local import keeps this function self-contained
    from urllib.parse import quote

    def encode(value):
        #safe='' so that '/' and every other reserved character is escaped too
        return quote(str(value), safe='')

    query = (
        "%23" + encode(keyword)
        + "%20near%3A" + encode(loc)
        + "%20within%3A" + encode(radius)
        + "%20until%3A" + encode(end_date)
        + "%20since%3A" + encode(beg_date)
    )
    return "http://twitter.com/search?q=" + query + "&f=live"

#trying to get all html content of twitter search page

#manual crawl function tries to collect all page info as you scroll down
#manual because we set number of scrolls
#limited by number of scrolls set, might not be end of page yet if set too low, say, below 100

def manual_crawl_page(url,n):
    """Open ``url`` in Chrome, scroll down ``n`` times and parse what was seen.

    Parameters:
        url: address of the Twitter search page to load.
        n:   number of scroll steps (not seconds); after each step the function
             pauses for a random 1-2 seconds.

    Returns:
        A list of BeautifulSoup documents, one per html snapshot taken before
        each scroll step. Snapshots overlap, so the same tweet can appear in
        several of them.

    The browser is always closed before returning, also when an error occurs.
    """
    #local import so the By locators are available without touching file-level imports
    from selenium.webdriver.common.by import By

    driver=webdriver.Chrome()
    try:
        #loading search page
        driver.get(url)
        driver.implicitly_wait(15)

        #html content collected as we scroll down
        html=[]

        #scroll n times
        for _ in range(n):
            #search for all relevant contents of twitter search page
            list_items = driver.find_elements(By.CLASS_NAME, 'css-1dbjc4n')
            #nothing matched (e.g. page not rendered yet): skip this snapshot
            #rather than lose everything collected so far to an IndexError
            if list_items:
                html.append(list_items[0].get_attribute('innerHTML'))

            #to scroll down
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)

            time.sleep(np.random.randint(1,3))
    finally:
        #release the browser process no matter how the crawl ended
        driver.quit()

    #parsing each html content collected, one document per snapshot
    return [BeautifulSoup(snapshot,'lxml') for snapshot in html]

#auto crawl function tries to collect all page info as it automatically scrolls down until end of page
#note: this takes a while to finish and also extracts fewer tweets, possibly because of the pace of scrolling

def auto_crawl_page(url):
    """Open ``url`` in Chrome and keep scrolling until the page stops growing.

    Parameters:
        url: address of the Twitter search page to load.

    Returns:
        A list of BeautifulSoup documents, one per html snapshot taken before
        each scroll step. Snapshots overlap, so the same tweet can appear in
        several of them.

    The end of the page is detected by comparing ``document.body.scrollHeight``
    before a scroll with its value after the scroll plus a random 10-29 second
    wait. The wait sits between the scroll and the measurement so that newly
    requested content has time to change the height before it is compared.
    The browser is always closed before returning, also when an error occurs.
    """
    #local import so the By locators are available without touching file-level imports
    from selenium.webdriver.common.by import By

    height_script = "var lenpage=document.body.scrollHeight;return lenpage;"

    driver=webdriver.Chrome()
    try:
        #implicit wait is set once on the driver and used by the element lookups below
        driver.implicitly_wait(10)

        #loading search page
        driver.get(url)

        time.sleep(np.random.randint(1,3))

        #html content collected as we scroll down
        html=[]

        #calculates length of page
        lenpage = driver.execute_script(height_script)

        #to iterate until bottom of page
        while True:
            #search for all relevant contents of twitter search page
            list_items = driver.find_elements(By.CLASS_NAME, 'css-1dbjc4n')
            #nothing matched: skip this snapshot instead of failing with IndexError
            if list_items:
                html.append(list_items[0].get_attribute('innerHTML'))

            #height before this scroll step
            lastcount = lenpage

            #to scroll down
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)

            #give the page time to load more results before measuring again
            time.sleep(np.random.randint(10,30))

            #re-calculates length of page
            lenpage = driver.execute_script(height_script)

            #height did not change after scrolling and waiting: bottom of page
            if lastcount==lenpage:
                break
    finally:
        #release the browser process no matter how the crawl ended
        driver.quit()

    #parsing each html content collected, one document per snapshot
    return [BeautifulSoup(snapshot,'lxml') for snapshot in html]


#function extracting tweet details

def twitsrc_data(soup):
    """Turn parsed search-page snapshots into a table with one row per tweet.

    Parameters:
        soup: iterable of parsed html objects (as returned by the crawl
              functions); each must offer ``find_all('article')``.

    Returns:
        A pandas DataFrame with the columns author, username, timestamp, post,
        reply, like and retweet, de-duplicated and indexed from 0.

    Field rules:
        author    - text of the first <span> in the article, NaN if absent.
        username  - first '@handle' followed by a non-word character and
                    digits in the first 'css-1dbjc4n' div, NaN if the div or
                    the pattern is missing.
        post      - the article text with everything up to and including the
                    '@handle' header pattern removed.
        timestamp - the <time> tag's ``datetime`` attribute; a 24-character
                    value has '.000Z' removed, any other value is kept as
                    is, NaN if the tag or the attribute is missing.
        reply / like / retweet - text of the div with the matching
                    ``data-testid``; 0 if the div is absent or its text is empty.
    """
    #raw strings so the regex escapes are not read as (invalid) string escapes
    handle_pattern = re.compile(r'\@(\w+)\W\d+')
    header_pattern = re.compile(r'(.*)\@(\w+)\W\d+h?\s?[M]?[a]?[r]?[y]?[A]?[p]?[r]?')

    def stat_text(article, testid):
        #text of one counter div, or 0 when the article has no such div
        node = article.find('div',{'data-testid':testid})
        return 0 if node is None else node.get_text()

    #lists where we will append tweet info; every branch below appends exactly
    #one value per article so the lists always stay the same length
    author=[]
    username=[]
    post=[]
    timestamp=[]
    retweet=[]
    like=[]
    reply=[]

    for page in soup:
        for article in page.find_all('article'):
            #author
            name_tag = article.find('span')
            author.append(np.nan if name_tag is None else name_tag.get_text())

            #username
            header = article.find('div',{'class':'css-1dbjc4n'})
            handles = [] if header is None else handle_pattern.findall(header.get_text())
            username.append(handles[0] if handles else np.nan)

            #post
            post.append(header_pattern.sub('',article.get_text()))

            #timestamp
            time_tag = article.find('time')
            stamp = None if time_tag is None else time_tag.get('datetime')
            if stamp is None:
                timestamp.append(np.nan)
            elif len(stamp)==24:
                timestamp.append(stamp.replace('.000Z',''))
            else:
                #19-character values and any unexpected format are kept unchanged
                timestamp.append(stamp)

            #tweet stats
            retweet.append(stat_text(article,'retweet'))
            like.append(stat_text(article,'like'))
            reply.append(stat_text(article,'reply'))

    #making dataframe of all tweet info
    df=pd.DataFrame({'author':author,'username':username,'timestamp':timestamp,'post':post,'reply':reply,'like':like,'retweet':retweet})

    #some tweets have the stats tag but no number in it, which gives blanks; replacing them with 0
    for column in ('reply','like','retweet'):
        df[column]=df[column].replace('',0)

    #drop duplicate tweets which may be collected repeatedly due to overlaps in page info while scrolling
    df=df.drop_duplicates(subset=['author', 'username', 'timestamp', 'post', 'reply', 'like', 'retweet'],keep='first')

    #making sure indices are in order
    return df.reset_index(drop=True)

## 2. Run queries

In [3]:
#search url for the covid19ph hashtag within 150mi of Manila, since 2020-03-16 until 2020-03-20
sample_url = twit_url(
    keyword='covid19ph',
    beg_date='2020-03-16',
    end_date='2020-03-20',
    loc='Manila',
    radius='150mi',
)
#scroll the result page 50 times, collecting its html along the way
sample_soup = manual_crawl_page(url=sample_url, n=50)

#table of the tweets found, shown as the cell output
twitsrc_data(soup=sample_soup)

Unnamed: 0,author,username,timestamp,post,reply,like,retweet
0,Paul Argamosa,paulsalarm,2020-03-19T23:18:05,"Pag mayaman ka, #lockdown. Malas pag mahirap k...",0,1,0
1,"G O R Y , M D #DuterteResign",DocGorz,2020-03-19T23:16:53,In other words wag ninyong Galingan!\n\nHayop ...,0,8,2
2,Dave Llorito,Rainwalker19,2020-03-19T22:40:44,Private sector delivers. #COVID19PH #FightCOVI...,0,0,0
3,Dave Llorito,Rainwalker19,2020-03-19T22:32:04,Citizens deliver! #FightCOVID19 #covid19phQuot...,0,0,0
4,Dave Llorito,Rainwalker19,2020-03-19T22:28:56,Mayors deliver! #COVID19PH #FightCOVID19Quote ...,0,0,0
...,...,...,...,...,...,...,...
439,Lovely,lovelymaniri,2020-03-16T07:15:13,#TikTok #tiktokphilippines #tiktoker #MMQuaran...,0,0,0
440,Tess Caalim,caalim44240,2020-03-16T06:43:52,Replying to @SCPh_PIOThank you but i was able ...,0,0,0
441,CesH,Pinayironmom,2020-03-16T06:40:33,kala ng mga taga Cavite robot sila. Please sta...,0,2,0
442,Tess Caalim,caalim44240,2020-03-16T06:39:40,My staff reached the Hall of Justice QC with h...,0,0,0


In [4]:
#same hashtag and area as the sample, but since 2020-03-16 until 2020-05-15
url = twit_url(
    keyword='covid19ph',
    beg_date='2020-03-16',
    end_date='2020-05-15',
    loc='Manila',
    radius='150mi',
)
#1000 scroll steps so that far more of the result page gets loaded
soup = manual_crawl_page(url=url, n=1000)
#one row per tweet found in the collected snapshots
data = twitsrc_data(soup=soup)

In [5]:
#(rows, columns) of the scraped tweet table before cleaning
data.shape

(2359, 7)

In [6]:
#first five scraped tweets before cleaning
data.head()

Unnamed: 0,author,username,timestamp,post,reply,like,retweet
0,ImJustAStranger,AkoSiPrince08,2020-05-14T12:59:12,":::see you next year hopemed, waterleaf, infin...",1,0,0
1,Arcticket Visa Assistance and Travel Agency,arcticket,2020-05-14T09:42:22,World #COVID19 status May 14 2020.\n\n#Philipp...,0,0,0
2,Cha2xHeels,Cha2xH,2020-05-14T08:44:25,@BacoorCityGovt @DILGPhilippines @pcoogov #PHG...,0,0,0
3,Glenn Edward,GlennEdward19,2020-05-14T08:16:37,#WeHealAsOne\n#COVID2019 #COVID19PHHarvard and...,0,0,0
4,Randele Alcoran Arcilla,TheGreatOfRands,2020-05-14T07:14:03,Sadness went the family turns cancelled went t...,0,1,0


## 3. Clean scraped Twitter data

In [7]:
#keep only records that have a username and a timestamp and whose username is not the 'no username' placeholder
keep_row = (
    data['username'].notnull()
    & data['timestamp'].notnull()
    & (data['username']!='no username')
)
data=data[keep_row]


#renumber the rows from 0 without keeping the old index
data=data.reset_index(drop=True)


#timestamps: split the text into a date part and a time part, then rebuild a real datetime column
stamp_text=data['timestamp'].map(str)
data['timestamp']=stamp_text
date_text=stamp_text.map(lambda stamp: stamp.split('T')[0])
clock_text=stamp_text.map(lambda stamp: stamp[11:])
data['date']=pd.to_datetime(date_text).dt.date
data['time']=pd.to_datetime(clock_text).dt.time
combined_text=data['date'].map(str) + " " + data['time'].map(str)
data['timestamp']=pd.to_datetime(combined_text, format='%Y-%m-%d %H:%M:%S')


#post content: line breaks become spaces
data['post']=data['post'].str.replace('\n',' ')


#to fix likes and retweets, removes K-- which indicates thousand values
#number of zeros each abbreviation stands for: K = thousand, M = million
_COUNT_SUFFIX_ZEROS = {'K': 3, 'M': 6}
#an abbreviated count such as '12K', '1.2K' or '3.4M'
_COUNT_PATTERN = re.compile(r'(\d+)(?:\.(\d+))?([KM])')


def _expand_count(value):
    """Return a displayed tweet count as a string of plain digits.

    None and blank text give '0'. Thousands separators are removed
    ('1,234' -> '1234'). 'K' and 'M' abbreviations are expanded using string
    arithmetic only ('1.2K' -> '1200', '12K' -> '12000', '1.2M' -> '1200000').
    Anything that is not recognised is returned unchanged.
    """
    if value is None:
        return '0'
    text = str(value).strip().replace(',', '')
    if text == '':
        return '0'
    if text.isdecimal():
        return text
    match = _COUNT_PATTERN.fullmatch(text)
    if match is None:
        return value
    whole, fraction, suffix = match.groups()
    zeros = _COUNT_SUFFIX_ZEROS[suffix]
    #pad the decimals with zeros up to the size of the abbreviation;
    #decimals beyond that size would be fractions of a tweet and are cut off
    padded = (fraction or '').ljust(zeros, '0')[:zeros]
    return whole + padded


def kLikes(row):
    """Return the 'like' value of ``row`` with K/M abbreviations written out.

    ``row`` is anything indexable by 'like' (a DataFrame row when used with
    ``DataFrame.apply(..., axis=1)``). The result is a string of digits ready
    for an integer cast; see ``_expand_count`` for the exact rules.
    """
    return _expand_count(row['like'])


def kRetweets(row):
    """Return the 'retweet' value of ``row`` with K/M abbreviations written out.

    ``row`` is anything indexable by 'retweet' (a DataFrame row when used with
    ``DataFrame.apply(..., axis=1)``). The result is a string of digits ready
    for an integer cast; see ``_expand_count`` for the exact rules.
    """
    return _expand_count(row['retweet'])

#likes: make the values text, write out abbreviated counts, then store them as integers
data['like'] = (
    data.assign(like=data['like'].astype('str'))
    .apply(kLikes, axis=1)
    .astype('int64')
)
#retweets: same three steps
data['retweet'] = (
    data.assign(retweet=data['retweet'].astype('str'))
    .apply(kRetweets, axis=1)
    .astype('int64')
)

In [8]:
#(rows, columns) of the tweet table after cleaning
data.shape

(2357, 9)

In [9]:
#first five tweets after cleaning
data.head()

Unnamed: 0,author,username,timestamp,post,reply,like,retweet,date,time
0,ImJustAStranger,AkoSiPrince08,2020-05-14 12:59:12,":::see you next year hopemed, waterleaf, infin...",1,0,0,2020-05-14,12:59:12
1,Arcticket Visa Assistance and Travel Agency,arcticket,2020-05-14 09:42:22,World #COVID19 status May 14 2020. #Philippin...,0,0,0,2020-05-14,09:42:22
2,Cha2xHeels,Cha2xH,2020-05-14 08:44:25,@BacoorCityGovt @DILGPhilippines @pcoogov #PHG...,0,0,0,2020-05-14,08:44:25
3,Glenn Edward,GlennEdward19,2020-05-14 08:16:37,#WeHealAsOne #COVID2019 #COVID19PHHarvard and ...,0,0,0,2020-05-14,08:16:37
4,Randele Alcoran Arcilla,TheGreatOfRands,2020-05-14 07:14:03,Sadness went the family turns cancelled went t...,0,1,0,2020-05-14,07:14:03
