## **Function for scraping Twitter handles of journalists based on a specific keyword**
Date 06/08/2020

In [1]:
# import libraries
# Note we need to check library version
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

In [None]:
def scrape_journalist_website(keyword):
  """
  Scrape the Twitter handles of journalists listed on journalism.co.uk.

  The site's profile search is queried with `keyword` and the paginated
  results are walked page by page (the `chunk` query parameter). On every
  page, each `<div class="holder">` element is searched for `@handle`
  patterns.

  Args:
    keyword: Search term used to filter journalists by field of expertise.
      It is URL-encoded before being placed in the query string; '%' and
      '+' are left untouched so that an already-encoded keyword is sent
      exactly as given.

  Returns:
    A pandas DataFrame with a single column, 'twitter_handle', holding the
    handles (without the leading '@') in the order they were found. The
    DataFrame is empty when the search yields no handles.

  Raises:
    Whatever `urlopen` raises when a page cannot be fetched (for example
    `urllib.error.URLError`); such errors are not caught here.
  """
  # Local import so the function does not depend on edits to the import cell.
  from urllib.parse import quote_plus

  # Encode the keyword so spaces, '&', '#', etc. cannot break or alter the query.
  query = quote_plus(keyword, safe="%+")

  # NOTE: this matches any '@word', so e-mail domains appearing inside a
  # holder div would be captured as well.
  handle_pattern = re.compile(r'@(\w+)')

  twitter_profile_names = []
  seen_handles = set()  # same content as the list, for O(1) membership tests

  page = 0
  while True:
    # Define specific page url
    page_url = (
      "https://www.journalism.co.uk/prof/?search=" + query
      + "&chunk=" + str(page) + "&cmd=default"
    )

    # Parse the webpage; the context manager closes the connection afterwards
    with urlopen(page_url) as response:
      soup = BeautifulSoup(response, 'html.parser')

    # Collect the handles of this page only (one regex pass per holder div)
    page_handles = [
      handle
      for holder in soup.find_all('div', class_="holder")
      for handle in handle_pattern.findall(str(holder))
    ]

    # The number of result pages is unknown in advance, and requesting a page
    # past the end redirects to the last page of the search. So we stop when:
    #   * the page yields no handles at all (e.g. no results for the keyword;
    #     without this check the loop would never terminate), or
    #   * any handle on the page was already collected, which is taken as the
    #     sign of that redirect. The whole page is discarded in that case.
    if not page_handles or not seen_handles.isdisjoint(page_handles):
      break

    twitter_profile_names.extend(page_handles)
    seen_handles.update(page_handles)
    page += 1

  return pd.DataFrame(twitter_profile_names, columns=['twitter_handle'])
