In [1]:
# Import basic and advanced modules
from IPython.core.display import clear_output
import pandas as pd
import numpy as np
import aiohttp
import asyncio
import time
import datetime
from bs4 import BeautifulSoup
import requests
import nest_asyncio

In [2]:
## Data Analyst jobs
nest_asyncio.apply()
today = datetime.datetime.today().strftime('%d_%b')
def scrape(url):
    start = time.time()
    
    # Initialize variables to be extracted
    link = []
    title = []
    body = []
    
    # Scrape 540 data analyst jobs
    stop_page = 20
    
    
    # urls parsing
    async def fetch(session, url):
        async with session.get(url) as response:
            return await response.text()
        
    async def main():
        async with aiohttp.ClientSession() as session:
            for page in range(1,stop_page+1):
                html = await fetch(session, url + f'{page}')
                soup = BeautifulSoup(html,'html.parser')
                for lnk in soup.find_all('h3',class_="title"):
                    link.append(str('https://www.reed.co.uk')+lnk.find('a').get('href'))
                    
    asyncio.run(main())
    
                    
    # Information parsing
    async def fetch(session, url):
        async with session.get(url) as response:
            return await response.text()
        
    async def main():
        async with aiohttp.ClientSession() as session:
            # Count the no of requests
            req = 0
            for lnk,req_count in zip(link,range(1,len(link)+1)):
                html = await fetch(session, lnk)
                req = req+1
                print(f'Requests Completed: {req} out of {len(link)}')
                soup = BeautifulSoup(html,'html.parser')
                # Clear all the outputs except the current one in notebook console
                clear_output(wait=True)
                
                # Extract title
                title_tag = soup.find('h1')
                title.append(title_tag.text if title_tag is not None else 'missing')
                
                # Extract body text
                body_tag = soup.find('div',class_='col-xs-12 col-sm-8 col-md-9')
                body.append(body_tag.text.strip() if body_tag is not None else 'missing')
                
    asyncio.run(main())
    
    #Create a df of extracted variables
    df = pd.DataFrame({'title':title,'link':link, 'body':body})
    duration = np.round((time.time()-start)/60,2)
    
    print(f'{len(df)} Animal Jobs: Time required to scrape {len(df)} observation: {duration} minutes')
    return df.to_csv(f'{today}_data_analyst_jobs.csv',index=False)

## Call the function
scrape('https://www.reed.co.uk/jobs/data-analyst-jobs?pageno=')

540 Animal Jobs: Time required to scrape 540 observation: 6.37 minutes


In [10]:
#Read the scraped csv
df = pd.read_csv('30_Jan_data_analyst_jobs.csv')
df.head()

Unnamed: 0,title,link,body
0,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/38326...,missing
1,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39684...,Salary negotiable\n\n\n\n\n\n\n\n\n\nBexhill-o...
2,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39836...,"£35,000 - £40,000 per annum\n\n\n\n\n\n\n\n\n\..."
3,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39673...,Salary negotiable\n\n\n\n\n\n\n\n\n\nAndover\n...
4,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/37799...,"£30,000 - £50,000 per annum\n\n\n\n\n\n\n\n\n\..."


In [11]:
# Count missing rows
df[df.body=='missing'].count()

title    19
link     19
body     19
dtype: int64

In [12]:
# Drop missing rows
to_drop = df[df.body=='missing'].index
df = df.drop(to_drop,axis=0).reset_index(drop=True)

In [13]:
# Remove new lines from body text
df.body = df.body.str.replace('\n','')

In [14]:
# Lets have a look after cleaning
df.head()

Unnamed: 0,title,link,body
0,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39684...,"Salary negotiableBexhill-on-Sea, East SussexPe..."
1,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39836...,"£35,000 - £40,000 per annumLondon, South East ..."
2,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39673...,"Salary negotiableAndover, HampshireContract, f..."
3,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/37799...,"£30,000 - £50,000 per annumCentral London, Lon..."
4,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/37080...,"Competitive salaryLondon, South East EnglandPe..."


In [15]:
# Create columns indicating if a prticular job requires those columns
df['excel'] = np.where(df.body.str.contains('excel',case=False),'yes','no')
df['python'] = np.where(df.body.str.contains('python',case=False),'yes','no')
df['sql'] = np.where(df.body.str.contains('sql',case=False),'yes','no')
df['R'] = np.where(df.body.str.contains(' R '),'yes','no')

# Lets look at the data after feature engineering
df.head(10)

Unnamed: 0,title,link,body,excel,python,sql,R
0,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39684...,"Salary negotiableBexhill-on-Sea, East SussexPe...",yes,yes,yes,no
1,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39836...,"£35,000 - £40,000 per annumLondon, South East ...",yes,yes,yes,no
2,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39673...,"Salary negotiableAndover, HampshireContract, f...",yes,no,no,no
3,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/37799...,"£30,000 - £50,000 per annumCentral London, Lon...",yes,yes,yes,no
4,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/37080...,"Competitive salaryLondon, South East EnglandPe...",yes,no,yes,no
5,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39708...,"Salary negotiableChester, CheshirePermanent, f...",yes,no,yes,no
6,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39744...,"£35,000 - £50,000 per annumGuildford, SurreyPe...",yes,yes,yes,no
7,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39698...,"Salary negotiableNottinghamshire, East Midland...",yes,no,yes,no
8,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39709...,"Competitive salarySlough, BerkshirePermanent, ...",yes,no,yes,no
9,Data Analyst,https://www.reed.co.uk/jobs/data-analyst/39770...,"£22,000 - £25,000 per annumCheltenham, Glouces...",no,no,no,no


In [16]:
# Finally create a csv file
df.to_csv('data_analyst_jobs_tools_check.csv',index=False)