# Project Aqueduct
Scraping the LinkedIn Skill Directory at https://www.linkedin.com/directory/topics-a/

## 1. Preamble
Import source code and packages

In [1]:
#import dataScrape as ds

import requests
import re
import string
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

def collectSoup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded unchanged to
        ``requests.get``. Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        Parse tree built from the response text with the ``html.parser``
        backend.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including a timeout once the
        limit is exceeded).

    Notes
    -----
    The HTTP status code is not inspected, so an error page is parsed just
    like a successful response.
    """
    # requests applies no timeout by default; without an explicit limit a
    # stalled connection would block the notebook indefinitely.
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')
    return soup

## 2. Prepare parent/target URLs

In [2]:
# Every directory page URL is this stem followed by one of the suffixes below.
parentURL = 'https://www.linkedin.com/directory/topics-'

# 'more' comes first, then one entry per lowercase ASCII letter.
suffix = ['more'] + list(string.ascii_lowercase)

print(suffix)

['more', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


## 3. Set up and run Selenium

Log in to LinkedIn via ChromeDriver.

In [3]:
import os
from getpass import getpass

from selenium import webdriver
from selenium.webdriver.common.by import By

# Launch Chrome in an incognito window.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--incognito")

# NOTE(review): passing the driver path positionally is the Selenium 3
# calling convention; newer Selenium releases expect a Service object --
# confirm against the installed version.
driver = webdriver.Chrome('./localfiles/chromedriver',options=chrome_options)

driver.get('https://www.linkedin.com')

# Credentials are read from the environment (LINKEDIN_USERNAME /
# LINKEDIN_PASSWORD) and fall back to an interactive prompt, so they are
# never stored in the notebook source or its saved output.
username = driver.find_element(By.ID, 'login-email')
username.send_keys(os.environ.get('LINKEDIN_USERNAME') or input('LinkedIn e-mail: '))

password = driver.find_element(By.ID, 'login-password')
password.send_keys(os.environ.get('LINKEDIN_PASSWORD') or getpass('LinkedIn password: '))

# find_element(By.ID, ...) replaces the find_element_by_id helpers, which
# were deprecated and later removed from Selenium.
log_in_button = driver.find_element(By.ID, 'login-submit')
log_in_button.click()


## 4. Run the loop across all skill directories

Loop through each subdirectory (alphabetic) and subsubdirectory (subalphabetic) to extract the skill name and URL. Most popular skills in each subdirectory are also captured.

If the subdirectory is y or z, treat it as a subsubdirectory: its page is parsed for skills directly. Otherwise, for each subdirectory:

* visit the page
* extract the html
* extract the popular skills subtree and collect skills
* extract the list of subsubdirectories

For each subsubdirectory:

* visit the page
* extract the html
* extract the content tags
* extract the skill name and url


In [5]:
def _extract_skills(html):
    """Return ``[name, url]`` pairs for every skill link in ``html``.

    Each element with class ``content`` is searched for its first ``<a>``;
    the anchor's string becomes the skill name and its ``href`` the URL.
    ``content`` elements without an anchor are skipped.
    """
    page = BeautifulSoup(html, 'html.parser')
    anchors = (tag.find("a") for tag in page.find_all(class_="content"))
    return [[a.string, a['href']] for a in anchors if a is not None]


allskills = []     # [skill name, skill URL] pairs from every listing page
allpopskills = []  # names of the popular skills shown on each subdirectory page

for suff in suffix:

    driver.get(parentURL + suff)

    # The y and z pages are handled as skill listings themselves.
    # (The original test `suff == 'y' | suff == 'z'` parsed as a chained
    # comparison around `'y' | suff` and raised TypeError.)
    if suff in ('y', 'z'):
        allskills.extend(_extract_skills(driver.page_source))
        continue

    soup1 = BeautifulSoup(driver.page_source, 'html.parser')
    cqc = soup1.find_all(class_="column quad-column")

    # First quad-column: popular skills for this subdirectory.
    allpopskills.extend(str(a.string) for a in cqc[0].find_all("a"))

    # Second quad-column: links to the subsubdirectory pages.
    suburls = [str(a['href']) for a in cqc[1].find_all("a")]

    for suburl in suburls:
        driver.get(suburl)
        # extend() appends in place rather than rebuilding the whole list
        # after every page.
        allskills.extend(_extract_skills(driver.page_source))


[['A330', 'https://www.linkedin.com/topic/a330'], ['AAAHC', 'https://www.linkedin.com/topic/aaahc'], ['AACR2', 'https://www.linkedin.com/topic/aacr2'], ['AAMS', 'https://www.linkedin.com/topic/aams'], ['AAP', 'https://www.linkedin.com/topic/aap'], ['AAR', 'https://www.linkedin.com/topic/aar'], ['AASHTO', 'https://www.linkedin.com/topic/aashto'], ['AAUS Scientific Diver', 'https://www.linkedin.com/topic/aaus-scientific-diver'], ['ABAP Web Dynpro', 'https://www.linkedin.com/topic/abap-web-dynpro'], ['ABAP-OO', 'https://www.linkedin.com/topic/abap-oo'], ['ABB 800xA', 'https://www.linkedin.com/topic/abb-800xa'], ['ABBYY', 'https://www.linkedin.com/topic/abbyy'], ['ABBYY FineReader', 'https://www.linkedin.com/topic/abbyy-finereader'], ['ABC Analysis', 'https://www.linkedin.com/topic/abc-analysis'], ['ABC Flowcharter', 'https://www.linkedin.com/topic/abc-flowcharter'], ['ABCP', 'https://www.linkedin.com/topic/abcp'], ['ABEL', 'https://www.linkedin.com/topic/abel'], ['ABI', 'https://www.linke

In [9]:
# One row per scraped skill: its display name and its topic-page URL.
skill_columns = ["Skill", "URL"]
df = pd.DataFrame(data=allskills, columns=skill_columns)

df

Unnamed: 0,Skill,URL
0,A330,https://www.linkedin.com/topic/a330
1,AAAHC,https://www.linkedin.com/topic/aaahc
2,AACR2,https://www.linkedin.com/topic/aacr2
3,AAMS,https://www.linkedin.com/topic/aams
4,AAP,https://www.linkedin.com/topic/aap
5,AAR,https://www.linkedin.com/topic/aar
6,AASHTO,https://www.linkedin.com/topic/aashto
7,AAUS Scientific Diver,https://www.linkedin.com/topic/aaus-scientific...
8,ABAP Web Dynpro,https://www.linkedin.com/topic/abap-web-dynpro
9,ABAP-OO,https://www.linkedin.com/topic/abap-oo


In [10]:
# Write the skills table to disk. With pandas' defaults the row index is
# written too, as an unnamed first column.
df.to_csv(path_or_buf='AllSkills.csv')