# Data Acquisition

In [7]:
import requests 
import pandas as pd 
import re 
import os 

from bs4 import BeautifulSoup 
from selenium import webdriver 

## Part 1: Utilizing Selenium

### Disclaimer: the driver version might need to be modified and adjusted for the specific local machine.
- The project can still be replicated solely in the Data_Analysis.ipynb notebook because the finalized dataset is in the repository.

### Accounting

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Start a Chrome session on the LSE departments index page.
driver = webdriver.Chrome()
driver.get("https://info.lse.ac.uk/Staff/Departments-and-Institutes")

# One waiter (timeout of 15 per lookup) is shared by every step below.
wait = WebDriverWait(driver, 15)

# Step 1 - open the department page; the link is scrolled into view before the click.
department = wait.until(
    EC.element_to_be_clickable((By.LINK_TEXT, 'Department of Accounting'))
)
driver.execute_script("arguments[0].scrollIntoView();", department)
department.click()

# Step 2 - follow the "People" link and record the browser URL right after the click.
people = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'People')))
people.click()
people_url = driver.current_url

# Step 3 - open the "Academic Faculty" listing, again scrolling before clicking.
academic_faculty = wait.until(
    EC.element_to_be_clickable((By.LINK_TEXT, 'Academic Faculty'))
)
driver.execute_script("arguments[0].scrollIntoView();", academic_faculty)
academic_faculty.click()

In [None]:
# Fetch the page the browser is currently on with requests and parse it with
# BeautifulSoup; from here on Selenium is not needed for this department.
url = driver.current_url
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')

# Only the first "accordion__content" section is inspected (presumably the
# faculty list - verify against the live page).
academic_faculty = soup.find("div", attrs={'class': "accordion__content"})
if academic_faculty is None:
    raise RuntimeError(f"No 'accordion__content' section found at {url}")
text_block = academic_faculty.find_all("div", attrs={'class': "accordion__txt"})

# Collect the profile anchor of each entry. Entries without such an anchor are
# skipped so that later cells never receive None.
professors = []
for entry in text_block:
    profile_anchor = entry.find("a", attrs={'class': "sys_0 sys_t0"})
    if profile_anchor is not None:
        professors.append(profile_anchor)

# The browser session is not used again before the next department creates a
# new one, so close it instead of leaving a Chrome process running.
driver.quit()

In [None]:
from urllib.parse import urljoin

# Turn the collected anchors into absolute profile URLs, de-duplicated while
# keeping first-seen order.
professor_urls = []
seen_urls = set()
for professor in professors:
    # Anchors may be None (entry without a profile link) or lack an href.
    href = professor.get("href") if professor is not None else None
    if not href:
        continue
    # urljoin leaves already-absolute hrefs alone and prefixes relative ones.
    url = urljoin("https://www.lse.ac.uk", href)
    if url not in seen_urls:
        seen_urls.add(url)
        professor_urls.append(url)

In [None]:
# Visit every profile page and collect one value per professor into parallel
# lists; the lists become the columns of professors_df. A field that cannot be
# found on a page is recorded as None so all lists stay the same length.
professor_name_list = []
professor_prefix_list = []
key_expertise_list = []
professor_title_list = []
languages_list = []
modules_list = []

for one_url in professor_urls:

    r_2 = requests.get(one_url)
    soup_2 = BeautifulSoup(r_2.content, 'lxml')

    # Name and prefix: the prefix (e.g. "Dr", "Professor") sits in a span inside the <h1>.
    professor = soup_2.find("h1", attrs={'class': 'people__name'})
    prefix_tag = professor.find('span', class_='people__title') if professor is not None else None
    professor_prefix = prefix_tag.text if prefix_tag is not None else None
    professor_name = professor.text.strip() if professor is not None else None
    # Remove the prefix as a leading substring. str.strip("Professor") would
    # strip any of those *characters* from both ends and eat into the name.
    if professor_name and professor_prefix:
        prefix_text = professor_prefix.strip()
        if professor_name.startswith(prefix_text):
            professor_name = professor_name[len(prefix_text):].strip()
    professor_name_list.append(professor_name)
    professor_prefix_list.append(professor_prefix)

    # Key expertise: the value is the <div> that follows the "Key Expertise" label.
    key_expertise_locate = soup_2.find('div', class_='peopleContact__method', text="Key Expertise")
    key_expertise_tag = key_expertise_locate.find_next_sibling('div') if key_expertise_locate is not None else None
    key_expertise_list.append(key_expertise_tag.text if key_expertise_tag is not None else None)

    # Languages: same label/value layout as key expertise.
    languages_locate = soup_2.find('div', class_='peopleContact__method', text="Languages")
    languages_tag = languages_locate.find_next_sibling('div') if languages_locate is not None else None
    languages_list.append(languages_tag.text if languages_tag is not None else None)

    # Professor title (position heading).
    title_tag = soup_2.find('h2', class_='people__position')
    professor_title_list.append(title_tag.text if title_tag is not None else None)

    # Courses: the first <ul> after a "Teaching" heading/paragraph lists the modules.
    teaching = soup_2.find(name=["h3", "h2", "p"], text=["Teaching", "Teaching:"])
    module_list = teaching.find_next('ul') if teaching is not None else None
    if module_list is not None:
        modules = [li.text.replace('\xa0', " ").strip("\n") for li in module_list.find_all('li')]
        modules_list.append(modules)
    else:
        modules_list.append(None)

professors_dict = {
    "Professor Name": professor_name_list,
    "Professor Prefix": professor_prefix_list,
    "Key Expertise": key_expertise_list,
    "Languages": languages_list,
    "Title": professor_title_list,
    "Modules": modules_list,
}

professors_df = pd.DataFrame(professors_dict)

### Mathematics Department

In [None]:
# Start a fresh Chrome session on the LSE departments index page.
driver = webdriver.Chrome()
driver.get("https://info.lse.ac.uk/Staff/Departments-and-Institutes")

# One waiter (timeout of 15 per lookup) is shared by every step below.
wait = WebDriverWait(driver, 15)

# Step 1 - open the Mathematics department page (scroll into view, then click).
department = wait.until(
    EC.element_to_be_clickable((By.LINK_TEXT, 'Department of Mathematics'))
)
driver.execute_script("arguments[0].scrollIntoView();", department)
department.click()

# Step 2 - follow the "People" link and record the browser URL right after the click.
people = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'People')))
people.click()
people_url = driver.current_url

# Step 3 - open the "Academic Faculty" listing (scroll into view, then click).
academic_faculty = wait.until(
    EC.element_to_be_clickable((By.LINK_TEXT, 'Academic Faculty'))
)
driver.execute_script("arguments[0].scrollIntoView();", academic_faculty)
academic_faculty.click()

In [None]:
# Fetch the page the browser is currently on with requests and parse it.
url = driver.current_url
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
academic_faculty = soup.find("div", attrs={'class': "accordion__content"})
if academic_faculty is None:
    raise RuntimeError(f"No 'accordion__content' section found at {url}")
img_text = academic_faculty.find_all("div", attrs={'class': "accordion__imgTxt"})

# Each entry may carry its profile link under either of two anchor classes;
# both are collected, in that order, when present.
professors = []
for entry in img_text:
    one_text_block = entry.find("div", attrs={'class': "accordion__txt"})
    if one_text_block is None:
        continue
    for link_class in ("sys_16", "sys_0 sys_t0"):
        anchor = one_text_block.find("a", attrs={"class": link_class})
        if anchor is not None:
            professors.append(anchor)

# Keep only links that point at LSE pages; anchors without an href are dropped.
filtered_professors = [
    professor
    for professor in professors
    if professor.get('href', '').startswith(('http://www.lse.ac.uk', '/Mathematics'))
]

# The browser session is not used again before the next department creates a
# new one, so close it instead of leaving a Chrome process running.
driver.quit()

In [None]:
# Resolve each kept anchor to a full URL and record it once, in first-seen order.
professor_urls = []
for professor in filtered_professors:
    href = professor.get("href")
    # Site-relative Mathematics links get the host prepended; others are used as-is.
    url = "http://www.lse.ac.uk" + href if href.startswith('/Mathematics') else href
    if url in professor_urls:
        continue
    professor_urls.append(url)

In [None]:
# Visit every profile page and collect one value per professor into parallel
# lists; the lists become the columns of professors_df. A field that cannot be
# found on a page is recorded as None so all lists stay the same length.
professor_name_list = []
professor_prefix_list = []
key_expertise_list = []
professor_title_list = []
languages_list = []
modules_list = []

for one_url in professor_urls:

    r_2 = requests.get(one_url)
    soup_2 = BeautifulSoup(r_2.content, 'lxml')

    # Name and prefix: the prefix (e.g. "Dr", "Professor") sits in a span inside the <h1>.
    professor = soup_2.find("h1", attrs={'class': 'people__name'})
    prefix_tag = professor.find('span', class_='people__title') if professor is not None else None
    professor_prefix = prefix_tag.text if prefix_tag is not None else None
    professor_name = professor.text.strip() if professor is not None else None
    # Remove the prefix as a leading substring. str.strip("Professor") would
    # strip any of those *characters* from both ends and eat into the name.
    if professor_name and professor_prefix:
        prefix_text = professor_prefix.strip()
        if professor_name.startswith(prefix_text):
            professor_name = professor_name[len(prefix_text):].strip()
    professor_name_list.append(professor_name)
    professor_prefix_list.append(professor_prefix)

    # Key expertise: the value is the <div> that follows the "Key Expertise" label.
    key_expertise_locate = soup_2.find('div', class_='peopleContact__method', text="Key Expertise")
    key_expertise_tag = key_expertise_locate.find_next_sibling('div') if key_expertise_locate is not None else None
    key_expertise_list.append(key_expertise_tag.text if key_expertise_tag is not None else None)

    # Languages: same label/value layout as key expertise.
    languages_locate = soup_2.find('div', class_='peopleContact__method', text="Languages")
    languages_tag = languages_locate.find_next_sibling('div') if languages_locate is not None else None
    languages_list.append(languages_tag.text if languages_tag is not None else None)

    # Professor title (position heading).
    title_tag = soup_2.find('h2', class_='people__position')
    professor_title_list.append(title_tag.text if title_tag is not None else None)

    # Courses: the first <ul> after a "Teaching" heading/paragraph lists the modules.
    teaching = soup_2.find(name=["h3", "h2", "p"], text=["Teaching", "Teaching:"])
    module_list = teaching.find_next('ul') if teaching is not None else None
    if module_list is not None:
        modules = [li.text.replace('\xa0', " ").strip("\n") for li in module_list.find_all('li')]
        modules_list.append(modules)
    else:
        modules_list.append(None)

professors_dict = {
    "Professor Name": professor_name_list,
    "Professor Prefix": professor_prefix_list,
    "Key Expertise": key_expertise_list,
    "Languages": languages_list,
    "Title": professor_title_list,
    "Modules": modules_list,
}

professors_df = pd.DataFrame(professors_dict)

### Finance Department

In [None]:
# Start a fresh Chrome session on the LSE departments index page.
driver = webdriver.Chrome()
driver.get("https://info.lse.ac.uk/Staff/Departments-and-Institutes")

# One waiter (timeout of 15 per lookup) is shared by every step below.
wait = WebDriverWait(driver, 15)

# Step 1 - open the Finance department page (scroll into view, then click).
department = wait.until(
    EC.element_to_be_clickable((By.LINK_TEXT, 'Department of Finance'))
)
driver.execute_script("arguments[0].scrollIntoView();", department)
department.click()

# Step 2 - follow the "People" link and record the browser URL right after the click.
people = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'People')))
people.click()
people_url = driver.current_url

# Step 3 - open the faculty listing; on this site the link text is "Finance faculty".
academic_faculty = wait.until(
    EC.element_to_be_clickable((By.LINK_TEXT, 'Finance faculty'))
)
driver.execute_script("arguments[0].scrollIntoView();", academic_faculty)
academic_faculty.click()

In [None]:
# Fetch the page the browser is currently on with requests and parse it with
# BeautifulSoup; from here on Selenium is not needed for this department.
url = driver.current_url
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')

# Only the first "accordion__content" section is inspected (presumably the
# faculty list - verify against the live page).
academic_faculty = soup.find("div", attrs={'class': "accordion__content"})
if academic_faculty is None:
    raise RuntimeError(f"No 'accordion__content' section found at {url}")
text_block = academic_faculty.find_all("div", attrs={'class': "accordion__txt"})

# Collect the profile anchor of each entry. Entries without such an anchor are
# skipped so that later cells never receive None.
professors = []
for entry in text_block:
    profile_anchor = entry.find("a", attrs={'class': "sys_0 sys_t0"})
    if profile_anchor is not None:
        professors.append(profile_anchor)

# The browser session is not used again before the next department creates a
# new one, so close it instead of leaving a Chrome process running.
driver.quit()

In [None]:
from urllib.parse import urljoin

# Turn the collected anchors into absolute profile URLs, de-duplicated while
# keeping first-seen order.
professor_urls = []
seen_urls = set()
for professor in professors:
    # Anchors may be None (entry without a profile link) or lack an href.
    href = professor.get("href") if professor is not None else None
    if not href:
        continue
    # urljoin leaves already-absolute hrefs alone and prefixes relative ones.
    url = urljoin("https://www.lse.ac.uk", href)
    if url not in seen_urls:
        seen_urls.add(url)
        professor_urls.append(url)

In [None]:
# Visit every profile page and collect one value per professor into parallel
# lists; the lists become the columns of professors_df. A field that cannot be
# found on a page is recorded as None so all lists stay the same length.
professor_name_list = []
professor_prefix_list = []
key_expertise_list = []
professor_title_list = []
languages_list = []
modules_list = []

for one_url in professor_urls:

    r_2 = requests.get(one_url)
    soup_2 = BeautifulSoup(r_2.content, 'lxml')

    # Name and prefix: the prefix (e.g. "Dr", "Professor") sits in a span inside the <h1>.
    professor = soup_2.find("h1", attrs={'class': 'people__name'})
    prefix_tag = professor.find('span', class_='people__title') if professor is not None else None
    professor_prefix = prefix_tag.text if prefix_tag is not None else None
    professor_name = professor.text.strip() if professor is not None else None
    # Remove the prefix as a leading substring. str.strip("Professor") would
    # strip any of those *characters* from both ends and eat into the name.
    if professor_name and professor_prefix:
        prefix_text = professor_prefix.strip()
        if professor_name.startswith(prefix_text):
            professor_name = professor_name[len(prefix_text):].strip()
    professor_name_list.append(professor_name)
    professor_prefix_list.append(professor_prefix)

    # Key expertise: here it is read from the <p> following the
    # "Research Interests" heading and split into one item per line.
    key_expertise_locate = soup_2.find('h2', text="Research Interests")
    key_expertise = key_expertise_locate.find_next_sibling('p') if key_expertise_locate is not None else None
    if key_expertise is not None:
        text = key_expertise.get_text(separator='\n')
        key_expertise_inner_list = text.strip().split('\n')
        key_expertise_list.append(key_expertise_inner_list)
    else:
        key_expertise_list.append(None)

    # Languages: the value is the <div> that follows the "Languages" label.
    languages_locate = soup_2.find('div', class_='peopleContact__method', text="Languages")
    languages_tag = languages_locate.find_next_sibling('div') if languages_locate is not None else None
    languages_list.append(languages_tag.text if languages_tag is not None else None)

    # Professor title (position heading).
    title_tag = soup_2.find('h2', class_='people__position')
    professor_title_list.append(title_tag.text if title_tag is not None else None)

    # Courses: the first <ul> after a "Teaching" heading/paragraph lists the modules.
    teaching = soup_2.find(name=["h3", "h2", "p"], text=["Teaching", "Teaching:"])
    module_list = teaching.find_next('ul') if teaching is not None else None
    if module_list is not None:
        modules = [li.text.replace('\xa0', " ").strip("\n") for li in module_list.find_all('li')]
        modules_list.append(modules)
    else:
        modules_list.append(None)

professors_dict = {
    "Professor Name": professor_name_list,
    "Professor Prefix": professor_prefix_list,
    "Key Expertise": key_expertise_list,
    "Languages": languages_list,
    "Title": professor_title_list,
    "Modules": modules_list,
}

professors_df = pd.DataFrame(professors_dict)

### Statistics Department

In [None]:
# Start a fresh Chrome session on the LSE departments index page.
driver = webdriver.Chrome()
driver.get("https://info.lse.ac.uk/Staff/Departments-and-Institutes")

# One waiter (timeout of 15 per lookup) is shared by every step below.
wait = WebDriverWait(driver, 15)

# Step 1 - open the Statistics department page (scroll into view, then click).
department = wait.until(
    EC.element_to_be_clickable((By.LINK_TEXT, 'Department of Statistics'))
)
driver.execute_script("arguments[0].scrollIntoView();", department)
department.click()

# Step 2 - follow the "People" link and record the browser URL right after the click.
people = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'People')))
people.click()
people_url = driver.current_url

# Step 3 - open the faculty listing; on this site the link text is "Academic faculty".
academic_faculty = wait.until(
    EC.element_to_be_clickable((By.LINK_TEXT, 'Academic faculty'))
)
driver.execute_script("arguments[0].scrollIntoView();", academic_faculty)
academic_faculty.click()

In [None]:
# Fetch the page the browser is currently on with requests and parse it with
# BeautifulSoup; from here on Selenium is not needed for this department.
url = driver.current_url
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')

# Only the first "accordion__content" section is inspected (presumably the
# faculty list - verify against the live page).
academic_faculty = soup.find("div", attrs={'class': "accordion__content"})
if academic_faculty is None:
    raise RuntimeError(f"No 'accordion__content' section found at {url}")
text_block = academic_faculty.find_all("div", attrs={'class': "accordion__txt"})

# Collect the profile anchor of each entry. Entries without such an anchor are
# skipped so that later cells never receive None.
professors = []
for entry in text_block:
    profile_anchor = entry.find("a", attrs={'class': "sys_0 sys_t0"})
    if profile_anchor is not None:
        professors.append(profile_anchor)

# No later cell uses this browser session, so close it instead of leaving a
# Chrome process running.
driver.quit()

In [None]:
from urllib.parse import urljoin

# Turn the collected anchors into absolute profile URLs, de-duplicated while
# keeping first-seen order.
professor_urls = []
seen_urls = set()
for professor in professors:
    # Anchors may be None (entry without a profile link) or lack an href.
    href = professor.get("href") if professor is not None else None
    if not href:
        continue
    # urljoin leaves already-absolute hrefs alone and prefixes relative ones.
    url = urljoin("https://www.lse.ac.uk", href)
    if url not in seen_urls:
        seen_urls.add(url)
        professor_urls.append(url)

In [None]:
# Visit every profile page and collect one value per professor into parallel
# lists; the lists become the columns of professors_df. A field that cannot be
# found on a page is recorded as None so all lists stay the same length.
professor_name_list = []
professor_prefix_list = []
key_expertise_list = []
professor_title_list = []
languages_list = []
modules_list = []

# Module codes are recognised as "ST" followed by three digits.
module_code_pattern = re.compile(r"ST\d{3}")

for one_url in professor_urls:

    r_2 = requests.get(one_url)
    soup_2 = BeautifulSoup(r_2.content, 'lxml')

    # Name and prefix: the prefix (e.g. "Dr", "Professor") sits in a span inside the <h1>.
    professor = soup_2.find("h1", attrs={'class': 'people__name'})
    prefix_tag = professor.find('span', class_='people__title') if professor is not None else None
    professor_prefix = prefix_tag.text if prefix_tag is not None else None
    professor_name = professor.text.strip() if professor is not None else None
    # Remove the prefix as a leading substring. str.strip("Professor") would
    # strip any of those *characters* from both ends and eat into the name.
    if professor_name and professor_prefix:
        prefix_text = professor_prefix.strip()
        if professor_name.startswith(prefix_text):
            professor_name = professor_name[len(prefix_text):].strip()
    professor_name_list.append(professor_name)
    professor_prefix_list.append(professor_prefix)

    # Key expertise: the value is the <div> that follows the "Key Expertise" label.
    key_expertise_locate = soup_2.find('div', class_='peopleContact__method', text="Key Expertise")
    key_expertise_tag = key_expertise_locate.find_next_sibling('div') if key_expertise_locate is not None else None
    key_expertise_list.append(key_expertise_tag.text if key_expertise_tag is not None else None)

    # Languages: same label/value layout as key expertise.
    languages_locate = soup_2.find('div', class_='peopleContact__method', text="Languages")
    languages_tag = languages_locate.find_next_sibling('div') if languages_locate is not None else None
    languages_list.append(languages_tag.text if languages_tag is not None else None)

    # Professor title (position heading).
    title_tag = soup_2.find('h2', class_='people__position')
    professor_title_list.append(title_tag.text if title_tag is not None else None)

    # Courses: every module code mentioned anywhere in the biography text.
    # A page without a biography block yields None rather than an empty list.
    bio_tag = soup_2.find('div', class_='people__bio')
    if bio_tag is not None:
        modules = module_code_pattern.findall(bio_tag.text)
        modules_list.append(modules)
    else:
        modules_list.append(None)

professors_dict = {
    "Professor Name": professor_name_list,
    "Professor Prefix": professor_prefix_list,
    "Key Expertise": key_expertise_list,
    "Languages": languages_list,
    "Title": professor_title_list,
    "Modules": modules_list,
}

# Build the frame here as the other departments do; without this line
# professors_df would still hold the previous department's data.
professors_df = pd.DataFrame(professors_dict)

## Part 2: BeautifulSoup Only (No Selenium)

### Economics Department

In [13]:
from urllib.parse import urljoin

# Scrape the Economics faculty listing and then each linked profile page into
# the parallel lists names / titles / key_exp / langs / mods.
ec_link = 'https://www.lse.ac.uk/economics/people/faculty'

response_html = requests.get(ec_link)
main_soup = BeautifulSoup(response_html.text, 'lxml')


module_codes = []
ec_staff = []
names = []
titles = []
key_exp = []
langs = []
mods = []

# Only the first 70 profile links on the listing page are followed.
MAX_FACULTY_LINKS = 70
# Only the first four paragraphs of the biography are searched for "Teaching".
MAX_BIO_PARAGRAPHS = 4
# One "XX123: title" entry, ending where the next module code (or the end of
# the text) begins. Group 1 is the whole entry, group 2 just the code.
MODULE_PATTERN = re.compile(r"(([A-Z]{2}\d{3}): .+?(?=[A-Z]{2}\d{3}|$))")

# Slicing avoids an IndexError when the page has fewer links than the cap.
for element in main_soup.find_all('a', {'class': 'sys_0 sys_t0'})[:MAX_FACULTY_LINKS]:
    href = element.get('href')
    if href:
        # urljoin avoids the double slash produced by plain concatenation.
        ec_staff.append(urljoin('https://www.lse.ac.uk/', href))

for staff_link in ec_staff:
    response_html = requests.get(staff_link)
    soup = BeautifulSoup(response_html.text, 'lxml')

    # Name: text of the first <h1> on the page.
    heading = soup.find('h1')
    names.append(heading.get_text() if heading is not None else 'NA')

    # Title: text of the first <h2> on the page.
    position = soup.find('h2')
    titles.append(position.get_text() if position is not None else 'NA')

    # Key Expertise / Languages are taken by position from the contact block.
    # NOTE(review): this assumes the last two entries are Languages followed by
    # Key Expertise; a page lacking either would be mislabelled - confirm.
    contact_values = soup.find_all('div', attrs={'class': 'peopleContact__address'})
    key_exp.append(contact_values[-1].get_text() if len(contact_values) >= 1 else 'NA')
    langs.append(contact_values[-2].get_text() if len(contact_values) >= 2 else 'NA')

    # Modules: use the first of the leading bio paragraphs that mentions
    # "Teaching"; 'NA' when none does.
    bio_blocks = soup.find_all('div', attrs={'class': 'people__bio'})
    paragraphs = bio_blocks[0].find_all('p') if bio_blocks else []
    module_code = 'NA'
    mod_txt = None
    for index in range(MAX_BIO_PARAGRAPHS):
        if index >= len(paragraphs):
            # Ran out of paragraphs before finding "Teaching". The diagnostic
            # label counts down from "4" (no paragraph at all) to "1" (only
            # the fourth is missing), followed by the last paragraph examined.
            label = str(MAX_BIO_PARAGRAPHS - index)
            if mod_txt is None:
                print(label, staff_link)
            else:
                print(label, staff_link, mod_txt)
            break
        mod_txt = paragraphs[index].get_text().replace('\xa0', ' ')
        if 'Teaching' in mod_txt:
            module_codes = MODULE_PATTERN.findall(mod_txt)
            module_code = [entry for entry, _code in module_codes]
            break
    mods.append(module_code)

1 https://www.lse.ac.uk//economics/people/faculty/keyu-jin WebpagesPersonal  |  LSE Experts  |  CFM
1 https://www.lse.ac.uk//economics/people/faculty/jonathan-leape WebpagesPersonal  |  LSE Experts  |  IGC
1 https://www.lse.ac.uk//economics/people/faculty/john-moore WebpagesLSE Experts  |  STICERD
2 https://www.lse.ac.uk//economics/people/faculty/junius-olivier WebpagesPersonal
3 https://www.lse.ac.uk//economics/people/faculty/nicolo-rosetti EducationPhD in Economics, Kyoto University
1 https://www.lse.ac.uk//economics/people/faculty/thomas-sampson WebpagesPersonal  |  CEP  |  SPP
1 https://www.lse.ac.uk//economics/people/faculty/judith-shapiro WebpagesPersonal
1 https://www.lse.ac.uk//economics/people/faculty/johannes-spinnewijn WebpagesPersonal  |  CEP  |  STICERD  |  SPP
1 https://www.lse.ac.uk//economics/people/faculty/silvana-tenreyro WebpagesPersonal  |  CFM  |  IGC


In [14]:
# Hand-entered module lists, keyed by the staff member's position in `mods`,
# that replace whatever the scraper produced for those entries.
manual_modules = {
    4: ['EC201: Microeconomic Principles I', 'PP450 Public Organisations: Theory and Practice'],
    12: ['EC417 Advanced Macroeconomics', 'EC539 Macroeconomics for Research Students'],
    19: ['EC402: Econometrics', 'EC443: Econometrics for MRes Students'],
    20: ['EC1B1 Macroeconomics I', 'EC539 Macroeconomics for Research Students'],
    26: ['EC202 Microeconomic Principles II', 'EC230 Economics in Public Policy'],
    35: ['EC1B3 Macroeconomics l'],
}
for position, module_titles in manual_modules.items():
    mods[position] = module_titles

In [15]:
# Assemble the scraped Economics columns into one frame. The frame is built
# directly; concatenating it onto an empty DataFrame added nothing.
data_dict = {
    'Name': names,
    'Title': titles,
    'Key Expertise': key_exp,
    'Languages': langs,
    'Modules': mods,
}

econ_df = pd.DataFrame(data_dict)

### Methodology Department

In [None]:
from urllib.parse import urljoin

# Scrape the Methodology people page and each linked profile into the parallel
# lists names / titles / langs / key_exp / mods.
my_link = "https://www.lse.ac.uk/Methodology/People"
response_html = requests.get(my_link)
soup = BeautifulSoup(response_html.text, 'lxml')

staff_links = []

# Only the first accordion panel is used.
panels = soup.find_all("div", attrs={"class": "accordion__panel"})
if not panels:
    raise RuntimeError(f"No 'accordion__panel' section found at {my_link}")
all_links = panels[0].find_all("a", attrs={"class": "sys_0 sys_t0"})

for person in all_links:
    href = person.get("href")
    if href:
        # urljoin leaves already-absolute hrefs alone and prefixes relative ones.
        staff_links.append(urljoin("https://www.lse.ac.uk", href))

names = []
titles = []
langs = []
key_exp = []
mods = []

# Iterate through all staff members' pages; a field that cannot be found is
# recorded as 'NA' so all lists stay the same length.
for link in staff_links:

    response_html = requests.get(link)
    soup = BeautifulSoup(response_html.text, 'lxml')

    # Name: text of the first <h1>.
    heading = soup.find('h1')
    name = heading.get_text() if heading is not None else 'NA'
    names.append(name)

    # Title: the first word of that heading.
    heading_words = name.split()
    titles.append(heading_words[0] if heading_words else 'NA')

    # Languages / Key Expertise are taken by position from the contact block.
    # NOTE(review): this assumes the last two entries are Languages followed by
    # Key Expertise; a page lacking either would be mislabelled - confirm.
    contact_values = soup.find_all('div', attrs={'class': 'peopleContact__address'})
    langs.append(contact_values[-2].get_text() if len(contact_values) >= 2 else 'NA')
    key_exp.append(contact_values[-1].get_text() if len(contact_values) >= 1 else 'NA')

    # Modules are not scraped for this department.
    mods.append('NA')
    

In [None]:
# Assemble the scraped Methodology columns into one frame. The frame is built
# directly; concatenating it onto an empty DataFrame added nothing.
data_dict = {
    'Name': names,
    'Title': titles,
    'Key Expertise': key_exp,
    'Languages': langs,
    'Modules': mods,
}

my_df = pd.DataFrame(data_dict)
my_df

### Management Department

In [None]:
from urllib.parse import urljoin

# Scrape the Management people page and each linked profile into the parallel
# lists names / titles / langs / key_exp / mods.
my_link = "https://www.lse.ac.uk/management/people-home"
response_html = requests.get(my_link)

soup = BeautifulSoup(response_html.text, 'lxml')

staff_links = []

# Links are taken from the first six accordion panels. The panels are looked
# up once, and slicing tolerates a page with fewer than six.
panels = soup.find_all("div", attrs={"class": "accordion__panel"})
for panel in panels[:6]:
    for person in panel.find_all("a", attrs={"class": "sys_0 sys_t0"}):
        href = person.get("href")
        if href:
            # urljoin leaves already-absolute hrefs alone and prefixes relative ones.
            staff_links.append(urljoin("https://www.lse.ac.uk", href))

names = []
titles = []
langs = []
key_exp = []
mods = []

# Visit each profile; a field that cannot be found is recorded as 'NA' so all
# lists stay the same length.
for person in staff_links:
    response_html = requests.get(person)
    soup = BeautifulSoup(response_html.text, 'lxml')

    # Name: text of the first <h1>.
    heading = soup.find('h1')
    name = heading.get_text() if heading is not None else 'NA'
    names.append(name)

    # Title: the first word of that heading.
    heading_words = name.split()
    titles.append(heading_words[0] if heading_words else 'NA')

    # Languages / Key Expertise are taken by position from the contact block.
    # NOTE(review): this assumes the last two entries are Languages followed by
    # Key Expertise; a page lacking either would be mislabelled - confirm.
    contact_values = soup.find_all('div', attrs={'class': 'peopleContact__address'})
    langs.append(contact_values[-2].get_text() if len(contact_values) >= 2 else 'NA')
    key_exp.append(contact_values[-1].get_text() if len(contact_values) >= 1 else 'NA')

    # Modules are not scraped for this department.
    mods.append('NA')

In [None]:
# Assemble the scraped Management columns into one frame. The frame is built
# directly; concatenating it onto an empty DataFrame added nothing.
data_dict = {
    'Name': names,
    'Title': titles,
    'Key Expertise': key_exp,
    'Languages': langs,
    'Modules': mods,
}

mg_df = pd.DataFrame(data_dict)
mg_df

### Data Science Institute

In [8]:
from urllib.parse import urljoin

# Scrape the Data Science Institute people page and each linked profile into
# the parallel lists names / titles / langs / key_exp / mods.
my_link = "https://www.lse.ac.uk/DSI/People"
response_html = requests.get(my_link)

soup = BeautifulSoup(response_html.text, 'lxml')

# Only the first "accordion__content" section is used.
first_section = soup.find("div", attrs={"class": "accordion__content"})
if first_section is None:
    raise RuntimeError(f"No 'accordion__content' section found at {my_link}")

staff_links = []
for person in first_section.find_all("a", attrs={"class": "sys_0 sys_t0"}):
    href = person.get("href")
    if href:
        # urljoin leaves already-absolute hrefs alone and prefixes relative ones.
        staff_links.append(urljoin("https://www.lse.ac.uk", href))

names = []
titles = []
langs = []
key_exp = []
mods = []

# Visit each profile; a field that cannot be found is recorded as 'NA' so all
# lists stay the same length.
for person in staff_links:
    response_html = requests.get(person)
    soup = BeautifulSoup(response_html.text, 'lxml')

    # Name: text of the first <h1>.
    heading = soup.find('h1')
    name = heading.get_text() if heading is not None else 'NA'
    names.append(name)

    # Title: the first word of that heading.
    heading_words = name.split()
    titles.append(heading_words[0] if heading_words else 'NA')

    # Languages / Key Expertise are taken by position from the contact block.
    # NOTE(review): this assumes the last two entries are Languages followed by
    # Key Expertise; a page lacking either would be mislabelled - confirm.
    contact_values = soup.find_all('div', attrs={'class': 'peopleContact__address'})
    langs.append(contact_values[-2].get_text() if len(contact_values) >= 2 else 'NA')
    key_exp.append(contact_values[-1].get_text() if len(contact_values) >= 1 else 'NA')

    # Modules are not scraped automatically for this department.
    mods.append('NA')

# adding the courses manually based on the website
# NOTE(review): 'DS101' is listed twice - confirm whether one entry should be
# a different course code.
mods[2] = ['DS101','DS101','DS202']

In [None]:
# Assemble the scraped Data Science Institute columns into one frame. The
# frame is built directly; concatenating it onto an empty DataFrame added nothing.
data_dict = {
    'Name': names,
    'Title': titles,
    'Key Expertise': key_exp,
    'Languages': langs,
    'Modules': mods,
}

dsi_df = pd.DataFrame(data_dict)
dsi_df

## Exporting

In [None]:
# Write every department's frame to ../data as CSV.
#
# NOTE(review): professors_df is rebound by each Part 1 department cell that
# builds it, so when this cell runs it only holds the most recently built
# frame. Run the matching two lines right after each department's scrape
# (or keep a separate frame per department); otherwise the four Part 1 files
# below all receive the same data.
file_path = os.path.join('../data', 'accounting.csv')
professors_df.to_csv(file_path)

file_path = os.path.join('../data', 'maths.csv')
professors_df.to_csv(file_path)

file_path = os.path.join('../data', 'finance.csv')
professors_df.to_csv(file_path)

file_path = os.path.join('../data', 'stats.csv')
professors_df.to_csv(file_path)

# Part 2 departments each have their own frame.
file_path = os.path.join('../data', 'economics.csv')
econ_df.to_csv(file_path)

file_path = os.path.join('../data', 'methodology.csv')
my_df.to_csv(file_path)

file_path = os.path.join('../data', 'management.csv')
mg_df.to_csv(file_path)

file_path = os.path.join('../data', 'datascienceinstitute.csv')
dsi_df.to_csv(file_path)