# UCL MSc - Module Scrape
## Preamble
Import modules

In [1]:
import CompSciScrape as cs
import pandas as pd

## Scraping the Computer Science Module Directory
First, a database is built of all postgraduate modules listed in the Computer Science department's 2017/18 directory. Note that statistics modules were found not to be listed in detail on the UCL website; only reading lists appear to be consistently available across modules from different departments.

In [2]:
# URL of the department site (parentURL) and of the postgraduate module
# directory page to scrape (targetURL)
parentURL = 'http://www.cs.ucl.ac.uk/'
targetURL = 'http://www.cs.ucl.ac.uk/current_students/syllabus/pg/'

# Collect the soup for the directory page
# NOTE(review): presumably cs.collectSoup downloads the page and returns parsed
# HTML - confirm in CompSciScrape
modules_soup = cs.collectSoup(targetURL)

# Find all module tags and parse modules into a pandas dataframe.
# parentURL is passed alongside the soup; presumably it is used to build the
# absolute URLs shown in the Link column - confirm in cs.parseModuleList
modules_directory = cs.parseModuleList(modules_soup, parentURL)
# Preview the first rows of the directory
modules_directory.head()

Unnamed: 0,Code,Module,Link
0,COMPG001,Financial Data and Statistics,http://www.cs.ucl.ac.uk/current_students/sylla...
1,COMPG004,Market Risk Measures and Portfolio Theory,http://www.cs.ucl.ac.uk/current_students/sylla...
2,COMPG005,Numerical Analysis for Finance,http://www.cs.ucl.ac.uk/current_students/sylla...
3,COMPG007,Operational Risk Measurement for Financial Ins...,http://www.cs.ucl.ac.uk/current_students/sylla...
4,COMPG008,Stochastic Processes for Finance,http://www.cs.ucl.ac.uk/current_students/sylla...


## Loading the Target Modules List
Second, a database of modules available on our target MSc courses is loaded.

In [3]:
# File location of the CSV listing the modules available on the target MSc courses
# NOTE(review): a bare filename, so presumably resolved relative to the
# notebook's working directory - confirm in cs.loadCSV
file = 'TargetModules.csv'

# Read the CSV file into a pandas dataframe
modules_targets = cs.loadCSV(file)
# Preview the first rows of the target list
modules_targets.head()

Unnamed: 0,Module,CS & ML,ML,Type
0,Supervised Learning,True,True,Core
1,Statistical Modelling and Data Analysis,True,False,Core
2,Graphical Models,True,True,Group One
3,Probabilistic and Unsupervised Learning,True,True,Group One
4,Advanced Deep Learning and Reinforcement Learning,True,True,Group Two


## Identify Links
The directory and target lists are joined to identify the URLs containing the relevant module information.

In [4]:
# The target list and the scraped directory are joined on the 'Module' column,
# i.e. on the text string title of the module.
# NOTE(review): the displayed result keeps target rows with no directory match,
# which suggests a left join - confirm in cs.joinDataframes
modules_targets_2 = cs.joinDataframes(modules_targets, modules_directory, 'Module')
# Display the full joined table
modules_targets_2

# Note NaN values (Code and Link) for statistics modules that don't appear on
# the Computer Science directory of modules

Unnamed: 0,Module,CS & ML,ML,Type,Code,Link
0,Supervised Learning,True,True,Core,COMPGI01,http://www.cs.ucl.ac.uk/current_students/sylla...
1,Statistical Modelling and Data Analysis,True,False,Core,,
2,Graphical Models,True,True,Group One,COMPGI08,http://www.cs.ucl.ac.uk/current_students/sylla...
3,Probabilistic and Unsupervised Learning,True,True,Group One,COMPGI18,http://www.cs.ucl.ac.uk/current_students/sylla...
4,Advanced Deep Learning and Reinforcement Learning,True,True,Group Two,COMPGI22,http://www.cs.ucl.ac.uk/current_students/sylla...
5,Advanced Topics in Machine Learning,True,True,Group Two,COMPGI13,http://www.cs.ucl.ac.uk/current_students/sylla...
6,Applied Machine Learning,True,True,Group Two,COMPGI09,http://www.cs.ucl.ac.uk/current_students/sylla...
7,Approximate Inference and Learning in Probabil...,True,True,Group Two,COMPGI16,http://www.cs.ucl.ac.uk/current_students/sylla...
8,Information Retrieval & Data Mining,True,True,Group Two,COMPGI15,http://www.cs.ucl.ac.uk/current_students/sylla...
9,Introduction to Deep Learning,True,True,Group Two,COMPGI23,http://www.cs.ucl.ac.uk/current_students/sylla...


## Pull Detail from Module Pages
The links acquired are visited to extract detailed data on each module.

In [7]:
# Visit every module page that has a link and collect its parsed details.
#
# Modules with no match in the Computer Science directory have a null Link and
# are skipped. Iterating the non-null links directly (rather than indexing with
# range(len(...)) and .at[i, ...]) avoids assuming that the joined dataframe's
# index labels are exactly 0..n-1.
#
# NOTE(review): cs.parseModuleDetails is assumed to return a pandas object that
# pd.concat accepts, as the original loop relied on - confirm in CompSciScrape.
detail_frames = []
for link in modules_targets_2['Link'].dropna():
    soup = cs.collectSoup(link)
    data = cs.parseModuleDetails(soup)
    detail_frames.append(data)

# Concatenate once at the end instead of re-copying the accumulated frame on
# every iteration. pd.concat raises ValueError on an empty list, so details
# stays None when no module had a link (same as the original's initial value).
details = pd.concat(detail_frames) if detail_frames else None

print(details)
        

TypeError: Could not compare [None] with block values