# 1. Data collection

### 1.1 Get the list of master's degree courses

In [1]:
from bs4 import BeautifulSoup
import os
import pandas as pd
import time
import random
import glob
from nltk.stem import *
from nltk.corpus import stopwords
from collections import Counter
from functools import reduce
import heapq

In [2]:
from defs import *

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Via the HTTP GET request we retrieve the content of the url of our interest, in our case the page that contains all the masters.

In [4]:
url = 'https://www.findamasters.com/masters-degrees/msc-degrees/'
result = requests.get(url)

To extract all the links of the master's degree of the first 400 pages we used a function, which is located in the *defs.py* file; then we store all the links in the file *masters_urls.txt*.

In [None]:
# to extract all the masters we have to do a for loop for the first 400 pages:

num_pages = 400
pref = 'https://www.findamasters.com'
test_lst_all = []

for i in range(1, num_pages + 1):
    test_lst = extract_masters(pref + '/masters-degrees/msc-degrees/?PG=' + str(i))
    test_lst_all.extend(test_lst)
# creating the txt file of the fisrt 400 pages of ms
with open('masters_urls.txt', 'w') as f:
    for item in test_lst_all:
        f.write(item[0] + '\n')
# file created 

### 1.2 Crawl master's degree pages

We are going to save each HTML page of each course in a different folder, one folder for each page it is in; so we'll obtain 400 folders, each of them will contain 15 HTML files. The urls are taken from the *masters_urls.txt* previusly created.


In [None]:
# settings for the User-Agent to simulate a browser
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
folder_name = "html_pages" # create the folder that will contain the html pages
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

for i in range(1, 401):
    # create a folder for each page, from 1 to 400
    name = 'HTML page ' + str(i)
    path_folder = os.path.join(folder_name, name)
    if not os.path.exists(path_folder):
        os.makedirs(path_folder)

# open the file containing the urls
with open('masters_urls.txt', 'r') as file:
    for index, url in enumerate(file):
        url = url.strip()
        page = (index // 15 ) + 1
        try:
            # the complete url
            full_url = "https://www.findamasters.com" + url.strip()

            # add a delay of 1 to 5 seconds between the requests
            time.sleep(1 + random.uniform(0, 4))
            # request to obtain the content of the url
            response = requests.get(full_url, headers=headers)
            
            if response.status_code == 200:
                # Parsing dell'HTML con BeautifulSoup
                soup = BeautifulSoup(response.content, 'html.parser')
                
                # save the html of the course in a separate file in the folder of the page it belongs to 
                file_path = os.path.join(f"{folder_name}\HTML page {page}", f"course {index+1}.html")
                with open(file_path, "w", encoding="utf-8") as html_file:
                    html_file.write(str(soup))

            else:
                print(f"Errore nel recuperare la pagina del corso: {full_url}")
        
        except Exception as e:
            print(f"Errore durante il recupero e salvataggio della pagina {full_url}: {e}")


### 1.3 Parse download pages

Through the *extcat_msc_page* function (located in the *defs.py*) we parse all the HTML we retrieved before and collect all the information for each master.

In [None]:
#Directory where there are the HTML repositories
html_folder = "\html-pages"
my_path = "D:\Primo Semestre\ADM\HW3"
#List to contain all the information
all_master_info = []

all_url = []

#Iterating whitin the repositories HTML of every page
for page_folder in os.listdir(my_path + html_folder):
    page_path = os.path.join(html_folder, page_folder)
    file_absolute_path = os.path.join(my_path + page_path)

    if os.path.isdir(file_absolute_path):
        #Iterating in the files HTML of every repository
        for file in os.listdir(file_absolute_path):
            if file.endswith(".html"):
                file_path = os.path.join(file_absolute_path, file)
                print('FILE PATH: ', file_path)
                #Applying the function extract_msc_page to every file HTML
                master_info = extract_msc_page(file_path) 
                all_master_info.extend(master_info)
              
print(all_master_info)

Then we're storing those information in a tsv file, one for each master. All the files are stored in a folder called 'Tsv files'. 

In [None]:
# creating the tsv file for each master

folder_name = "Tsv files"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
folder_path = "D:\Primo Semestre\ADM\HW3\Tsv files"

for i in range(0,len(all_master_info)):
    output_file = f"{folder_path}\course_{i+1}.tsv"
    with open(output_file, 'w', encoding='utf-8') as tsvfile:
    # Extract field names from the dictionaries in 'all_master_info'
        fieldnames = all_master_info[i].keys()
        tsvfile.write('\t'.join(fieldnames) + '\n')                  # write the header
        row = '\t'.join(str(all_master_info[i].get(field, '')) for field in fieldnames)
        tsvfile.write(row + '\n')

We can now create our dataframe, reading the data on all the tsv files we just created.

In [None]:
data_frames = []
file_name_list = glob.glob("Tsv files\course_*.tsv")    # take all the course_i.tsv files
for file in file_name_list:
    dF_tsv = pd.read_csv(file, sep='\t', header=0)      # create the data frame from the tsv file
    data_frames.append(dF_tsv)
# Concatenate all DataFrames in the list into a single DataFrame
dataset = pd.concat(data_frames, ignore_index=True)     # creating the whole dataframe from each tsv file

dataset = dataset[dataset.description != ''] # do not ocnsider all the rows that have an empty description

dataset.head()

**da eliminare poi questo**  perchè per semplicitò apriamo il dataset da un file json, ma in realtà dovremmo aprirlo dai file tsv

In [28]:
# opening the json file for the dataset (but it will be the tsv file)
path = r"D:\Primo Semestre\ADM\HW3\university_dataset.json"
dataset= pd.read_json(path)

# 2. Search Engine

### 2.0 Preprocessing

2.0.0 Preprocessing the text\
We created 3 functions, that are present in the *defs.py*, to perform the stemming, remove the stopwords and punctuation from the *description* field of our dataset.

In [30]:
# removing rows where the description is empty
dataset = dataset[dataset['description'].notna()]

In [None]:
# 1. stemming 
dataset['descr_stem'] = dataset['description'].apply(stem_description)

# 2. removing stopwords
dataset['description_clean'] = dataset['descr_stem'].apply(lambda row: remove_stopwords(row))

# 3. removing punctuation
dataset['description_clean'] = dataset['description_clean'].apply(lambda row: remove_punctuation(row))


2.0.1 preprocess the *fees* column

### 2.1 Conjuctive query

2.1.1 Create the index

We created the vocabulary assigning an unique ID to each word encoutered in the description field of the dataset, then created a csv file out of it, to store the information. 

In [33]:
# creating the vocabulary 
vocabulary = Counter(reduce(lambda x,y : x+y, dataset.description_clean)).keys()

# assign an unique ID to each word of the vocabulary using a pandas dataframe
terms = pd.DataFrame(data=list(vocabulary), columns=['term'])

terms
# creating a csv file for the vocabulary with index of each term
terms.to_csv('vocabulary.csv', index_label='term_id')

Now we can create the inverted index as a new column of the dataframe *terms* and store it in a txt file, called *Inverted Index.txt*

In [34]:
terms = pd.read_csv('vocabulary.csv')

In [35]:
terms['reverse'] = terms.term.apply(lambda item: list(dataset.loc[dataset.description_clean.apply(lambda row: item in row)].index))
terms.head()

Unnamed: 0,term_id,term,reverse
0,0,3d,"[0, 444, 508, 593, 594, 890, 1838, 2437, 2833,..."
1,1,visualis,"[0, 68, 70, 399, 741, 1283, 1299, 1430, 1431, ..."
2,2,anim,"[0, 9, 20, 241, 681, 969, 1028, 1029, 1030, 10..."
3,3,play,"[0, 16, 33, 70, 80, 182, 194, 273, 305, 318, 3..."
4,4,role,"[0, 35, 61, 70, 74, 80, 138, 161, 163, 172, 17..."


We now transform the inverted index in a dictionary in this form\
<code> {
term_id_1:[document_1, document_2, document_4],
term_id_2:[document_1, document_3, document_5, document_6],
    ...}
</code>

In [36]:
InvertedIndex = terms['reverse'].to_dict()
print(InvertedIndex) # the dictionary of the inverted index 

# store the inverted index in a txt file
with open('Inverted Index.txt', 'w') as file:

    for key, value in InvertedIndex.items():
        file.write(f'{key}: {value}\n')
file.close()

{0: [0, 444, 508, 593, 594, 890, 1838, 2437, 2833, 2835, 2836, 3917, 4006, 5000, 5764], 1: [0, 68, 70, 399, 741, 1283, 1299, 1430, 1431, 1434, 1672, 1687, 1724, 1725, 2118, 2267, 2322, 2613, 2620, 2637, 2643, 2686, 2696, 3963, 3992, 4197, 5120, 5122, 5440, 5510, 5570], 2: [0, 9, 20, 241, 681, 969, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1049, 1054, 1114, 1213, 1217, 1219, 1220, 1224, 1232, 1239, 1411, 1442, 1490, 1520, 1604, 1605, 1972, 2161, 2193, 2913, 2948, 2984, 3514, 3542, 3545, 3575, 3791, 3826, 3836, 3892, 3917, 4330, 4333, 4337, 4356, 4423, 4572, 4606, 4607, 4961, 5080, 5498, 5773, 5868, 5907, 5947], 3: [0, 16, 33, 70, 80, 182, 194, 273, 305, 318, 345, 389, 431, 448, 490, 503, 610, 628, 723, 780, 781, 801, 816, 822, 865, 912, 929, 957, 975, 1037, 1045, 1067, 1100, 1231, 1232, 1236, 1329, 1365, 1443, 1562, 1580, 1585, 1616, 1624, 1690, 1723, 1783, 1795, 1800, 1806, 1825, 1833, 1874, 1931, 2105, 2160, 2271, 2272, 2356, 2363, 2614, 2681,

In [37]:
# read back the inverted index from the file.

file = open("Inverted Index.txt", "r")

inv_indx = dict()
txt = file.read().split("\n")

for i in range(len(txt)-1):
    line = txt[i].replace(":", "").replace("[", "").replace("]", "").replace("(", "").replace(")", "").replace(",", "").split(" ")
    inv_indx[int(line[0])] = []
    for j in range(1, len(line)):
            inv_indx[int(line[0])].append(int(line[j]))
            
file.close()

2.1.2 Execute the query

We created a function called *query_preprocess* that preprocesses the query just like we did in the preprocess of the description field.

In [38]:
#query = str(input())
query = 'advanced knoledge'
# formatting the query 
query = query_preprocess(query) 
print(query)

['advanc', 'knoledg']


What we're going to do now to implement our Search Engine is:
- Find all the words of the query in the vocabulary and exctract each *term_id* of each word of the query.
- Find all the documents related to each *term_id* in the Inverted Index.
- Do the intersection of the lists of documents found.

In [39]:
vocabulary = pd.read_csv('vocabulary.csv') # read the vocabulary file into a dataframe
vocabulary = pd.DataFrame(vocabulary)

file = open("Inverted Index.txt", "r") # read the inverted index from the file.

inv_indx = dict()
txt = file.read().split("\n")

for i in range(len(txt)-1):
    line = txt[i].replace(":", "").replace("[", "").replace("]", "").replace("(", "").replace(")", "").replace(",", "").split(" ")
    inv_indx[int(line[0])] = []
    for j in range(1, len(line)):
            inv_indx[int(line[0])].append(int(line[j]))    
file.close()

# find the words of the query in the vocabulary
for w in query:
    term_ids = [vocabulary[vocabulary['term'] == w]['term_id'].values for w in query if w in vocabulary.term.values]

term_ids = [term_ids[x][0] for x in range(len(term_ids))] # exctract only the integers of the ids

# find the documents 
docs = [inv_indx[i] for i in term_ids]

# intersecting the two sets of documents we found out contain all the word of the query
intersection = list(set(docs[0]).intersection(*docs[1:]))
print('All the documents of the result:',intersection)

All the documents of the result: [1, 2, 4, 5, 6, 7, 8, 4103, 12, 4119, 4123, 4127, 4131, 36, 39, 42, 48, 4150, 4151, 4153, 4173, 4174, 4176, 87, 88, 4184, 4192, 97, 4198, 4200, 4206, 4207, 4208, 113, 115, 121, 126, 129, 4230, 4241, 4242, 148, 150, 4255, 170, 4270, 4272, 4273, 180, 4278, 4292, 197, 4293, 4296, 201, 4297, 203, 4300, 4307, 214, 215, 4311, 4313, 4318, 4326, 235, 4331, 237, 238, 241, 245, 248, 252, 254, 258, 4356, 4359, 266, 4363, 4367, 4369, 4370, 275, 4376, 4382, 4389, 296, 4395, 304, 4406, 4411, 337, 339, 341, 343, 344, 4442, 347, 4445, 357, 358, 4454, 4457, 4459, 4463, 370, 4469, 381, 382, 4478, 4479, 389, 4485, 4487, 392, 4491, 4501, 4503, 4504, 4505, 4506, 4507, 414, 417, 4514, 419, 4518, 4519, 4520, 445, 449, 4555, 461, 4557, 4558, 4559, 469, 4576, 482, 4578, 491, 497, 4597, 4599, 506, 507, 508, 4602, 4603, 4604, 515, 4612, 518, 519, 4615, 527, 4626, 534, 546, 4644, 555, 556, 569, 575, 585, 587, 591, 594, 4700, 607, 610, 611, 612, 613, 616, 617, 618, 619, 620, 622, 6

Now we can show the results of the query after it passed into the search engine

In [40]:
information_needed = ['courseName','universityName','description', 'url']
dataset.loc[intersection,information_needed] 

Unnamed: 0,courseName,universityName,description,url
1,Accounting and Finance - MSc,University of Leeds,Businesses and governments rely on sound finan...,https://www.findamasters.com/masters-degrees/c...
2,"Accounting, Accountability & Financial Managem...",King’s College London,"Our Accounting, Accountability & Financial Man...",https://www.findamasters.com/masters-degrees/c...
4,Addictions MSc,King’s College London,Join us for an online session for prospective ...,https://www.findamasters.com/masters-degrees/c...
5,Advanced Chemical Engineering - MSc,University of Leeds,The Advanced Chemical Engineering MSc at Leeds...,https://www.findamasters.com/masters-degrees/c...
6,Advanced Master in Financial Markets,Solvay Brussels School,Programme overviewThe Advanced Master in Finan...,https://www.findamasters.com/masters-degrees/c...
...,...,...,...,...
4019,Glaciology MSc by Research,Swansea University,The MSc by Research in Glaciology allows you t...,https://www.findamasters.com/masters-degrees/c...
4022,Global Ageing MSc (Online),University of Stirling,"According to the WHO, between 2015 and 2050 th...",https://www.findamasters.com/masters-degrees/c...
4023,Global Biodiversity Conservation - MSc,University of Sussex,This MSc will give you advanced knowledge and ...,https://www.findamasters.com/masters-degrees/c...
4069,Global Health MSc,"St George’s, University of London",Significant socioeconomic and environmental ch...,https://www.findamasters.com/masters-degrees/c...


## 2.2 Conjunctive query & Ranking score

2.2.1 Inverted Index

We are now going to implement a ranking system, computing the *TF-IDF* for each word in each document, and then calculating the *cosine similarity* between the query vector and each one of the vectors corresponding to the documents.

In [41]:
# tf-idf 
# use the library scikit-learn: tfidf implementation VECTORIZED 
from sklearn.feature_extraction.text import TfidfVectorizer 

In [42]:
#Convert a collection of raw documents to a matrix of TF-IDF features

tfidf = TfidfVectorizer(input='content', lowercase=False, tokenizer=lambda text: text)
results = tfidf.fit_transform(dataset.description_clean) # fit data to train our model (but in our case is the same dataset)
results_dense = results.todense() # results are sparse documents that i want to convert into a dense one

# putting all into a dataframe where the index of the dataframe is each document id
tfidf_data = pd.DataFrame(results_dense.tolist(), index=dataset.index, columns=tfidf.get_feature_names_out()) 



Creating our second inverted index in the form:\
<code>{
term_id_1:[(document1, tfIdf_{term,document1}), (document2, tfIdf_{term,document2}), (document4, tfIdf_{term,document4}), ...],
term_id_2:[(document1, tfIdf_{term,document1}), (document3, tfIdf_{term,document3}), (document5, tfIdf_{term,document5}), (document6, tfIdf_{term,document6}), ...],
...}
</code>
And then storing it in a txt file called *Extented Inverted Index.txt*

In [43]:
# creating the second inverted index
from collections import defaultdict
extended_inverted_index = defaultdict(list)

# Iterate through each term in the inverted index
for term_id, doc_indices in inv_indx.items():
    # Iterate through each document index for the current term
    
    for doc_index in doc_indices:
        # Get the TF-IDF scores for the current document 
        word = vocabulary[vocabulary['term_id'] == term_id]['term'].values
     
        if  word[0] in tfidf_data.columns: # check if the word is in the tfidf
                tfidf_scores = tfidf_data.loc[doc_index,word[0]]
        else:
                continue

        # Append a tuple of (document_index, TF-IDF scores) to the term's list in the extended inverted index
        extended_inverted_index[term_id].append((doc_index, tfidf_scores))

# Convert the extended inverted index defaultdict to a regular dictionary
extended_inverted_index = dict(extended_inverted_index)
print(extended_inverted_index)

# save the extended inverted dictionary in a txt file as before
with open('Extended Inverted Index.txt', 'w') as file:

    for key, value in extended_inverted_index.items():
        file.write(f'{key}: {value}\n')
file.close()

{0: [(0, 0.5131004658100592), (444, 0.13259854288498288), (508, 0.24846179722063458), (593, 0.1586648186377345), (594, 0.15827962867707787), (890, 0.4055552090983382), (1838, 0.1386016612138663), (2437, 0.16117061915020678), (2833, 0.15877920328214626), (2835, 0.13087937488256954), (2836, 0.13059448878930194), (3917, 0.11938184785105894), (4006, 0.2711325639873365), (5000, 0.1654450127686348), (5764, 0.1276962585867652)], 1: [(0, 0.11543237617167537), (68, 0.15239800340906415), (70, 0.13224489163417197), (399, 0.2029563600774623), (741, 0.12222801088657478), (1283, 0.11276626859697529), (1299, 0.16000007419114928), (1430, 0.15390519505985986), (1431, 0.14004495036986234), (1434, 0.18919878727703612), (1672, 0.14949487516493934), (1687, 0.14347080354002173), (1724, 0.19071590375747957), (1725, 0.18861782685621512), (2118, 0.13838507517158438), (2267, 0.10661430553523547), (2322, 0.3342866649519662), (2613, 0.1727457595020762), (2620, 0.12127411499595099), (2637, 0.31649036879622794), (2

In [44]:
# read the inverted index from the file.

file = open("Extended Inverted Index.txt", "r")

ext_inv_indx = dict()
txt = file.read().split("\n")

for i in range(len(txt)-1):
    line = txt[i].replace(":", "").replace("[", "").replace("]", "").replace("(", "").replace(")", "").replace(",", "").split(" ")
    ext_inv_indx[int(line[0])] = []
    for j in range(1, len(line)):
        if j%2 == 1:
            ext_inv_indx[int(line[0])].append((int(line[j]), float(line[j+1])))
            
file.close()

2.2.2 Execute the query

We created the query vector, putting a 1 if the word corresponding to the position is present in the query, 0 if is not.\
The vector for each description is each row of the dataframe tfidf, so no need to compute it for each document.

In [45]:
# create a vector for the query
query_vec = np.zeros(vocabulary.shape[0]) # inizialize the vector
for word in query:
    if word in tfidf_data.columns:
        term_id = vocabulary[vocabulary['term'] == word]['term_id']
        query_vec[term_id] = 1.0

# the documents matrix with all the tfidf is the dataframe tfidf

Now we need the cosine similarity function that we wrote in the *defs.py*, which simply exploits the definition of the cosine similaruty between two vectors that create the $\phi$ angle:

$cos(\phi) = \frac{\vec{q} \cdot \vec{d}}{|{\vec{q}}| \cdot |{\vec{d}}|}$

Instead of using a list sto store the similarity scores, we can use a heap structure, to make the sorting more efficient from a computational point of view.

In [52]:
heap = []
scores_dictionary = {}

# For every document
for doc_index in range(tfidf_data.shape[0]):
    if doc_index not in tfidf_data.index:
        continue
    doc_arr = tfidf_data.loc[doc_index, :].values
    # Compute the angle between the doc and the query vector
    cos_sim = a_cosine_similarity(query_vec, doc_arr)

    # Put the result in the dictionary
    scores_dictionary[doc_index] = cos_sim

    # Update the heap
    heapq.heappush(heap, (cos_sim, doc_index))  # Store both score and document index in the heap
print(scores_dictionary)

{0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 0.0, 24: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 0.0, 29: 0.0, 30: 0.0, 31: 0.0, 32: 0.0, 33: 0.0, 34: 0.0, 35: 0.0, 36: 0.0, 37: 0.0, 38: 0.0, 39: 0.0, 40: 0.0, 41: 0.0, 42: 0.0, 43: 0.0, 44: 0.0, 45: 0.0, 46: 0.0, 47: 0.0, 48: 0.0, 49: 0.0, 50: 0.0, 51: 0.0, 52: 0.0, 53: 0.0, 54: 0.0, 55: 0.0, 56: 0.0, 57: 0.0, 58: 0.0, 59: 0.0, 60: 0.0, 61: 0.0, 62: 0.0, 63: 0.0, 64: 0.0, 65: 0.0, 66: 0.0, 67: 0.0, 68: 0.0, 69: 0.0, 70: 0.0, 71: 0.0, 72: 0.0, 73: 0.0, 74: 0.0, 75: 0.0, 76: 0.0, 77: 0.0, 78: 0.0, 79: 0.0, 80: 0.0, 81: 0.0, 82: 0.0, 83: 0.0, 84: 0.0, 85: 0.0, 86: 0.0, 87: 0.0, 88: 0.0, 89: 0.0, 90: 0.0, 91: 0.0, 92: 0.0, 93: 0.0, 94: 0.0, 95: 0.0, 96: 0.0, 97: 0.0, 98: 0.0, 99: 0.0, 100: 0.0, 101: 0.0, 102: 0.0, 103: 0.0, 104: 0.0, 105: 0.0, 106: 0.0, 107: 0.0, 108: 0.0, 109: 0.0, 110: 0.0,

2.2.2 Execute the query with k = 6

In [53]:
k = 6
top_k = heapq.nlargest(k, heap)
print(top_k)

top_doc_k = []

#fill the list of top_k_doc
for score, doc in top_k:
    top_doc_k.append(doc)
print(top_doc_k)

[(0.20374239362170501, 5209), (0.0, 5976), (0.0, 5975), (0.0, 5974), (0.0, 5973), (0.0, 5972)]
[5209, 5976, 5975, 5974, 5973, 5972]


The results of the query

In [54]:
# adding the column 'similarity score' to the dataset
rinformation_needed = ['courseName','universityName','description', 'url']
results = dataset.loc[top_doc_k , information_needed] 
results['similarity'] = [round(s[0],3) for s in top_k]
results

Unnamed: 0,courseName,universityName,description,url,similarity
5209,Management (International Business) - MSc,University of Reading,"On this Masters programme, you will examine th...",https://www.findamasters.com/masters-degrees/c...,0.204
5976,Masters's in Digital Politics and Governance,European School of Political and Social Scienc...,Digitalisation is a critical issue in today’s ...,https://www.findamasters.com/masters-degrees/c...,0.0
5975,"Masters Program in Climate Change, Agriculture...",University of Galway,The world’s climate is rapidly changing due to...,https://www.findamasters.com/masters-degrees/c...,0.0
5974,"Masters of Science in Business, Supply Chain A...",Oregon State University,Master of Science in Business (MSB)Our Master ...,https://www.findamasters.com/masters-degrees/c...,0.0
5973,Masters of Science in Business,Oregon State University,Our Master of Science in Business (MSB) will g...,https://www.findamasters.com/masters-degrees/c...,0.0
5972,Master's of Front-end Development,Harbour.Space University,Front-end Development at Harbour.Space Univers...,https://www.findamasters.com/masters-degrees/c...,0.0


# 5. Bonus: More complex search engine 

1. Give the possibility to specify queries for the following features (the user should have the option to issue *none or all of them*): [courseName, universityName, universityCity]