# 1. Data collection

### 1.1 Get the list of master's degree courses

In [None]:
from bs4 import BeautifulSoup
import os
import pandas as pd
import time
import random
import glob
from nltk.stem import *
from nltk.corpus import stopwords
from collections import Counter
from functools import reduce
import heapq
import re
from forex_python.converter import CurrencyRates

In [None]:
from defs import *

Via the HTTP GET request we retrieve the content of the url of our interest, in our case the page that contains all the masters.

In [None]:
url = 'https://www.findamasters.com/masters-degrees/msc-degrees/'
result = requests.get(url)

To extract all the links of the master's degree of the first 400 pages we used a function, which is located in the *defs.py* file; then we store all the links in the file *masters_urls.txt*.

In [None]:
# to extract all the masters we have to do a for loop for the first 400 pages:

num_pages = 400
pref = 'https://www.findamasters.com'
test_lst_all = []

for i in range(1, num_pages + 1):
    test_lst = extract_masters(pref + '/masters-degrees/msc-degrees/?PG=' + str(i))
    test_lst_all.extend(test_lst)
# creating the txt file of the fisrt 400 pages of ms
with open('masters_urls.txt', 'w') as f:
    for item in test_lst_all:
        f.write(item[0] + '\n')
# file created

### 1.2 Crawl master's degree pages

We are going to save each HTML page of each course in a different folder, one folder for each page it is in; so we'll obtain 400 folders, each of them will contain 15 HTML files. The urls are taken from the *masters_urls.txt* previusly created.


In [None]:
# settings for the User-Agent to simulate a browser
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
folder_name = "html_pages" # create the folder that will contain the html pages
html_name_url = {}
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

for i in range(1, 401):
    # create a folder for each page, from 1 to 400
    name = 'HTML page ' + str(i)
    path_folder = os.path.join(folder_name, name)
    if not os.path.exists(path_folder):
        os.makedirs(path_folder)

# open the file containing the urls
with open('masters_urls.txt', 'r') as file:
    for index, url in enumerate(file):
        url = url.strip()
        page = (index // 15 ) + 1
        try:
            # the complete url
            full_url = "https://www.findamasters.com" + url.strip()

            # add a delay of 1 to 5 seconds between the requests
            time.sleep(1 + random.uniform(0, 4))
            # request to obtain the content of the url
            response = requests.get(full_url, headers=headers)

            if response.status_code == 200:
                # Parsing dell'HTML con BeautifulSoup
                soup = BeautifulSoup(response.content, 'html.parser')

                # save the html of the course in a separate file in the folder of the page it belongs to
                file_path = os.path.join(f"{folder_name}\HTML page {page}", f"course {index+1}.html")
                with open(file_path, "w", encoding="utf-8") as html_file:
                    html_file.write(str(soup))
                html_name_url[file_path] = full_url
            else:
                print(f"Errore nel recuperare la pagina del corso: {full_url}")

        except Exception as e:
            print(f"Errore durante il recupero e salvataggio della pagina {full_url}: {e}")


### 1.3 Parse download pages

Through the *extcat_msc_page* function (located in the *defs.py*) we parse all the HTML we retrieved before and collect all the information for each master.

In [None]:
#Directory where there are the HTML repositories
html_folder = "\html-pages"
my_path = "D:\Primo Semestre\ADM\HW3"
#List to contain all the information
all_master_info = []

all_url = []

#Iterating whitin the repositories HTML of every page
for page_folder in os.listdir(my_path + html_folder):
    page_path = os.path.join(html_folder, page_folder)
    file_absolute_path = os.path.join(my_path + page_path)

    if os.path.isdir(file_absolute_path):
        #Iterating in the files HTML of every repository
        for file in os.listdir(file_absolute_path):
            if file.endswith(".html"):
                file_path = os.path.join(file_absolute_path, file)
                print('FILE PATH: ', file_path)
                #Applying the function extract_msc_page to every file HTML
                master_info = extract_msc_page(file_path)
                all_master_info.extend(master_info)

print(all_master_info)

Then we're storing those information in a tsv file, one for each master. All the files are stored in a folder called 'Tsv files'.

In [None]:
# creating the tsv file for each master

folder_name = "Tsv files"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
folder_path = "D:\Primo Semestre\ADM\HW3\Tsv files"

for i in range(0,len(all_master_info)):
    output_file = f"{folder_path}\course_{i+1}.tsv"
    with open(output_file, 'w', encoding='utf-8') as tsvfile:
    # Extract field names from the dictionaries in 'all_master_info'
        fieldnames = all_master_info[i].keys()
        tsvfile.write('\t'.join(fieldnames) + '\n')                  # write the header
        row = '\t'.join(str(all_master_info[i].get(field, '')) for field in fieldnames)
        tsvfile.write(row + '\n')

We can now create our dataframe, reading the data on all the tsv files we just created.

In [None]:
data_frames = []
file_name_list = glob.glob("Tsv files\course_*.tsv")    # take all the course_i.tsv files
for file in file_name_list:
    dataset_tsv = pd.read_csv(file, sep='\t', header=0)      # create the data frame from the tsv file
    data_frames.append(dataset_tsv)
# Concatenate all DataFrames in the list into a single DataFrame
dataset = pd.concat(data_frames, ignore_index=True)     # creating the whole dataframe from each tsv file

dataset = dataset[dataset.description != ''] # do not ocnsider all the rows that have an empty description

dataset.head()

**da eliminare poi questo**  perchè per semplicitò apriamo il dataset da un file json, ma in realtà dovremmo aprirlo dai file tsv

In [None]:
# opening the json file for the dataset (but it will be the tsv file)
path = r"university_dataset.json"
dataset= pd.read_json(path)
dataset.dropna(subset=['description'], inplace=True)

# 2. Search Engine

### 2.0 Preprocessing

2.0.0 Preprocessing the text\
We created 3 functions, that are present in the *defs.py*, to perform the stemming, remove the stopwords and punctuation from the *description* field of our dataset.

In [None]:
# removing rows where the description is empty
dataset = dataset[dataset['description'].notna()]
dataset = dataset[dataset['fees'].notna()]

In [None]:
# 1. stemming
dataset['descr_stem'] = dataset['description'].apply(stem_description)

# 2. removing stopwords
dataset['description_clean'] = dataset['descr_stem'].apply(lambda row: remove_stopwords(row))

# 3. removing punctuation
dataset['description_clean'] = dataset['description_clean'].apply(lambda row: remove_punctuation(row))


2.0.1 preprocess the *fees* column

get value of cost, from fees colum

In [None]:
university_cost = {}
pattern = r'\b(?:\$\s?|€|£|¥|₹|\b(?:USD|EUR|GBP|JPY|INR)\b|\b(?:dollari|euro|sterline|yen|rupie)\b|\b(?:dollar|euro|pound|yen|rupee)\b)\s?([\d,]+(?:\.\d{1,2})?)\b'
for index, i in enumerate(dataset['fees']):
    if isinstance(i, str):
        corrispondenze = re.finditer(pattern, i, flags=re.IGNORECASE)
        valori_monetari = [(match.group(1), match.group(0)) for match in corrispondenze]

        # Trova la valuta e il costo con il valore monetario più alto
        if valori_monetari:
            costo, valuta = max(valori_monetari, key=lambda x: float(x[0].replace(',', '')))

            # Estrai solo il simbolo della valuta
            simbolo_valuta_match = re.search(r'(£|\$|€|¥|₹|\b(?:USD|EUR|GBP|JPY|INR)\b|\b(?:dollari|euro|sterline|yen|rupie)\b|\b(?:dollar|euro|pound|yen|rupee)\b)', valuta)

            # Verifica se c'è una corrispondenza prima di chiamare group
            simbolo_valuta = simbolo_valuta_match.group(1) if simbolo_valuta_match else None

            university_cost[index] = {'costo': costo, 'valuta': simbolo_valuta}
        else:
            university_cost[index] = None


In [None]:
dataset['fees'] = university_cost

We're using the functions defined in the *defs* file to handle the fees curency.

In [None]:
# apply the function to the 'fees' column
dataset['fees'] = dataset['fees'].apply(convert_and_replace)

In [None]:
dataset = dataset.rename(columns={'fees': 'fees (EUR)'})
#dataset = dataset[dataset['fees (EUR)'].notna()]

### 2.1 Conjuctive query

2.1.1 Create the index

We created the vocabulary assigning an unique ID to each word encoutered in the description field of the dataset, then created a csv file out of it, to store the information.

In [None]:
# creating the vocabulary
vocabulary = Counter(reduce(lambda x,y : x+y, dataset.description_clean)).keys()

# assign an unique ID to each word of the vocabulary using a pandas dataframe
terms = pd.DataFrame(data=list(vocabulary), columns=['term'])

terms
# creating a csv file for the vocabulary with index of each term
terms.to_csv('vocabulary.csv', index_label='term_id')

Now we can create the inverted index as a new column of the dataframe *terms* and store it in a txt file, called *Inverted Index.txt*

In [None]:
terms = pd.read_csv('vocabulary.csv')

In [None]:
terms['reverse'] = terms.term.apply(lambda item: list(dataset.loc[dataset.description_clean.apply(lambda row: item in row)].index))
terms.head()

Unnamed: 0,term_id,term,reverse
0,0,3d,"[0, 444, 508, 593, 594, 890, 1838, 2437, 2833,..."
1,1,visualis,"[0, 68, 70, 399, 741, 1283, 1299, 1430, 1431, ..."
2,2,anim,"[0, 9, 20, 241, 681, 969, 1028, 1029, 1030, 10..."
3,3,play,"[0, 16, 33, 70, 80, 182, 194, 273, 305, 318, 3..."
4,4,role,"[0, 35, 61, 70, 74, 80, 138, 161, 163, 172, 17..."


We now transform the inverted index in a dictionary in this form\
<code> {
term_id_1:[document_1, document_2, document_4],
term_id_2:[document_1, document_3, document_5, document_6],
    ...}
</code>

In [None]:
InvertedIndex = terms['reverse'].to_dict()

# store the inverted index in a txt file
with open('Inverted Index.txt', 'w') as file:

    for key, value in InvertedIndex.items():
        file.write(f'{key}: {value}\n')
file.close()

In [None]:
# read back the inverted index from the file.

file = open("Inverted Index.txt", "r")

inv_indx = dict()
txt = file.read().split("\n")

for i in range(len(txt)-1):
    line = txt[i].replace(":", "").replace("[", "").replace("]", "").replace("(", "").replace(")", "").replace(",", "").split(" ")
    inv_indx[int(line[0])] = []
    for j in range(1, len(line)):
            inv_indx[int(line[0])].append(int(line[j]))

file.close()

2.1.2 Execute the query

We created a function called *query_preprocess* that preprocesses the query just like we did in the preprocess of the description field.

In [None]:
#query = str(input())
starting_query = 'advanced knoledge'
print('starting query:',starting_query)
# formatting the query
query = query_preprocess(starting_query)
print('preprocessed query:',query)

starting query: advanced knoledge
preprocessed query: ['advanc', 'knoledg']


What we're going to do now to implement our Search Engine is:
- Find all the words of the query in the vocabulary and exctract each *term_id* of each word of the query.
- Find all the documents related to each *term_id* in the Inverted Index.
- Do the intersection of the lists of documents found.

In [None]:
vocabulary = pd.read_csv('vocabulary.csv') # read the vocabulary file into a dataframe
vocabulary = pd.DataFrame(vocabulary)

file = open("Inverted Index.txt", "r") # read the inverted index from the file.

inv_indx = dict()
txt = file.read().split("\n")

for i in range(len(txt)-1):
    line = txt[i].replace(":", "").replace("[", "").replace("]", "").replace("(", "").replace(")", "").replace(",", "").split(" ")
    inv_indx[int(line[0])] = []
    for j in range(1, len(line)):
            inv_indx[int(line[0])].append(int(line[j]))
file.close()

# find the words of the query in the vocabulary
for w in query:
    term_ids = [vocabulary[vocabulary['term'] == w]['term_id'].values for w in query if w in vocabulary.term.values]

term_ids = [term_ids[x][0] for x in range(len(term_ids))] # exctract only the integers of the ids

# find the documents
docs = [inv_indx[i] for i in term_ids]

# intersecting the two sets of documents we found out contain all the word of the query
intersection = list(set(docs[0]).intersection(*docs[1:]))

Now we can show the results of the query after it passed into the search engine

In [None]:
information_needed = ['courseName','universityName','description', 'url']
dataset.loc[intersection,information_needed]

Unnamed: 0,courseName,universityName,description,url
1,Accounting and Finance - MSc,University of Leeds,Businesses and governments rely on sound finan...,https://www.findamasters.com/masters-degrees/c...
2,"Accounting, Accountability & Financial Managem...",King’s College London,"Our Accounting, Accountability & Financial Man...",https://www.findamasters.com/masters-degrees/c...
4,Addictions MSc,King’s College London,Join us for an online session for prospective ...,https://www.findamasters.com/masters-degrees/c...
5,Advanced Chemical Engineering - MSc,University of Leeds,The Advanced Chemical Engineering MSc at Leeds...,https://www.findamasters.com/masters-degrees/c...
6,Advanced Master in Financial Markets,Solvay Brussels School,Programme overviewThe Advanced Master in Finan...,https://www.findamasters.com/masters-degrees/c...
...,...,...,...,...
4019,Glaciology MSc by Research,Swansea University,The MSc by Research in Glaciology allows you t...,https://www.findamasters.com/masters-degrees/c...
4022,Global Ageing MSc (Online),University of Stirling,"According to the WHO, between 2015 and 2050 th...",https://www.findamasters.com/masters-degrees/c...
4023,Global Biodiversity Conservation - MSc,University of Sussex,This MSc will give you advanced knowledge and ...,https://www.findamasters.com/masters-degrees/c...
4069,Global Health MSc,"St George’s, University of London",Significant socioeconomic and environmental ch...,https://www.findamasters.com/masters-degrees/c...


## 2.2 Conjunctive query & Ranking score

2.2.1 Inverted Index

We are now going to implement a ranking system, computing the *TF-Idataset* for each word in each document, and then calculating the *cosine similarity* between the query vector and each one of the vectors corresponding to the documents.

In [None]:
# tf-idataset
# use the library scikit-learn: tfidataset implementation VECTORIZED
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#Convert a collection of raw documents to a matrix of TF-Idataset features

tfidataset = TfidfVectorizer(input='content', lowercase=False, tokenizer=lambda text: text)
results = tfidataset.fit_transform(dataset.description_clean) # fit data to train our model (but in our case is the same dataset)
results_dense = results.todense() # results are sparse documents that i want to convert into a dense one

# putting all into a dataframe where the index of the dataframe is each document id
tfidataset_data = pd.DataFrame(results_dense.tolist(), index=dataset.index, columns=tfidataset.get_feature_names_out())

Creating our second inverted index in the form:\
<code>{
term_id_1:[(document1, tfIdataset_{term,document1}), (document2, tfIdataset_{term,document2}), (document4, tfIdataset_{term,document4}), ...],
term_id_2:[(document1, tfIdataset_{term,document1}), (document3, tfIdataset_{term,document3}), (document5, tfIdataset_{term,document5}), (document6, tfIdataset_{term,document6}), ...],
...}
</code>
And then storing it in a txt file called *Extented Inverted Index.txt*\
The functions we used are defined in the *defs.py* file

In [None]:
create_second_inverted_index(inv_indx, vocabulary, tfidataset_data, feat='description')

In [None]:
# read the inverted index from the file.
ext_inv_indx = read_inverted_index('description')

2.2.2 Execute the query

We created the query vector, putting a 1 if the word corresponding to the position is present in the query, 0 if is not.\
The vector for each description is each row of the dataframe tfidataset, so no need to compute it for each document.

In [None]:
query_vec = create_vector_query(query=query, vocabulary=vocabulary, tfidataset_data=tfidataset_data )

Now we need the cosine similarity function that we wrote in the *defs.py*, which simply exploits the definition of the cosine similaruty between two vectors that create the $\phi$ angle:

$cos(\phi) = \frac{\vec{q} \cdot \vec{d}}{|{\vec{q}}| \cdot |{\vec{d}}|}$

Instead of using a list to store the similarity scores, we can use a heap structure, to make the sorting more efficient from a computational point of view.

In [None]:
heap = []
scores_dictionary = {}

# For every document
for doc_index in range(tfidataset_data.shape[0]):
    if doc_index in tfidataset_data.index:
        doc_arr = tfidataset_data.loc[doc_index, :].values
        # Compute the angle between the doc and the query vector
        cos_sim = a_cosine_similarity(query_vec, doc_arr)

        # Put the result in the dictionary
        scores_dictionary[doc_index] = cos_sim
        # Update the heap
        heapq.heappush(heap, (cos_sim, doc_index))  # Store both score and document index in the heap
    else:
        continue

2.2.2 Execute the query with k = 6

In [None]:
k = 6
top_k, top_doc_k = execute_query(k, heap)

[(0.20374239362170501, 5209), (0.0, 5976), (0.0, 5975), (0.0, 5974), (0.0, 5973), (0.0, 5972)]
[5209, 5976, 5975, 5974, 5973, 5972]


The results of the query

In [None]:
# adding the column 'similarity score' to the dataset
rinformation_needed = ['courseName','universityName','description', 'url']
results = dataset.loc[top_doc_k , information_needed]
results['similarity'] = [round(s[0],3) for s in top_k]
results

Unnamed: 0,courseName,universityName,description,url,similarity
5209,Management (International Business) - MSc,University of Reading,"On this Masters programme, you will examine th...",https://www.findamasters.com/masters-degrees/c...,0.204
5976,Masters's in Digital Politics and Governance,European School of Political and Social Scienc...,Digitalisation is a critical issue in today’s ...,https://www.findamasters.com/masters-degrees/c...,0.0
5975,"Masters Program in Climate Change, Agriculture...",University of Galway,The world’s climate is rapidly changing due to...,https://www.findamasters.com/masters-degrees/c...,0.0
5974,"Masters of Science in Business, Supply Chain A...",Oregon State University,Master of Science in Business (MSB)Our Master ...,https://www.findamasters.com/masters-degrees/c...,0.0
5973,Masters of Science in Business,Oregon State University,Our Master of Science in Business (MSB) will g...,https://www.findamasters.com/masters-degrees/c...,0.0
5972,Master's of Front-end Development,Harbour.Space University,Front-end Development at Harbour.Space Univers...,https://www.findamasters.com/masters-degrees/c...,0.0


# 3. Define a new score!

3.1

In [None]:
query_for_new_score = str(input())
# formatting the query
query_for_new_score = query_preprocess(query_for_new_score)
print(query_for_new_score)

['univers']


In [None]:
vocabulary = pd.read_csv('vocabulary.csv') # read the vocabulary file into a dataframe
vocabulary = pd.DataFrame(vocabulary)

file = open("Inverted Index.txt", "r") # read the inverted index from the file.

inv_indx = dict()
txt = file.read().split("\n")

for i in range(len(txt)-1):
    line = txt[i].replace(":", "").replace("[", "").replace("]", "").replace("(", "").replace(")", "").replace(",", "").split(" ")
    inv_indx[int(line[0])] = []
    for j in range(1, len(line)):
            inv_indx[int(line[0])].append(int(line[j]))
file.close()

# find the words of the query in the vocabulary
for w in query_for_new_score:
    term_ids = [vocabulary[vocabulary['term'] == w]['term_id'].values for w in query_for_new_score if w in vocabulary.term.values]

term_ids = [term_ids[x][0] for x in range(len(term_ids))] # exctract only the integers of the ids

# find the documents
docs = [inv_indx[i] for i in term_ids]

# intersecting the two sets of documents we found out contain all the word of the query
intersection = list(set(docs[0]).intersection(*docs[1:]))

In [None]:
information_needed = ['courseName','universityName','description', 'url']
dataset.loc[intersection,information_needed]

Unnamed: 0,courseName,universityName,description,url
2051,Clinical Pharmacy Practice (PgCert/PgDip/MSc),Robert Gordon University,The online MSc Clinical Pharmacy Practice cour...,https://www.findamasters.com/masters-degrees/c...
4099,Global MBA,London School of Business & Finance,Global MBA OnlineWhen you study with London Sc...,https://www.findamasters.com/masters-degrees/c...
4100,Global Media and Communications (LSE and Fudan...,London School of Economics and Political Science,Ask LSEThe unique MSc double degree in Global ...,https://www.findamasters.com/masters-degrees/c...
4101,Global Media and Communications (LSE and UCT) MSc,London School of Economics and Political Science,Ask LSEThis unique double degree allows studen...,https://www.findamasters.com/masters-degrees/c...
4102,Global Media and Communications (LSE and USC) MSc,London School of Economics and Political Science,Ask LSEThis unique double degree enables you t...,https://www.findamasters.com/masters-degrees/c...
...,...,...,...,...
2032,"Clinical Nutrition MSc, PgDip or PgCert",Aberdeen University,Are you interested in improving health outcome...,https://www.findamasters.com/masters-degrees/c...
2034,Clinical Oncology (Full time) - MSc/PGDip,University of Birmingham,For health care professionals from diverse bac...,https://www.findamasters.com/masters-degrees/c...
2035,Clinical Oncology (Part time) - MSc/PGDip,University of Birmingham,A programme for health care professionals from...,https://www.findamasters.com/masters-degrees/c...
2043,Clinical Pharmacology MSc,Aberdeen University,The University of Aberdeen is highly regarded ...,https://www.findamasters.com/masters-degrees/c...


In [None]:
#Convert a collection of raw documents to a matrix of TF-Idataset features
tfidataset = TfidfVectorizer(input='content', lowercase=False, tokenizer=lambda text: text)
results = tfidataset.fit_transform(dataset.description_clean) # fit data to train our model (but in our case is the same dataset)
results_dense = results.todense() # results are sparse documents that i want to convert into a dense one

# putting all into a dataframe where the index of the dataframe is each document id
tfidataset_data = pd.DataFrame(results_dense.tolist(), index=dataset.index, columns=tfidataset.get_feature_names_out())

In [None]:
create_second_inverted_index(inv_indx, vocabulary, tfidataset_data, feat='description')
second_query_vec = create_vector_query(query_for_new_score, vocabulary, tfidataset_data)
heap_second_query = []
scores_dictionary_second_query = {}
#compute_cosine_similarity(heap_second_query, scores_dictionary_second_query)


# For every document
for doc_index in range(tfidataset_data.shape[0]):
    if doc_index in tfidataset_data.index:
        doc_arr = tfidataset_data.loc[doc_index, :].values
        # Compute the angle between the doc and the query vector
        cos_sim = a_cosine_similarity(second_query_vec, doc_arr)

        # Put the result in the dictionary
        scores_dictionary_second_query[doc_index] = cos_sim
        # Update the heap
        heapq.heappush(heap_second_query, (cos_sim, doc_index))  # Store both score and document index in the heap
    else:
        continue

In [None]:
k = 20
top_k, top_doc_k = execute_query(k, heap_second_query)

[(0.20647963214052836, 4462), (0.0, 5976), (0.0, 5975), (0.0, 5974), (0.0, 5973), (0.0, 5972), (0.0, 5971), (0.0, 5970), (0.0, 5969), (0.0, 5968), (0.0, 5967), (0.0, 5966), (0.0, 5965), (0.0, 5964), (0.0, 5963), (0.0, 5962), (0.0, 5961), (0.0, 5960), (0.0, 5959), (0.0, 5958)]
[4462, 5976, 5975, 5974, 5973, 5972, 5971, 5970, 5969, 5968, 5967, 5966, 5965, 5964, 5963, 5962, 5961, 5960, 5959, 5958]


In [None]:
# adding the column 'similarity score' to the dataset
rinformation_needed = ['courseName','universityName','description', 'url']
results = dataset.loc[top_doc_k , information_needed]
results['similarity'] = [round(s[0],3) for s in top_k]
results

Unnamed: 0,courseName,universityName,description,url,similarity
4462,Improvement Science - MSc,University of West London,Do you work in the health sector? Is there an ...,https://www.findamasters.com/masters-degrees/c...,0.206
5976,Masters's in Digital Politics and Governance,European School of Political and Social Scienc...,Digitalisation is a critical issue in today’s ...,https://www.findamasters.com/masters-degrees/c...,0.0
5975,"Masters Program in Climate Change, Agriculture...",University of Galway,The world’s climate is rapidly changing due to...,https://www.findamasters.com/masters-degrees/c...,0.0
5974,"Masters of Science in Business, Supply Chain A...",Oregon State University,Master of Science in Business (MSB)Our Master ...,https://www.findamasters.com/masters-degrees/c...,0.0
5973,Masters of Science in Business,Oregon State University,Our Master of Science in Business (MSB) will g...,https://www.findamasters.com/masters-degrees/c...,0.0
5972,Master's of Front-end Development,Harbour.Space University,Front-end Development at Harbour.Space Univers...,https://www.findamasters.com/masters-degrees/c...,0.0
5971,Master's of Financial Technology (Fintech),Harbour.Space University,Harbour.Space's FinTech Master programme is de...,https://www.findamasters.com/masters-degrees/c...,0.0
5970,Masters Of Finance (International Finance),Zhejiang Gongshan University,Master’s in Finance (International Finance) at...,https://www.findamasters.com/masters-degrees/c...,0.0
5969,Masters of Finance,University of Hong Kong,The HKU Business School Master of Finance (MFi...,https://www.findamasters.com/masters-degrees/c...,0.0
5968,Master's of Data Science,Harbour.Space University,Harbour.Space’s Master of Data Science prepare...,https://www.findamasters.com/masters-degrees/c...,0.0


# 3.2

Heap Data Structure Operations:

1. **Heapify:**
   Heapify is the process of transforming an array into a heap. This involves arranging the elements in a way that satisfies the heap property.

2. **Insertion:**
   The insertion operation involves adding an element to an existing heap. The time complexity of this operation is O(log N), where N is the number of elements in the heap. This is because the element may need to be moved up the heap to maintain the heap property.

3. **Deletion:**
   Deletion in a heap typically refers to removing the top element (root) of the heap or the element with the highest priority. After removal, the heap is reorganized to maintain the heap property. The time complexity of this operation is O(log N) because the reorganization involves adjusting the heap structure.

4. **Peek:**
   Peek operation is used to inspect the top element of the heap without removing it. It allows checking or finding the element with the highest priority. The time complexity is constant, O(1).

Types of Heap Data Structures:

Heaps can be broadly classified into two types:

1. **Max-Heap:**
   In a Max-Heap, the key at the root node is the greatest among the keys in all of its children. This property must be true recursively for all sub-trees in the binary tree.

2. **Min-Heap:**
   In a Min-Heap, the key at the root node is the minimum among the keys in all of its children. Similar to the Max-Heap, this property must be recursively true for all sub-trees in the binary tree.

In [None]:
import heapq

def find_top_k_doc(k, dataset_dummy):
    top_documents_heap = []  # List for the max heap

    # Add the documents to the heap
    for index, row in dataset_dummy.iterrows():
        document_score = row['final_score']
        document_info = (-document_score, index)  # Use negation for max heap
        if len(top_documents_heap) < k:
            heapq.heappush(top_documents_heap, document_info)
        else:
            # If the heap is already of size k, compare the score of the current document with the maximum score in the heap
            max_score, max_index = -top_documents_heap[0][0], top_documents_heap[0][1]
            if document_score > max_score:
                heapq.heappop(top_documents_heap)
                heapq.heappush(top_documents_heap, document_info)

    # Indices of the top-k documents from the heap
    top_documents_indices = [index for _, index in top_documents_heap]

    # Top-k documents from the DataFrame
    top_documents = dataset_dummy.loc[top_documents_indices]
    return top_documents


In [None]:
def compute_new_score(dataset_dummy):

  # Add the weighted_modality column
  modality_binary = dataset['modality'].str.get_dummies(', ') #Dataset of binary values for each value of the modality
  total_columns_modality = modality_binary.sum(axis=1) # Total per row of the ones
  weighted_total_modality = (
      0.10 * modality_binary['MSc'] +    #Weight 'MSc' more
      0.025 * modality_binary['Other'] +  # Weight 'Other' less
      (1 - 0.10 - 0.025) * total_columns_modality  # Weight of other columns
  )
  dataset_dummy['weighted_modality'] = weighted_total_modality / total_columns_modality # Add the column to the dataset

  # Add the weighted_administration column
  administration_binary = dataset['administration'].str.get_dummies(', ')
  total_columns_administration = administration_binary.sum(axis=1)
  dataset_dummy['weighted_administration'] = total_columns_administration / administration_binary.shape[1]

  # Add the weighted_startDate column
  startDate_binary = dataset['startDate'].str.get_dummies(', ')
  startDate_binary['total_columns_startDate'] = startDate_binary.sum(axis=1)
  startDate_binary['weighted_total_startDate'] = (
      0 * startDate_binary['See Course'] +  # Weight 'See Course' as 0
      startDate_binary['total_columns_startDate']  # Weight other columns
    )
  startDate_binary.weighted_startDate = startDate_binary['weighted_total_startDate']  / startDate_binary['total_columns_startDate']
  # Consider 'weighted_startDate' = 1 where in  dataset.startDate there is 'Any Month'
  startDate_binary.loc[startDate_binary['Any Month'] == 1, 'weighted_startDate'] = 1
  dataset_dummy['weighted_startDate'] = startDate_binary.weighted_startDate


  selected_columns = ['courseName', 'universityName', 'description', 'url', 'weighted_modality', 'weighted_administration', 'weighted_startDate',  'similarity']
  dataset_dummy = dataset_dummy[selected_columns]
  dataset_dummy['final_score'] = dataset_dummy[['weighted_modality', 'weighted_administration', 'weighted_startDate', 'similarity']].mean(axis=1)

  selected_columns = ['courseName', 'universityName', 'description', 'url', 'weighted_modality', 'weighted_administration', 'weighted_startDate', 'similarity']
  dataset_dummy = dataset_dummy[selected_columns]
  dataset_dummy['final_score'] = dataset_dummy[['weighted_modality', 'weighted_administration', 'weighted_startDate', 'similarity']].mean(axis=1)

  # Find the top-k documents
  print(f'There are {len(dataset_dummy)} documents')
  k = int(input("How many documents would you like to see?"))
  if k <= len(dataset_dummy):
    return find_top_k_doc(k, dataset_dummy)
  else:
    print("You asked for too many documents, here are all of them")
    return find_top_k_doc(len(dataset_dummy), dataset_dummy)



In [None]:
new_score_results= compute_new_score(results)

  startDate_binary.weighted_startDate = startDate_binary['weighted_total_startDate']  / startDate_binary['total_columns_startDate']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_dummy['final_score'] = dataset_dummy[['weighted_modality', 'weighted_administration', 'weighted_startDate', 'similarity']].mean(axis=1)


There are 20 documents


In [None]:
new_score_results

Unnamed: 0,courseName,universityName,description,url,weighted_modality,weighted_administration,weighted_startDate,similarity,final_score
4462,Improvement Science - MSc,University of West London,Do you work in the health sector? Is there an ...,https://www.findamasters.com/masters-degrees/c...,0.975,1.0,1.0,0.206,0.79525
5968,Master's of Data Science,Harbour.Space University,Harbour.Space’s Master of Data Science prepare...,https://www.findamasters.com/masters-degrees/c...,0.975,1.0,1.0,0.0,0.74375
5972,Master's of Front-end Development,Harbour.Space University,Front-end Development at Harbour.Space Univers...,https://www.findamasters.com/masters-degrees/c...,0.975,1.0,1.0,0.0,0.74375
5976,Masters's in Digital Politics and Governance,European School of Political and Social Scienc...,Digitalisation is a critical issue in today’s ...,https://www.findamasters.com/masters-degrees/c...,0.975,1.0,1.0,0.0,0.74375
5969,Masters of Finance,University of Hong Kong,The HKU Business School Master of Finance (MFi...,https://www.findamasters.com/masters-degrees/c...,0.975,1.0,1.0,0.0,0.74375
5975,"Masters Program in Climate Change, Agriculture...",University of Galway,The world’s climate is rapidly changing due to...,https://www.findamasters.com/masters-degrees/c...,0.975,1.0,1.0,0.0,0.74375
5971,Master's of Financial Technology (Fintech),Harbour.Space University,Harbour.Space's FinTech Master programme is de...,https://www.findamasters.com/masters-degrees/c...,0.925,1.0,1.0,0.0,0.73125
5974,"Masters of Science in Business, Supply Chain A...",Oregon State University,Master of Science in Business (MSB)Our Master ...,https://www.findamasters.com/masters-degrees/c...,0.975,0.0,1.0,0.0,0.49375
5970,Masters Of Finance (International Finance),Zhejiang Gongshan University,Master’s in Finance (International Finance) at...,https://www.findamasters.com/masters-degrees/c...,0.925,1.0,1.0,0.0,0.73125
5973,Masters of Science in Business,Oregon State University,Our Master of Science in Business (MSB) will g...,https://www.findamasters.com/masters-degrees/c...,0.925,0.0,1.0,0.0,0.48125


# 4. Visualizing the most relevant MSc degrees

From the dataset of the courese with the score found in the point 3 of the Homework, we create another dataset called 'geo' that contains the necessary columns to create the map of the masters.

In [None]:
#Creating the dataset with the k=6 most relevant MSc degrees
df3 = dataset[['courseName', 'universityName', 'facultyName', 'city', 'country', 'fees (EUR)']]
df3_score = new_score_results[['courseName']]

geo = pd.merge(df3_score, df3, on=['courseName'], how='inner')
geo = geo[geo['fees (EUR)'] != 'EUR']
geo = geo[geo['fees (EUR)'].notna()]

print(geo)

                        courseName            universityName  \
11  Masters of Science in Business   Oregon State University   
14    Master's of Computer Science  Harbour.Space University   

           facultyName       city country  fees (EUR)  
11  School of Business  Corvallis     USA  419.493908  
14  Masters Programmes  Barcelona   Spain   17.806935  


Now through the library geopy we find the corrispondent coordinates (latitude and longitude) and add them to the dataset geo.\
The function *get_ccordinates* is defined in the *defs* file

In [None]:
#Finding the coordinates

geo.loc[:, 'latitude'], geo.loc[:, 'longitude'] = zip(*geo.apply(lambda row: get_coordinates(row['universityName'], row['city'], row['country']), axis=1))
#Printing the dataset
print(geo)

                        courseName            universityName  \
11  Masters of Science in Business   Oregon State University   
14    Master's of Computer Science  Harbour.Space University   

           facultyName       city country  fees (EUR)   latitude   longitude  
11  School of Business  Corvallis     USA  419.493908  44.563056 -123.283924  
14  Masters Programmes  Barcelona   Spain   17.806935  41.382894    2.177432  


Finally we create the map with a color legend based on the fees, showwing the map about the courses and the associated taxation.

In [None]:
import folium
from folium.plugins import MarkerCluster
from folium.features import CustomIcon
from branca.colormap import LinearColormap

#Defining the tuition fee classes
fee_classes = [2500, 5000, 7500, 10000, 12500, 15000, 17500, 20000, 22500, 25000] #10 classes of tuition fees
fee_labels = [f"{fee:,}-{fee_next:,}" for fee, fee_next in zip(fee_classes, fee_classes[1:])]
fee_labels[-1] = f"{fee_classes[-1]:,}+"

#Creating a Folium map
m = folium.Map(location=[geo['latitude'].mean(), geo['longitude'].mean()], zoom_start=2)

#Creating a color scale for tuition fees
colormap = LinearColormap(colors=['blue', 'cyan', 'green', 'yellow', 'orange', 'red', 'purple', 'pink', 'brown', 'gray'], index=fee_classes, vmin=fee_classes[0], vmax=fee_classes[-1])

#Adding the legend to the map
colormap.caption = 'Tuition Fees (EUR)'
m.add_child(colormap)

#Creating a MarkerCluster to group the markers
marker_cluster = MarkerCluster().add_to(m)

#Adding markers to the map for tuition fees
for index, row in geo.iterrows():
    fee_color = colormap(row['fees (EUR)'])

    #Selecting an icon based on the fee class
    icon = folium.Icon(color='white', icon_color=fee_color, icon='fa-location-dot', prefix='fa')

    #Adding a marker to the MarkerCluster with a custom icon
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        icon=icon,
        popup=f"{row['universityName']} - Course: {row['courseName']} - Fees: {int(row['fees (EUR)']):,}",
        tooltip=f"{row['universityName']} - {row['courseName']} - {float(row['fees (EUR)'])}"
    ).add_to(marker_cluster)

#Showing the map
m

# 5. Bonus: More complex search engine

In [None]:
import json

In [None]:
user_request = json.load(open("prova 5.json")) # read the file where the user specified the requests

First filter the dataset with those information the user gave us, such as the fees range, the countries and the presence of online modality.

In [None]:
dataset = dataset[dataset['fees (EUR)'].notna()]
dataset = dataset[dataset['fees (EUR)']!= 'EUR']

In [None]:
dataset = dataset[dataset['country'].notna()]

In [None]:
# filtering based on the fees range
dataset_filtered = dataset[(dataset['fees (EUR)'] >= user_request['Fees Range']['lower import']) &
                           (dataset['fees (EUR)'] <= user_request['Fees Range']['upper import'])]

In [None]:
# filtering on the country list
df_f = []
for c in user_request["list of countries"]:
    df = dataset_filtered[dataset_filtered['country'] == c ]
    df_f.append(df)
df_f = pd.concat(df_f, ignore_index = True)


In [None]:
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
current_month = 11
months_to_keep = months[current_month-4:current_month]

# Function to check if any of the months of the starting date are present in the start date
def filter_months(row):
    start_dates = row.split(', ')
    return any(month in start_dates for month in months_to_keep)

# Applying the filter
df_filtered= df_f[df_f['startDate'].apply(filter_months)]

In [None]:
# filtering on the presence of the online modality
df_filtered = df_filtered[df_filtered['administration'] != 'On Campus']

Now that we have filtered the courses that respect the fees range, the countries, the start date (and the online modality??) we are going to estimate the similarity score between the queries on the features expresses by th euser and each of the filtered master.

In [None]:
# checking in the json file all the fields requested by the user to perform the query on
query = {}

if user_request['query on features']['courseName'] != "":
    query['courseName'] = user_request['query on features']['courseName']

if user_request['query on features']['universityName'] != "":
    query['universityName'] = user_request['query on features']['universityName']

if user_request['query on features']['universityCity'] != "":
    query['city'] = user_request['query on features']['universityCity']

print(query)

{'courseName': 'Healthcare management and leadership', 'universityName': 'Worcester', 'city': 'Worcester'}


Creating the inverted index of all the fields of the query

In [None]:
# creating the inverted index of all the fields requested by the user

for field in query.keys():
    df_filtered = df_filtered.copy()
    # performing on that fields the 3 steps of the preprocess
    # 1. stemming
    name = f'clean_{field}'
    df_filtered[name] = df_filtered[field].apply(stem_description)
    # 2. removing stopwords
    df_filtered[name] = df_filtered[name].apply(remove_stopwords)
    # 3. removing punctuation
    df_filtered[name] = df_filtered[name].apply(remove_punctuation)

    # creating the vocabulary of the fields
    vocabulary = Counter(reduce(lambda x,y : x+y, df_filtered[name])).keys()
    # assign an unique ID to each word of the vocabulary using a pandas dataframe
    terms = pd.DataFrame(data=list(vocabulary), columns=['term'])
    # creating a csv file for the vocabulary with index of each term
    terms.to_csv(f'vocabulary_{field}.csv', index_label='term_id')

    # create the inverse index for each term in each field
    terms['reverse'] = terms.term.apply(lambda item: list(df_filtered.loc[df_filtered[name].apply(lambda row: item in row)].index))

    # save the inverted index in a file
    InvertedIndex = terms['reverse'].to_dict()
    # store the inverted index in a txt file
    with open(f'Inverted Index {field}.txt', 'w') as file:
        for key, value in InvertedIndex.items():
            file.write(f'{key}: {value}\n')
    file.close()

Preprocessing the query for each field

In [None]:
# preprocess the query
preproc_query = {}
for field in query.keys():
    preproc_query[field] = query_preprocess(query[field])

print(preproc_query)

{'courseName': ['healthcar', 'manag', 'leadership'], 'universityName': ['worcest'], 'city': ['worcest']}


Evaluating the *tfidf* scores for each field of the query and creating the extended inverted index for each of the field.

In [None]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

# if the user specified the course name
if query['courseName']:

#Convert a collection of raw documents to a matrix of TF-Idataset features

    tfidataset = TfidfVectorizer(input='content', lowercase=False, tokenizer=lambda text: text)
    results = tfidataset.fit_transform(df_filtered.clean_courseName) # fit data to train our model (but in our case is the same dataset)
    results_dense = results.todense() # results are sparse documents that i want to convert into a dense one

    # putting all into a dataframe where the index of the dataframe is each document id
    tfidataset_data_courseName = pd.DataFrame(results_dense.tolist(), index=df_filtered.index, columns=tfidataset.get_feature_names_out())

    vocabulary_courseName = pd.read_csv('vocabulary_courseName.csv') # read the vocabulary file into a dataframe
    vocabulary_courseName = pd.DataFrame(vocabulary_courseName)
    file = open("Inverted Index courseName.txt", "r") # read the inverted index from the file.
    inv_indx_courseName = dict()
    txt = file.read().split("\n")

    for i in range(len(txt)-1):
        line = txt[i].replace(":", "").replace("[", "").replace("]", "").replace("(", "").replace(")", "").replace(",", "").split(" ")
        inv_indx_courseName[int(line[0])] = []
        for j in range(1, len(line)):
                inv_indx_courseName[int(line[0])].append(int(line[j]))
    file.close()

    # creating the second inverted index
    create_second_inverted_index(inv_indx=inv_indx_courseName, vocabulary=vocabulary_courseName, tfidataset_data=tfidataset_data_courseName, feat = 'courseName')

In [None]:
# if the user specified the university name

if query['universityName']:

    #Convert a collection of raw documents to a matrix of TF-Idataset features

    tfidataset = TfidfVectorizer(input='content', lowercase=False, tokenizer=lambda text: text)
    results = tfidataset.fit_transform(df_filtered.clean_universityName) # fit data to train our model (but in our case is the same dataset)
    results_dense = results.todense() # results are sparse documents that i want to convert into a dense one

    # putting all into a dataframe where the index of the dataframe is each document id
    tfidataset_data_universityName = pd.DataFrame(results_dense.tolist(), index=df_filtered.index, columns=tfidataset.get_feature_names_out())

    vocabulary_universityName = pd.read_csv('vocabulary_universityName.csv') # read the vocabulary file into a dataframe
    vocabulary_universityName = pd.DataFrame(vocabulary_universityName)
    file = open("Inverted Index universityName.txt", "r") # read the inverted index from the file.
    inv_indx_universityName = dict()
    txt = file.read().split("\n")

    for i in range(len(txt)-1):
        line = txt[i].replace(":", "").replace("[", "").replace("]", "").replace("(", "").replace(")", "").replace(",", "").split(" ")
        inv_indx_universityName[int(line[0])] = []
        for j in range(1, len(line)):
                inv_indx_universityName[int(line[0])].append(int(line[j]))
    file.close()

        # creating the second inverted index
    create_second_inverted_index(inv_indx= inv_indx_universityName, vocabulary=vocabulary_universityName, tfidataset_data=tfidataset_data_universityName, feat = 'universityName')



In [None]:
# if the user specified the university city

if query['city']:

    #Convert a collection of raw documents to a matrix of TF-Idataset features

    tfidataset = TfidfVectorizer(input='content', lowercase=False, tokenizer=lambda text: text)
    results = tfidataset.fit_transform(df_filtered.clean_city) # fit data to train our model (but in our case is the same dataset)
    results_dense = results.todense() # results are sparse documents that i want to convert into a dense one

    # putting all into a dataframe where the index of the dataframe is each document id
    tfidataset_data_city = pd.DataFrame(results_dense.tolist(), index=df_filtered.index, columns=tfidataset.get_feature_names_out())

    vocabulary_city = pd.read_csv('vocabulary_city.csv') # read the vocabulary file into a dataframe
    vocabulary_city = pd.DataFrame(vocabulary_city)
    file = open("Inverted Index city.txt", "r") # read the inverted index from the file.
    inv_indx_city = dict()
    txt = file.read().split("\n")

    for i in range(len(txt)-1):
        line = txt[i].replace(":", "").replace("[", "").replace("]", "").replace("(", "").replace(")", "").replace(",", "").split(" ")
        inv_indx_city[int(line[0])] = []
        for j in range(1, len(line)):
                inv_indx_city[int(line[0])].append(int(line[j]))
    file.close()

        # creating the second inverted index
    create_second_inverted_index( inv_indx= inv_indx_city, vocabulary= vocabulary_city, tfidataset_data=tfidataset_data_city, feat = 'city')

In [None]:
# read the inverted index from the file for the course name, if any

if query['courseName']:
    ext_inv_indx_courseName = read_inverted_index('courseName')

In [None]:
# read the inverted index from the file for the university name, if any

if query['universityName']:
    ext_inv_indx_universityName = read_inverted_index('universityName')

In [None]:
# read the inverted index from the file for the city name, if any

if query['city']:
    ext_inv_indx_city = read_inverted_index('city')

Now we execute the query

In [None]:
# create a vector for the query
# one vector for each part of the query

if query['courseName']:
    query_vec_courseName = create_vector_query(query = preproc_query['courseName'], vocabulary = vocabulary_courseName, tfidataset_data= tfidataset_data_courseName)


if query['universityName']:
    query_vec_univerityName = create_vector_query(query = preproc_query['universityName'], vocabulary = vocabulary_universityName, tfidataset_data= tfidataset_data_universityName)


if query['city']:
    query_vec_city = create_vector_query(query = preproc_query['city'], vocabulary = vocabulary_city, tfidataset_data= tfidataset_data_city)

# the documents matrix with all the tfidataset is the dataframe tfidataset of the field

Now we evaluate the similarity for each field of the query, obtaining as a result up to 3 dataframes (depending on the fields the user specified for the query); each resulting dataframe will contain the masters that are more similar to the query according the cosine similarity.

In [None]:
k = 5
if query['courseName']:
    heap_courseName = []
    scores_dictionary_courseName = {}

    # For every document
    for doc_index in range(1, tfidataset_data_courseName.shape[0]):
        if doc_index in tfidataset_data_courseName.index:
            doc_arr = tfidataset_data_courseName.loc[doc_index, :].values
            # Compute the angle between the doc and the query vector
            cos_sim = a_cosine_similarity(query_vec_courseName, doc_arr)

            # Put the result in the dictionary
            scores_dictionary_courseName[doc_index] = cos_sim

            # Update the heap
            heapq.heappush(heap_courseName, (cos_sim, doc_index))  # Store both score and document index in the heap
        else:
            continue

    # execute the query
    top_k_courseName = heapq.nlargest(k, heap_courseName)

    top_doc_k_courseName= []

    # fill the list of top_k_doc
    for score, doc in top_k_courseName:
        top_doc_k_courseName.append(doc)

    # adding the column 'similarity score' to the dataset
    information_needed = ['courseName','universityName','description','city', 'url']
    results_courseName = df_filtered.loc[top_doc_k_courseName , information_needed]
    results_courseName['similarity'] = [round(s[0],3) for s in top_k_courseName]


if query['universityName']:
    heap_universityName = []
    scores_dictionary_universityName = {}

    # For every document
    for doc_index in range(1, tfidataset_data_universityName.shape[0]):
        if doc_index in tfidataset_data_universityName.index:
            doc_arr = tfidataset_data_universityName.loc[doc_index, :].values
            # Compute the angle between the doc and the query vector
            cos_sim = a_cosine_similarity(query_vec_univerityName, doc_arr)

            # Put the result in the dictionary
            scores_dictionary_universityName[doc_index] = cos_sim

            # Update the heap
            heapq.heappush(heap_universityName, (cos_sim, doc_index))  # Store both score and document index in the heap
        else:
            continue

    # execute the query
    top_k_universitName = heapq.nlargest(k, heap_universityName)

    top_doc_k_universityName= []

    # fill the list of top_k_doc
    for score, doc in top_k_universitName:
        top_doc_k_universityName.append(doc)

    # adding the column 'similarity score' to the dataset
    information_needed = ['courseName','universityName','description','city', 'url']
    results_universityName = df_filtered.loc[top_doc_k_universityName , information_needed]
    results_universityName['similarity'] = [round(s[0],3) for s in top_k_universitName]


if query['city']:
    heap_city = []
    scores_dictionary_city = {}

    # For every document
    for doc_index in range(1, tfidataset_data_city.shape[0]):
        if doc_index in tfidataset_data_city.index:
            doc_arr = tfidataset_data_city.loc[doc_index, :].values
            # Compute the angle between the doc and the query vector
            cos_sim = a_cosine_similarity(query_vec_city, doc_arr)

            # Put the result in the dictionary
            scores_dictionary_city[doc_index] = cos_sim

            # Update the heap
            heapq.heappush(heap_city, (cos_sim, doc_index))  # Store both score and document index in the heap
        else:
            continue

    # execute the query
    top_k_city = heapq.nlargest(k, heap_city)

    top_doc_k_city= []

    # fill the list of top_k_doc
    for score, doc in top_k_city:
        top_doc_k_city.append(doc)

    # adding the column 'similarity score' to the dataset
    information_needed = ['courseName','universityName','description','city', 'url']
    results_city = df_filtered.loc[top_doc_k_city , information_needed]
    results_city['similarity'] = [round(s[0],3) for s in top_k_city]

In [None]:
results_courseName

Unnamed: 0,courseName,universityName,description,city,url,similarity
17,MSc Nursing,University of Essex Online,Start Date: SeptemberThe demand for skilled nu...,Colchester,https://www.findamasters.com/masters-degrees/c...,0.675
10,MSc Data Science,University of Essex Online,Start Date: OctoberUse the power of data to ma...,Colchester,https://www.findamasters.com/masters-degrees/c...,0.517
19,MSc Organisational Psychology,University of Essex Online,Start Date: OctoberMSc Organisational Psycholo...,Colchester,https://www.findamasters.com/masters-degrees/c...,0.0
16,MSc International Human Resource Management,University of Essex Online,Start Date: OctoberIn a world with an increasi...,Colchester,https://www.findamasters.com/masters-degrees/c...,0.0
15,MSc International Healthcare Management,University of Essex Online,Start Date: SeptemberHealthcare is a subject a...,Colchester,https://www.findamasters.com/masters-degrees/c...,0.0


Finally we have to aggregate our results unifying the datasets and summing the similarity score of each master if it appears in more than one result.\
The result of the query will be only those masters that have a similarity grater than 0.0

In [None]:
# sum of the scores

grouped_res = pd.concat([results_courseName, results_universityName, results_city])
result = grouped_res.groupby(grouped_res.index).agg({'similarity':'sum'})
result = result[result.similarity > 0.0]
a = result.to_dict() # the dictionary of the indexes of those masters that have summed similarity > 0.0

df_filtered['similarity'] = 0.0  # Initialize a new column with None

for ind in a['similarity'].keys():
    df_filtered.loc[ind,'similarity'] = a['similarity'][ind]

result = df_filtered.drop(['descr_stem','description_clean', 'clean_courseName', 'clean_universityName', 'clean_city'], axis = 1)
result = result[result['similarity']>0.0]
result

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,modality,duration,fees (EUR),country,city,administration,url,similarity
10,MSc Data Science,University of Essex Online,Online Masters Degree Programmes,Part Time,Start Date: OctoberUse the power of data to ma...,"October, January",MSc,2 Years Part Time,13580.402731,United Kingdom,Colchester,,https://www.findamasters.com/masters-degrees/c...,0.517
17,MSc Nursing,University of Essex Online,Online Masters Degree Programmes,Part Time,Start Date: SeptemberThe demand for skilled nu...,"September, January",MSc,2 years,13580.402731,United Kingdom,Colchester,,https://www.findamasters.com/masters-degrees/c...,0.675


# 6. Command Line Question

In the Command Line script, instructions have been included for creating the merged_courses file and for responding to the proposed questions. The results obtained align with our expectations. Look at the CommandLine.sh file in the github repository.

# 7. Algorithmic Question

In [1]:
import sys

# Read D and x from the console
D, x = map(int, input().split())

# Initialize the sums of minis and maxis
sum_mini = 0
sum_maxi = 0

minis = []
maxis = []

# Read the intervals and calculate the sums
for _ in range(D):
    mini, maxi = map(int, input().split())
    sum_mini += mini
    sum_maxi += maxi
    minis.append(mini)
    maxis.append(maxi)

# Check if x belongs to the [sum of minis, sum of maxis] interval and if x matches the sum of minis or maxis
if sum_mini <= x <= sum_maxi:
    if x == sum_mini:
        if any(val == 0 for val in minis):
            print("NO")
            sys.exit(0)
        print("YES")
        print(" ".join(map(str, minis)))
        sys.exit(0)
    elif x == sum_maxi:
        if any(val == 0 for val in maxis):
            print("NO")
            sys.exit(0)
        print("YES")
        print(" ".join(map(str, maxis)))
        sys.exit(0)
    print("YES")
else:
    print("NO")
    sys.exit(0)

# Use for loops to generate tuples
# The tuples variable will contain all the D-tuples
tuples = [()]

for i in range(D):
    new_tuples = []
    for tuple_str in tuples:
        for val in range(minis[i], maxis[i] + 1):
            new_tuples.append(tuple_str + (val,))
    tuples = new_tuples

# Print the generated tuples
for tuple_str in tuples:
    exclude_tuple = any(val == 0 for val in tuple_str)

    if not exclude_tuple:
        current_sum = sum(tuple_str)

        if x == current_sum:
            print(" ".join(map(str, tuple_str)).lstrip())

2 5
0 1
3 5
YES
1 4


In the previous code, all tuples containing one or more zeros have been excluded because the report needs to indicate the hours worked for each day and we assumed that 0 was not an acceptable answer.

We analyze the time complexity of this script:

*   *Reading intervals*: Requires O(D) operations;
*   *Tuple generation*: For each interval, a for loop is executed with up to (maxi - mini + 1) iterations. This loop is repeated for each interval, so the total complexity of this part is O((maxi - mini + 1)^D). If we consider that (maxi - mini) is somehow bounded by a constant, we can approximate this complexity to O(M^D), where M represents the maximum range among all intervals.;
*   *Filtering tuples containing 0, calculating the sum, and comparing with x*: This requires O(M^D) operations in the worst case.

Therefore, the total complexity is O(M^D). This complexity can become prohibitive for high values of D, but it can be manageable if the number of intervals and the width of the intervals are relatively small.

ChatGPT analysis:

The time complexity of the provided code depends mainly on the part where tuples are generated, as that is the most costly operation.

The tuple generation part has a time complexity of O((maxis[0] - minis[0] + 1) * (maxis[1] - minis[1] + 1) * ... * (maxis[D-1] - minis[D-1] + 1)), where D is the number of iterations (number of tuples). Therefore, the time complexity of this part of the code is exponential with respect to D.

The rest of the code, in terms of sums and comparisons, has a linear time complexity in relation to D.

Thus, the overall time complexity of the code is dominated by the tuple generation and can be approximated as O((maxis[0] - minis[0] + 1) * (maxis[1] - minis[1] + 1) * ... * (maxis[D-1] - minis[D-1] + 1)).

This exponential complexity might make the program inefficient for significantly large values of D.



So, the two analyses arrive at the same conclusions but with different notations.

We submitted the previous code to ChatGPT to improve its efficiency, and it suggests replacing the for loops with itertools.product to generate all tuples more efficiently.

Optimized Code:

In [3]:
import sys
from itertools import product

# Read D and x from the console
D, x = map(int, input().split())

# Initialize the sums of minis and maxis
sum_mini = 0
sum_maxi = 0

minis = []
maxis = []

# Read the intervals and calculate the sums
for _ in range(D):
    mini, maxi = map(int, input().split())
    sum_mini += mini
    sum_maxi += maxi
    minis.append(mini)
    maxis.append(maxi)

# Check if x belongs to the [sum of minis, sum of maxis] interval and if x matches the sum of minis or maxis
if sum_mini <= x <= sum_maxi:
    if x == sum_mini:
        if any(val == 0 for val in minis):
            print("NO")
            sys.exit(0)
        print("YES")
        print(" ".join(map(str, minis)))
        sys.exit(0)
    elif x == sum_maxi:
        if any(val == 0 for val in maxis):
            print("NO")
            sys.exit(0)
        print("YES")
        print(" ".join(map(str, maxis)))
        sys.exit(0)
    print("YES")
else:
    print("NO")
    sys.exit(0)

# Use itertools.product to generate tuples
tuples = product(*(range(mini, maxi + 1) for mini, maxi in zip(minis, maxis)))

# Print the generated tuples
for tuple_str in tuples:
    exclude_tuple = any(val == 0 for val in tuple_str)

    if not exclude_tuple:
        current_sum = sum(tuple_str)

        if x == current_sum:
            print(" ".join(map(str, tuple_str)).lstrip())

2 5
0 1
3 5
YES
1 4
