# 1. Data collection

### 1.1 Get the list of master's degree courses

In [6]:
from bs4 import BeautifulSoup
import os
import pandas as pd
import time
import random
import glob
from nltk.stem import *
from nltk.corpus import stopwords
from collections import Counter
from functools import reduce
import heapq

In [15]:
from defs import *

Via the HTTP GET request we retrieve the content of the url of our interest, in our case the page that contains all the masters.

In [4]:
# URL of the listing page that contains all the MSc courses
url = 'https://www.findamasters.com/masters-degrees/msc-degrees/'
# A timeout keeps the cell from hanging forever if the server never answers
# (requests waits indefinitely when no timeout is given).
result = requests.get(url, timeout=30)

To extract all the links of the master's degree of the first 400 pages we used a function, which is located in the *defs.py* file; then we store all the links in the file *masters_urls.txt*.

In [None]:
# Collect the course entries of the first 400 listing pages and save them to a text file.

num_pages = 400
pref = 'https://www.findamasters.com'
listing_url = pref + '/masters-degrees/msc-degrees/?PG='

# extract_masters (from defs.py) is called once per listing page; its results are accumulated here
test_lst_all = []
for page_number in range(1, num_pages + 1):
    test_lst_all += extract_masters(listing_url + str(page_number))

# Write one line per course to masters_urls.txt.
# Only the first element of each entry is written — presumably the course URL; verify against extract_masters.
with open('masters_urls.txt', 'w') as f:
    f.writelines(item[0] + '\n' for item in test_lst_all)

### 1.2 Crawl master's degree pages

We are going to save the HTML page of each course in a folder named after the listing page it appears on; so we'll obtain 400 folders, each of which will contain 15 HTML files. The urls are taken from the *masters_urls.txt* file previously created.


In [None]:
# settings for the User-Agent to simulate a browser
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
folder_name = "html_pages" # root folder that will contain the html pages
courses_per_page = 15      # number of courses listed on every results page
os.makedirs(folder_name, exist_ok=True)

for i in range(1, 401):
    # create a folder for each page, from 1 to 400
    name = 'HTML page ' + str(i)
    path_folder = os.path.join(folder_name, name)
    os.makedirs(path_folder, exist_ok=True)

# open the file containing the urls
with open('masters_urls.txt', 'r') as file:
    for index, url in enumerate(file):
        url = url.strip()
        if not url:
            # a blank line would otherwise download the site root and store it as a course
            continue
        # listing page the course belongs to (1-based)
        page = (index // courses_per_page) + 1
        # the complete url
        full_url = "https://www.findamasters.com" + url
        try:
            # add a delay of 1 to 5 seconds between the requests
            time.sleep(1 + random.uniform(0, 4))
            # request to obtain the content of the url; the timeout prevents one
            # unresponsive request from blocking the whole crawl
            response = requests.get(full_url, headers=headers, timeout=30)

            if response.status_code == 200:
                # parse the HTML with BeautifulSoup
                soup = BeautifulSoup(response.content, 'html.parser')

                # save the html of the course in a separate file in the folder of the page it belongs to.
                # os.path.join is used instead of a hard-coded backslash so the path is valid on every OS.
                page_folder = os.path.join(folder_name, f"HTML page {page}")
                # covers url files with more than 400 pages worth of courses
                os.makedirs(page_folder, exist_ok=True)
                file_path = os.path.join(page_folder, f"course {index+1}.html")
                with open(file_path, "w", encoding="utf-8") as html_file:
                    html_file.write(str(soup))

            else:
                print(f"Errore nel recuperare la pagina del corso: {full_url}")

        except Exception as e:
            # best effort: report the failure and continue with the next course
            print(f"Errore durante il recupero e salvataggio della pagina {full_url}: {e}")


### 1.3 Parse download pages

Through the *extract_msc_page* function (located in *defs.py*) we parse all the HTML files we retrieved before and collect all the information for each master.

In [None]:
# Directory that contains one sub-folder of HTML files per listing page.
# Raw strings keep the backslashes literal without relying on invalid escape
# sequences such as "\h" or "\P" (same values as before, no syntax warnings).
# NOTE(review): the crawling cell saves the pages under "html_pages" (underscore),
# while this cell reads "html-pages" (hyphen) under an absolute local path — confirm the folder name.
html_folder = r"\html-pages"
my_path = r"D:\Primo Semestre\ADM\HW3"
# List to contain all the information
all_master_info = []

all_url = []

# Iterating within the HTML folder of every page
for page_folder in os.listdir(my_path + html_folder):
    page_path = os.path.join(html_folder, page_folder)
    file_absolute_path = my_path + page_path

    # skip anything that is not a directory
    if os.path.isdir(file_absolute_path):
        # Iterating over the HTML files of every folder
        for file in os.listdir(file_absolute_path):
            if file.endswith(".html"):
                file_path = os.path.join(file_absolute_path, file)
                print('FILE PATH: ', file_path)
                # Applying the function extract_msc_page to every HTML file.
                # extend() adds the items of the returned value one by one —
                # presumably a list of dicts (one per course); verify in defs.py.
                master_info = extract_msc_page(file_path)
                all_master_info.extend(master_info)

print(all_master_info)

Then we store this information in tsv files, one for each master. All the files are stored in a folder called 'Tsv files'.

In [None]:
# creating the tsv file for each master
import csv

folder_name = "Tsv files"
os.makedirs(folder_name, exist_ok=True)
# Write into the folder that was just created (and that the loading cell globs),
# instead of a separate hard-coded absolute path that may not exist.
folder_path = folder_name

for i in range(0, len(all_master_info)):
    output_file = os.path.join(folder_path, f"course_{i+1}.tsv")
    # newline='' lets the csv module control the line endings itself
    with open(output_file, 'w', encoding='utf-8', newline='') as tsvfile:
        # csv.writer quotes fields that contain tabs, newlines or quotes, so a
        # description with such characters can no longer break the row layout
        writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
        # Extract field names from the dictionaries in 'all_master_info'
        fieldnames = list(all_master_info[i].keys())
        writer.writerow(fieldnames)                                   # write the header
        writer.writerow([str(all_master_info[i].get(field, '')) for field in fieldnames])

We can now create our dataframe, reading the data on all the tsv files we just created.

In [None]:
data_frames = []
# take all the course_i.tsv files; os.path.join keeps the pattern valid on every OS
file_name_list = glob.glob(os.path.join("Tsv files", "course_*.tsv"))
# glob returns the files in arbitrary order: sort by course number so that the
# row index (used later as document id) is the same on every run
file_name_list.sort(key=lambda p: int(os.path.splitext(os.path.basename(p))[0].split('_')[-1]))
for file in file_name_list:
    dF_tsv = pd.read_csv(file, sep='\t', header=0)      # create the data frame from the tsv file
    data_frames.append(dF_tsv)
# Concatenate all DataFrames in the list into a single DataFrame
dataset = pd.concat(data_frames, ignore_index=True)     # creating the whole dataframe from each tsv file

# do not consider the rows that have an empty description.
# read_csv loads empty cells as NaN, so comparing against '' alone would keep them.
has_description = dataset['description'].notna() & (dataset['description'].astype(str).str.strip() != '')
# reset the index so that document ids stay contiguous (0..n-1) after the filtering
dataset = dataset[has_description].reset_index(drop=True)

dataset.head()

**TODO: remove this cell later** — for simplicity we load the dataset from a JSON file here, but it should really be loaded from the tsv files.

In [8]:
# Temporary shortcut: load the dataset from a JSON file instead of the per-course tsv files.
# NOTE(review): absolute path on a local Windows drive; this assignment replaces
# any `dataset` built from the tsv files in the previous cell.
path = r"D:\Primo Semestre\ADM\HW3\university_dataset.json"
dataset= pd.read_json(path)

# 2. Search Engine

### 2.0 Preprocessing

2.0.0 Preprocessing the text\
We created 3 functions, that are present in the *defs.py*, to perform the stemming, remove the stopwords and punctuation from the *description* field of our dataset.

In [17]:
from defs import clean_description_s

ImportError: cannot import name 'clean_description_s' from 'defs' (c:\Users\anton\Documents\GitHub\ADM---HW3\defs.py)

In [13]:
# 1. stemming
# NOTE(review): 'descr_stem' is not read by steps 2 and 3 below — confirm whether
# the stemmed text was meant to be their input.
dataset['descr_stem'] = dataset['description'].apply(stem_description)

# 2. removing stopwords
# NOTE(review): the recorded run fails on this line with a NameError, and the import
# of clean_description_s from defs raised an ImportError — confirm the helper's name in defs.py.
dataset['description_clean'] = dataset['description'].apply(clean_description_s)

# 3. removing punctuation
# NOTE(review): this reads the raw 'description' column again and overwrites the column
# written in step 2, so the stopword removal is discarded — presumably the steps were
# meant to be chained; confirm the input/output types of the helpers before changing it.
dataset['description_clean'] = dataset['description'].apply(clean_description_p)

NameError: name 'clean_description_s' is not defined

2.0.1 preprocess the *fees* column

### 2.1 Conjunctive query

2.1.1 Create the index

We created the vocabulary by assigning a unique ID to each word encountered in the description field of the dataset, then created a csv file out of it to store the information.

In [None]:
# creating the vocabulary: the distinct tokens, in order of first appearance.
# A single pass over the tokens replaces the reduce(x + y) concatenation, which
# re-copied the growing list once per document (quadratic in the corpus size).
vocabulary = list(dict.fromkeys(
    token for tokens in dataset.description_clean for token in tokens
))

# assign a unique ID to each word of the vocabulary using a pandas dataframe:
# the row index of `terms` is the term id
terms = pd.DataFrame(data=vocabulary, columns=['term'])

# creating a csv file for the vocabulary with the index of each term
terms.to_csv('vocabulary.csv', index_label='term_id')

Now we can create the inverted index as a new column of the dataframe *terms* and store it in a txt file, called *Inverted Index.txt*

In [None]:
# Build the posting lists in one pass over the documents instead of re-scanning
# every description once per vocabulary term.
postings = {}
for doc_id, tokens in dataset.description_clean.items():
    # dict.fromkeys de-duplicates the tokens so a document is listed once per term
    for token in dict.fromkeys(tokens):
        postings.setdefault(token, []).append(doc_id)

# 'reverse' holds, for every term, the index labels of the documents containing it
# (in dataset order); a term that appears in no document gets an empty list
terms['reverse'] = terms.term.apply(lambda item: list(postings.get(item, [])))
terms.head()

We now transform the inverted index in a dictionary in this form\
<code> {
term_id_1:[document_1, document_2, document_4],
term_id_2:[document_1, document_3, document_5, document_6],
    ...}
</code>

In [None]:
# Dictionary form of the inverted index: {term_id: [document ids]}
InvertedIndex = terms['reverse'].to_dict()
print(InvertedIndex) # the dictionary of the inverted index 

# store the inverted index in a txt file, one "term_id: [doc ids]" entry per line
index_lines = [f'{term_id}: {doc_ids}\n' for term_id, doc_ids in InvertedIndex.items()]
with open('Inverted Index.txt', 'w') as file:
    file.writelines(index_lines)

In [None]:
# read back the inverted index from the file.
# Every line has the form "<term_id>: [<doc_id>, <doc_id>, ...]".

inv_indx = dict()
# the context manager closes the file even if a line fails to parse
with open("Inverted Index.txt", "r") as file:
    for raw_line in file:
        term_key, separator, postings_text = raw_line.strip().partition(":")
        if not separator:
            continue  # blank line
        # an empty posting list ("[]") yields an empty list instead of failing on int('')
        inv_indx[int(term_key)] = [
            int(token) for token in postings_text.strip(" []").split(",") if token.strip()
        ]

2.1.2 Execute the query

We created a function called *query_preprocess* that preprocesses the query just like we did in the preprocess of the description field.

In [None]:
#query = str(input())
# example query (the previous spelling 'knoledge' could never match a vocabulary term)
query = 'advanced knowledge'
# formatting the query with the same preprocessing applied to the descriptions
query = query_preprocess(query) 
print(query)

What we're going to do now to implement our Search Engine is:
- Find all the words of the query in the vocabulary and extract the *term_id* of each word of the query.
- Find all the documents related to each *term_id* in the Inverted Index.
- Do the intersection of the lists of documents found.

In [None]:
# read the vocabulary file into a dataframe (columns: term_id, term).
# keep_default_na=False stops pandas from turning terms such as "null" or "nan" into NaN.
vocabulary = pd.read_csv('vocabulary.csv', keep_default_na=False)

# read the inverted index from the file; every line looks like "<term_id>: [<doc_id>, ...]"
inv_indx = dict()
with open("Inverted Index.txt", "r") as file:
    for raw_line in file:
        term_key, separator, postings_text = raw_line.strip().partition(":")
        if not separator:
            continue  # blank line
        inv_indx[int(term_key)] = [
            int(token) for token in postings_text.strip(" []").split(",") if token.strip()
        ]

# find the words of the query in the vocabulary: build the word -> term_id map once
# instead of filtering the dataframe for every query word
term_to_id = dict(zip(vocabulary['term'], vocabulary['term_id']))
missing_words = [w for w in query if w not in term_to_id]

if missing_words or len(query) == 0:
    # conjunctive query: if a word appears in no document (or the query is empty),
    # no document can contain all the words
    term_ids, docs, intersection = [], [], []
else:
    term_ids = [term_to_id[w] for w in query]

    # find the documents of every term
    docs = [inv_indx.get(i, []) for i in term_ids]

    # intersect the document lists: only the documents that contain all the words
    # of the query remain (sorted for a deterministic output)
    intersection = sorted(set(docs[0]).intersection(*docs[1:]))
print('All the documents of the result:',intersection)

Now we can show the results of the query after it passed into the search engine

In [None]:
# Columns to show for every course returned by the query
information_needed = ['courseName','universityName','description', 'url']
# Select the matching documents by index label; being the last expression of the cell,
# the resulting table is displayed as the cell output
dataset.loc[intersection,information_needed] 

## 2.2 Conjunctive query & Ranking score

2.2.1 Inverted Index

We are now going to implement a ranking system, computing the *TF-IDF* for each word in each document, and then calculating the *cosine similarity* between the query vector and each one of the vectors corresponding to the documents.

In [None]:
# tf-idf 
# use the library scikit-learn: tfidf implementation VECTORIZED 
from sklearn.feature_extraction.text import TfidfVectorizer 

In [None]:
# Convert the (already tokenised) descriptions into a matrix of TF-IDF features

# The identity tokenizer keeps the tokens produced by our own preprocessing;
# token_pattern=None silences the "token_pattern will not be used" warning.
tfidf = TfidfVectorizer(input='content', lowercase=False, tokenizer=lambda text: text, token_pattern=None)
results = tfidf.fit_transform(dataset.description_clean) # fit and transform on the same dataset
# the result is a sparse matrix; toarray() gives a plain ndarray, which the DataFrame
# can wrap directly without building nested Python lists first
results_dense = results.toarray()

# putting everything into a dataframe: one row per document (same index as the dataset),
# one column per term
tfidf_data = pd.DataFrame(results_dense, index=dataset.index, columns=tfidf.get_feature_names_out())

Creating our second inverted index in the form:\
<code>{
term_id_1:[(document1, tfIdf_{term,document1}), (document2, tfIdf_{term,document2}), (document4, tfIdf_{term,document4}), ...],
term_id_2:[(document1, tfIdf_{term,document1}), (document3, tfIdf_{term,document3}), (document5, tfIdf_{term,document5}), (document6, tfIdf_{term,document6}), ...],
...}
</code>
And then storing it in a txt file called *Extended Inverted Index.txt*

In [None]:
# creating the second inverted index: {term_id: [(doc_id, tfidf), ...]}
from collections import defaultdict
extended_inverted_index = defaultdict(list)

# term_id -> word map and set of tf-idf columns, built once instead of
# filtering the vocabulary dataframe for every (term, document) pair
term_by_id = dict(zip(vocabulary['term_id'], vocabulary['term']))
tfidf_terms = set(tfidf_data.columns)

# Iterate through each term in the inverted index
for term_id, doc_indices in inv_indx.items():
    word = term_by_id.get(term_id)
    if word not in tfidf_terms:  # the word has no tf-idf column: skip the term
        continue
    term_scores = tfidf_data[word]  # tf-idf of this word in every document

    # Iterate through each document index for the current term
    for doc_index in doc_indices:
        # Append a tuple of (document_index, TF-IDF score); float() stores a plain Python
        # number, so the text dump below never contains a NumPy scalar repr such as
        # "np.float64(0.12)", which the reader of the file could not parse
        extended_inverted_index[term_id].append((doc_index, float(term_scores.loc[doc_index])))

# Convert the extended inverted index defaultdict to a regular dictionary
extended_inverted_index = dict(extended_inverted_index)
print(extended_inverted_index)

# save the extended inverted dictionary in a txt file as before,
# one "term_id: [(doc_id, tfidf), ...]" entry per line
with open('Extended Inverted Index.txt', 'w') as file:
    for key, value in extended_inverted_index.items():
        file.write(f'{key}: {value}\n')

In [None]:
# read the extended inverted index from the file.
# Every line has the form "<term_id>: [(<doc_id>, <tfidf>), (<doc_id>, <tfidf>), ...]".

ext_inv_indx = dict()
# the context manager closes the file even if a line fails to parse
with open("Extended Inverted Index.txt", "r") as file:
    for raw_line in file:
        term_key, separator, payload = raw_line.strip().partition(":")
        if not separator:
            continue  # blank line
        # tolerate dumps in which the scores were written as NumPy scalar reprs
        payload = payload.replace("np.float64", "")
        for bracket in "[]()":
            payload = payload.replace(bracket, "")
        # what is left is a flat "doc, score, doc, score, ..." sequence
        fields = [token.strip() for token in payload.split(",") if token.strip()]
        # an empty posting list yields an empty list instead of failing on int('')
        ext_inv_indx[int(term_key)] = [
            (int(fields[pos]), float(fields[pos + 1])) for pos in range(0, len(fields), 2)
        ]

2.2.2 Execute the query

We created the query vector, putting a 1 if the word corresponding to the position is present in the query, 0 if it is not.\
The vector for each description is each row of the dataframe tfidf, so no need to compute it for each document.

In [None]:
# create a vector for the query.
# It must live in the same space as the document vectors, i.e. position j has to refer
# to the j-th column of tfidf_data. Indexing by term_id (vocabulary order) would place
# the 1s at positions that correspond to different words in the document vectors.
query_terms = set(query)
# 1.0 where the column's word is in the query, 0.0 elsewhere
query_vec = tfidf_data.columns.isin(query_terms).astype(float)

# the documents matrix with all the tfidf values is the dataframe tfidf_data

Now we need the cosine similarity function that we wrote in *defs.py*, which simply applies the definition of the cosine similarity between two vectors separated by the angle $\phi$:

$cos(\phi) = \frac{\vec{q} \cdot \vec{d}}{|{\vec{q}}| \cdot |{\vec{d}}|}$

Instead of using a list to store the similarity scores, we can use a heap structure, to make the sorting more efficient from a computational point of view.

In [None]:
heap = []
scores_dictionary = {}

# For every document: pair each index label with its row of tf-idf values.
# Iterating over the labels (instead of range(n) with .loc) also works when the
# index is not exactly 0..n-1, and avoids a label lookup per row.
for doc_index, doc_arr in zip(tfidf_data.index, tfidf_data.to_numpy()):
    # Compute the cosine similarity between the query vector and the document vector
    # NOTE(review): a document with an all-zero vector may give nan here, depending on
    # how a_cosine_similarity (defs.py) handles a zero norm — confirm.
    cos_sim = a_cosine_similarity(query_vec, doc_arr)

    # Put the result in the dictionary
    scores_dictionary[doc_index] = cos_sim

    # Update the heap
    heapq.heappush(heap, (cos_sim, doc_index))  # Store both score and document index in the heap
print(scores_dictionary)

2.2.2 Execute the query with k = 20 

In [None]:
# Number of results to return
k = 20
# The k (score, document) pairs with the highest score, best first
top_k = heapq.nlargest(k, heap)
print(top_k)

# keep only the document ids of the top-k pairs, in ranking order
top_doc_k = [doc_id for _score, doc_id in top_k]
print(top_doc_k)

The results of the query

In [None]:
# Show the top-k courses together with their similarity score.
# The column list is defined here under its real name, so the cell no longer depends
# on `information_needed` being left over from an earlier cell.
information_needed = ['courseName','universityName','description', 'url']
# .copy() makes `results` an independent frame before a new column is added to it
results = dataset.loc[top_doc_k , information_needed].copy()
# top_k holds (score, document) pairs in the same order as top_doc_k
results['similarity'] = [round(s[0],3) for s in top_k]
results