In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from sentence_transformers import SentenceTransformer


import torch
import re

In [542]:
# Limit PyTorch to 8 CPU threads for intra-op parallelism.
torch.set_num_threads(8)
# Sentence-embedding model; passed to embed_links() later in the notebook.
xlmr = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

In [2]:
# Download the page whose links are analysed below.
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of
# silently parsing an error page.
r = requests.get('https://www.epfl.ch', timeout=30)
r.raise_for_status()

In [3]:
# Parse the downloaded HTML. BeautifulSoup selects its parser through the
# second argument (`features`); `parser=` is not the parser-selection
# argument, so the parser name is passed positionally here.
soup = BeautifulSoup(r.text, 'lxml')

In [20]:
# Gather every anchor that carries an href, reduce each URL to its path words
# with clean_link, and discard links that clean down to an empty string.
# NOTE(review): clean_link is defined in a later cell of this notebook, so it
# must have been executed before this cell runs.
a_tags = soup.find_all('a', href=True)
links = [
    cleaned
    for cleaned in (clean_link(a.get('href', '')) for a in a_tags)
    if cleaned
]

# Flatten all cleaned links into one list of non-empty words.
words = [token for token in ' '.join(links).split(' ') if token]

# Keep the n_words most frequent words (value_counts sorts by frequency).
n_words = 10
word_counts = pd.Series(words).value_counts()
most_frequent_words = word_counts.iloc[:n_words].index.values

In [21]:
# Display the most frequent words found in the cleaned links.
most_frequent_words

array(['fr', 'research', 'education', 'campus', 'schools', 'services',
       'domains', 'de', 'phd', 'et'], dtype=object)

In [19]:
# Raw href values of the collected anchors, before clean_link is applied.
[a.get('href', '') for a in a_tags]

['#content',
 'https://www.epfl.ch/fr/',
 'https://www.epfl.ch/about/fr/a-propos/',
 'https://www.epfl.ch/education/fr/education/',
 'https://www.epfl.ch/research/fr/recherche/',
 'https://www.epfl.ch/innovation/fr/innovation-2/',
 'https://www.epfl.ch/schools/fr/facultes/',
 'https://www.epfl.ch/campus/fr/campus/',
 'https://www.epfl.ch/campus/security-safety/sante/coronavirus-covid-19/',
 '#',
 '#',
 '#',
 'https://www.epfl.ch/en/',
 'https://actu.epfl.ch/news/les-10-actualites-qui-ont-fait-2020',
 'https://actu.epfl.ch/news/les-10-actualites-qui-ont-fait-2020',
 'https://actu.epfl.ch/news/galactic-chloe-revele-une-autre-face-de-la-planete',
 'https://actu.epfl.ch/news/la-premiere-technologie-endovasculaire-qui-acced-2',
 'https://actu.epfl.ch/search/fr/Mediacom',
 'https://actu.epfl.ch/news/j-apprends-aux-etudiants-a-collaborer',
 'https://actu.epfl.ch/news/enseigner-me-rend-heureux',
 'https://actu.epfl.ch/news/coach-coordinateur-facilitateur-et-avant-tout-navi',
 'https://actu.epf

In [5]:
def clean_link(link):
    """Reduce a URL to a space-separated string of the words in its path.

    Steps:
      1. Remove the scheme (``http://`` / ``https://``), a literal ``www.``
         and every run of digits.
      2. Replace the separators ``-``, ``_``, ``=``, ``?`` and ``:`` with a
         space.
      3. Split on ``/`` and drop the first segment, then join the remaining
         segments with spaces and strip the ends.

    Parameters
    ----------
    link : str
        The href value to clean.

    Returns
    -------
    str
        The cleaned words, e.g. ``'https://www.epfl.ch/about/fr/a-propos/'``
        becomes ``'about fr a propos'``. Links without any ``/`` (such as
        ``'#content'``) yield an empty string. Removed digits can leave
        consecutive spaces inside the result.
    """
    # The dot is escaped so only a literal "www." is removed; an unescaped
    # "www." would also eat the first letter of words such as "wwwroot".
    link = re.sub(r"www\.|http://|https://|[0-9]+", '', link)
    link = re.sub(r"[-_=?:]", ' ', link)
    # For absolute URLs the first segment is the host name, which is dropped.
    # NOTE(review): for a relative link without a leading "/" this drops the
    # first path word instead - confirm that is acceptable.
    segments = link.split('/')[1:]
    return ' '.join(segments).strip()



In [620]:
def embed_links(soup, transformer, n_words=10):
    """Embed a page by the most frequent words appearing in its link URLs.

    Every ``<a href=...>`` in ``soup`` is cleaned with ``clean_link``, the
    cleaned links are split into words, and the ``n_words`` most frequent
    words are encoded with ``transformer``. The result is the mean of the
    per-word embeddings.

    Parameters
    ----------
    soup : object exposing ``find_all('a', href=True)``
        The parsed page (a BeautifulSoup object in this notebook).
    transformer : object exposing ``encode(list_of_str)``
        The embedding model; ``encode`` is expected to return an array with
        one row per input word.
    n_words : int, default 10
        Number of most frequent words to embed.

    Returns
    -------
    numpy.ndarray or None
        The mean embedding over the selected words (mean along axis 0), or
        ``None`` when the page yields no usable words or the encoder returns
        an empty array.
    """
    hrefs = (a.get('href', '') for a in soup.find_all('a', href=True))
    # Keep only links that still contain something after cleaning.
    links = [cleaned for cleaned in map(clean_link, hrefs) if cleaned]
    words = [w for w in ' '.join(links).split(' ') if w]
    if not words:
        return None

    # value_counts() sorts by descending frequency; take the first n_words.
    # A plain list of str is handed to the encoder rather than a NumPy
    # object array.
    most_frequent_words = (
        pd.Series(words).value_counts().iloc[:n_words].index.tolist()
    )
    if not most_frequent_words:
        return None

    links_emb = transformer.encode(most_frequent_words)
    if links_emb.size == 0:
        return None

    return links_emb.mean(axis=0)  # mean over the encoded words
    

In [621]:
# Mean embedding of the fetched page's most frequent link words, using the xlmr model.
embed_links(soup, xlmr)

array([ 1.28063127e-01,  2.72776280e-02,  2.93178320e-01, -5.74002042e-03,
        1.74842030e-01, -1.04153171e-01, -1.62600875e-01,  3.18195760e-01,
        9.75115150e-02, -2.23001748e-01,  2.85434369e-02,  2.64058448e-02,
        3.78514305e-02, -2.14019865e-02,  2.48532102e-01, -7.61218220e-02,
        1.24644950e-01,  1.04842141e-01, -1.07002789e-02, -2.32465431e-01,
       -2.60384411e-01, -1.17473744e-01,  1.27646565e-01,  8.14175606e-02,
        8.74497592e-02,  1.56778067e-01,  2.77026355e-01,  1.97998837e-01,
        1.49270609e-01, -1.45116061e-01,  1.26202300e-01, -1.98141634e-01,
        7.25111812e-02, -1.42121539e-01, -1.58592880e-01,  1.09827537e-02,
        2.75733173e-01,  3.92565131e-02, -1.48483500e-01, -2.00054236e-03,
        8.37716907e-02,  5.29490635e-02, -6.67601004e-02, -7.36343563e-02,
        8.31181332e-02,  9.59968120e-02, -2.22690910e-01, -1.26293572e-02,
       -1.43023923e-01, -7.96872526e-02,  2.54663587e-01, -8.40858892e-02,
        2.12102026e-01,  

In [6]:
# Collect the anchors with an href, clean each URL with clean_link, and keep
# only the links that are non-empty after cleaning.
a_tags = soup.find_all('a', href=True)
raw_hrefs = (a.get('href', '') for a in a_tags)
links = list(filter(None, map(clean_link, raw_hrefs)))

In [7]:
# Peek at the first ten cleaned links.
links[:10]

['fr',
 'about fr a propos',
 'education fr education',
 'research fr recherche',
 'innovation fr innovation',
 'schools fr facultes',
 'campus fr campus',
 'campus security safety sante coronavirus covid',
 'en',
 'news les  actualites qui ont fait']

In [8]:
# Flatten the cleaned links into a single list of non-empty words.
wlinks = [token for token in ' '.join(links).split(' ') if token]

In [611]:
# Ten most frequent words across all cleaned links.
# NOTE(review): the recorded output contains 'caltech', so it was presumably
# produced from a different page than the EPFL request earlier in the
# notebook - re-run to refresh.
pd.Series(wlinks).value_counts()[:10].index.values

array(['about', 'news', 'research', 'campus', 'events', 'life', 'and',
       'caltech', 'visit', 'quick'], dtype=object)