# Характеристики моделей с сайта https://huggingface.co/

In [25]:
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm as tn
from urllib.request import urlopen
import json
import pandas as pd
from datetime import datetime

In [306]:
# Write the output of `pip freeze` to requirements_to_notebook.txt.
!pip freeze > requirements_to_notebook.txt

In [2]:
def get_models():
    '''Download and parse the page of every Helsinki-NLP model on huggingface.

    The organisation page is fetched first; every model card found on it
    (``article`` elements with class ``overview-card-wrapper group``) is
    followed and the resulting page is parsed.

    Returns:
        A tuple of BeautifulSoup trees, one per model card that exposes a
        link. Cards without the expected anchor are skipped.
    '''
    cite = 'https://huggingface.co'
    url = "https://huggingface.co/Helsinki-NLP"
    # Without a timeout a single stalled connection blocks the crawl forever.
    timeout = 30
    result = []
    # One session for all requests so the connection to the host is reused.
    with requests.Session() as session:
        url_data = session.get(url, timeout=timeout)
        url_tree = BeautifulSoup(url_data.content, 'html.parser')
        data = url_tree.find_all('article', {
            'class': 'overview-card-wrapper group'
        })
        for item in tn(data):
            anchor = item.find('a', {'class': "block p-2"})
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # Card without a usable link: nothing to download.
                continue
            model = session.get(cite + href, timeout=timeout)
            result.append(BeautifulSoup(model.content, 'html.parser'))
    return tuple(result)


# Download and parse all model pages (one HTTP request per model card).
source = get_models()

  0%|          | 0/1340 [00:00<?, ?it/s]

In [57]:
# Number of model pages that were downloaded.
len(source)

1340

In [301]:
# Seconds to wait for any single HTTP request before giving up.
_REQUEST_TIMEOUT = 30


def _release_date(url):
    '''Return the date encoded in the last three dash-separated fields
    before ``.zip`` in *url* (``...-YYYY-MM-DD.zip``).

    Raises ValueError when those fields are not a valid year-month-day.
    '''
    stamp = "-".join(url.split('.zip')[0].split('-')[-3:])
    return datetime.strptime(stamp, '%Y-%m-%d').date()


def _markdown_url(line):
    '''Cut the URL out of a text line: everything from the last ``http``
    up to, but excluding, the final character of the line.

    The dropped character is presumably the closing parenthesis of a
    markdown link - TODO confirm against the README format.
    '''
    return 'http' + line.split('http')[-1][:-1]


def _size_mb(url):
    '''Return the size of *url* in MiB, taken from its Content-Length header.

    Raises ValueError when the server does not report a Content-Length.
    '''
    # Only the headers are needed; the context manager closes the
    # connection without reading the body.
    with urlopen(url, timeout=_REQUEST_TIMEOUT) as response:
        length = response.info().get('Content-Length')
    if length is None:
        raise ValueError(f'no Content-Length header for {url}')
    return round(int(length) / 2 ** 20, 2)


def _latest_from_markdown(md):
    '''Find the newest release by reading *md* as plain text.

    The text is split into sections on ``# opus``; in every section the
    first line mentioning ``.zip`` gives the release date.

    Returns ``(size, link, test)`` for the newest section, where ``test`` is
    the URL on the first line containing ``.test.`` (or None), or
    ``(None, None)`` when no dated archive is found.
    '''
    with urlopen(md, timeout=_REQUEST_TIMEOUT) as file:
        text = file.read().decode('utf-8')
    releases = {}
    for section in text.split('# opus'):
        links = [line for line in section.split('\n') if 'http' in line]
        archive = next((line for line in links if '.zip' in line), None)
        if archive is None:
            continue
        try:
            releases[_release_date(_markdown_url(archive))] = links
        except ValueError:
            # Archive name without a parsable date: ignore this section.
            continue
    if not releases:
        return None, None
    links = releases[max(releases)]
    link = _markdown_url(next(line for line in links if '.zip' in line))
    test_line = next((line for line in links if '.test.' in line), None)
    test = _markdown_url(test_line) if test_line is not None else None
    return _size_mb(link), link, test


def get_size_from_github(md):
    '''Newest release and its size for the README at URL *md*.

    The page is first parsed as HTML: the ``li`` items of the
    ``markdown-body`` article that contain ``download:`` are collected and
    the newest one whose size can be fetched is used. When the page has no
    such article, the URL is read as plain text instead.

    Returns:
        ``(size, link)`` on the HTML path, ``(size, link, test)`` on the
        plain-text path (``test`` is the test-set URL or None), or
        ``(None, None)`` when no release could be resolved. ``size`` is in
        MiB, rounded to two decimals.
    '''
    from http.client import HTTPException

    md_data = requests.get(md, timeout=_REQUEST_TIMEOUT)
    md_tree = BeautifulSoup(md_data.content, 'html.parser')
    article = md_tree.find('article', {
        'class': 'markdown-body entry-content container-lg'
    })
    if article is None:
        return _latest_from_markdown(md)

    releases = {}
    for item in article.find_all('li'):
        if 'download:' not in str(item):
            continue
        try:
            url = item.find('a').get('href')
            releases[_release_date(url)] = url
        except (AttributeError, ValueError):
            # No anchor / no href / no parsable date: skip this item.
            continue

    # Newest first; fall back to older releases whose size cannot be fetched.
    for date in sorted(releases, reverse=True):
        link = releases[date]
        try:
            return _size_mb(link), link
        except (OSError, ValueError, HTTPException):
            continue
    return None, None

In [293]:
def get_data_dicts(source):
    """Collect the metadata of every model page.

    Args:
        source: iterable of parsed model pages (BeautifulSoup trees), as
            produced by ``get_models``.

    Returns:
        A ``(result, error)`` pair. ``result`` holds one dict per parsed
        page; ``error`` holds the page trees that lack either the first
        ``ul`` of the description block or the scores block.
    """
    result, error = [], []
    for model_tree in tn(source):
        prose = model_tree.find('div', {'prose dark:prose-light'})
        # A page without the description block is reported through `error`
        # instead of aborting the whole run with AttributeError.
        records = prose.find_all('ul') if prose is not None else []
        scores = model_tree.find('div', {'max-w-full overflow-auto'})
        if not records or scores is None:
            error.append(model_tree)
            continue

        rows = records[0].find_all('li')
        key_scores = scores.find_all('thead')
        val_scores = scores.find_all('tbody')
        dct = {}
        size_url = None
        for row in rows:
            paragraph = row.find('p')
            column = paragraph if paragraph is not None else row
            if ': ' not in column.text:
                # Not a "key: value" item: keep its markup as a comment.
                if "comment" in dct:
                    dct["comment"] += ", " + str(column)
                else:
                    dct["comment"] = str(column)
                continue

            key, value = [
                part.strip() for part in column.text.split(": ")
            ][:2]
            anchor = column.find('a')
            if anchor is None:
                # The weights entry is only filled from the README lookup.
                if key != 'download original weights':
                    dct[key] = value
                continue

            # An anchor without href is treated like an unrecognised link.
            href = anchor.get('href') or ''
            target = href.strip()
            if target.endswith('.md'):
                size_url = get_size_from_github(href)
                dct[key] = href
                weights_url = size_url[1]
                dct['download original weights'] = weights_url
                if weights_url:
                    date = weights_url.split('.zip')[0]\
                        .split('-')[-3:][::-1]
                    dct['date'] = '.'.join(date)
                    dct['size (Mb)'] = size_url[0]
                else:
                    dct['date'] = None
                    dct['size (Mb)'] = None
            elif target.endswith('.txt'):
                dct[key] = href

        # size_url stays None when the page has no README (.md) link;
        # only the three-element form carries a test-set URL.
        if size_url is not None and len(size_url) == 3:
            dct['test set translations'] = size_url[2]

        dct["scores"] = []
        if key_scores and val_scores:
            ks = key_scores[0].text.split()
            vs = val_scores[0].text.split()
            for k, v in zip(ks, vs):
                if k == 'BLEU' or k == 'chr-F':
                    dct["scores"].append(f'{k}: {v}')
        if not dct["scores"]:
            # No usable table: read the scores from the list items instead.
            local_score = {}
            for record in rows:
                if 'BLEU' in record.text or 'chr-F' in record.text:
                    string = [s for s in record.text.split('\n') if s]
                    # NOTE(review): the upper bound is len(string) - 3, so
                    # trailing pairs may be dropped - confirm this is wanted.
                    local_score[string[0]] = [
                        ": ".join(string[i:i + 2])
                        for i in range(3, len(string[3:]), 2)
                    ]
            dct["scores"] = local_score
        result.append(dct)
    print(f'Успешно обработано страниц: {len(result)}')
    print(f'Необработано страниц: {len(error)}')
    return result, error


# Parse every downloaded page; pages that cannot be parsed end up in `error`.
data, error = get_data_dicts(source)

  0%|          | 0/1340 [00:00<?, ?it/s]

Успешно обработано страниц: 1339
Необработано страниц: 1


In [302]:
# Number of parsed pages and number of pages that could not be parsed.
len(data), len(error)

(1339, 1)

In [297]:
# Set of column names, collected empirically from the parsed pages.
columns = {
    # model description
    'model', 'dataset', 'pre-processing', 'comment', 'date', 'size (Mb)',
    'valid language labels',
    # links
    'OPUS readme', 'download original weights', 'test set translations',
    'test set scores',
    # evaluation
    'scores',
    # source side
    'source group', 'source language name', 'source languages',
    'source language(s)', 'source language codes',
    'raw source language(s)',
    # target side
    'target group', 'target language name', 'target languages',
    'target language(s)', 'target language codes',
    'raw target language(s)',
}

In [305]:
# Merge similar columns into one.
def get_frame(data, columns):
    """Build a DataFrame from the model dicts, merging similar columns.

    Every row starts with all of ``columns`` set to None. A value whose key
    is present in the row is stored under the merged column name when the
    key has one, otherwise under the key itself; other keys are ignored.
    """
    merged_name = {
        'source language(s)': 'source languages',
        'source language codes': 'source languages',
        'target language(s)': 'target languages',
        'target language codes': 'target languages',
        'source language name': 'source group',
        'target language name': 'target group',
    }
    template = dict.fromkeys(columns)
    rows = []
    for item in data:
        row = template.copy()
        for key, value in item.items():
            if key not in row:
                continue
            row[merged_name.get(key, key)] = value
        rows.append(row)
    return pd.DataFrame.from_dict(rows)
    

# Preview the first rows of the merged table.
get_frame(data, columns).head()

Unnamed: 0,source language name,test set translations,valid language labels,target language codes,raw target language(s),pre-processing,source group,test set scores,comment,target group,...,target languages,source language(s),download original weights,scores,model,dataset,target language name,target language(s),source languages,raw source language(s)
0,,https://object.pouta.csc.fi/Tatoeba-MT-models/...,,,ita,"normalization + SentencePiece (spm32k,spm32k)",French,https://object.pouta.csc.fi/Tatoeba-MT-models/...,,Italian,...,ita,,https://object.pouta.csc.fi/Tatoeba-MT-models/...,"[BLEU: 54.8, chr-F: 0.737]",transformer-align,,,,fra,fra
1,,https://object.pouta.csc.fi/Tatoeba-MT-models/...,>>mol<< >>ron<<,,mol ron,"normalization + SentencePiece (spm32k,spm32k)",German,https://object.pouta.csc.fi/Tatoeba-MT-models/...,<p>a sentence initial language token is requir...,Romanian,...,mol ron,,https://object.pouta.csc.fi/Tatoeba-MT-models/...,"[BLEU: 42.0, chr-F: 0.636]",transformer-align,,,,deu,deu
2,,https://object.pouta.csc.fi/Tatoeba-MT-models/...,,,,"normalization + SentencePiece (spm32k,spm32k)",Finnish,https://object.pouta.csc.fi/Tatoeba-MT-models/...,,English,...,eng,,https://object.pouta.csc.fi/Tatoeba-MT-models/...,"[BLEU: 27.1, chr-F: 0.550]",transformer-align,,,,fin,
3,,https://object.pouta.csc.fi/Tatoeba-MT-models/...,,,,"normalization + SentencePiece (spm32k,spm32k)",English,https://object.pouta.csc.fi/Tatoeba-MT-models/...,<p>a sentence initial language token is requir...,Romanian,...,mol ron,,https://object.pouta.csc.fi/Tatoeba-MT-models/...,"[BLEU: 33.5, chr-F: 0.610]",transformer-align,,,,eng,
4,,https://object.pouta.csc.fi/Tatoeba-MT-models/...,,,,"normalization + SentencePiece (spm32k,spm32k)",English,https://object.pouta.csc.fi/Tatoeba-MT-models/...,,Japanese,...,jpn,,https://object.pouta.csc.fi/Tatoeba-MT-models/...,"[BLEU: 15.2, chr-F: 0.258]",transformer-align,,,,eng,


In [300]:
# Save the table to table.xlsx.
sotr_columns = [
    'date',
    'source languages',
    'target languages',
    'source group',
    'target group',
    'size (Mb)',
    'scores',
    'model',
    'download original weights',
    'test set translations',
    'OPUS readme',
    'pre-processing',
    'comment',
    # 'dataset' is left out: uninformative
    # 'test set scores' is left out: duplication
]

df = get_frame(data, columns)
file = 'table.xlsx'
# Only the columns listed above are written, in that order.
with pd.ExcelWriter(file) as writer:
    df.to_excel(writer, sheet_name="Модели", columns=sotr_columns)

---

# Структура моделей

In [4]:
import pandas as pd
# Load the saved table; the first sheet column is used as the index.
file = 'table.xlsx'
df = pd.read_excel(file, index_col=0)

# Source and target languages, both as Series and as plain lists.
source = df['source languages']
target = df['target languages']
s = source.to_list()
t = target.to_list()

## Всего доступно языковых пар

In [17]:
# Number of rows for which both a source and a target entry exist.
min(len(s), len(t))

1339

## Уникальных языковых пар

In [20]:
# Every row as a (source, target) tuple; the set drops repeated pairs.
languages = list(zip(s, t))
uniq = set(languages)
len(uniq)

1330

## Повторы

In [26]:
from collections import Counter

# Count every (source, target) pair once instead of calling list.count()
# for each row, which is quadratic in the number of rows.
languages = list(zip(s, t))
sum(1 for count in Counter(languages).values() if count > 1)

9

## Симметричные пары

In [8]:
# Symmetric pairs: (a, b) with a != b, both names at most 5 characters long,
# for which the reverse pair (b, a) is also present. Each direction is
# listed once, in order of first appearance.
all_pairs = set(zip(s, t))  # O(1) lookup of the reverse pair
both = []
seen_both = set()           # O(1) duplicate check, mirrors `both`
for src, tgt in zip(s, t):
    if len(src) > 5 or len(tgt) > 5 or src == tgt:
        continue
    if (tgt, src) in all_pairs and (src, tgt) not in seen_both:
        seen_both.add((src, tgt))
        both.append((src, tgt))
len(both)

938

## Число симметричных пар без учёта направления

In [9]:
# Symmetric pairs counted once regardless of direction: (a, b) is kept only
# if neither (a, b) nor (b, a) has been kept before.
all_pairs = set(zip(s, t))  # O(1) lookup of the reverse pair
two = []
seen_two = set()            # O(1) duplicate check, mirrors `two`
for src, tgt in zip(s, t):
    if len(src) > 5 or len(tgt) > 5 or src == tgt:
        continue
    if (tgt, src) not in all_pairs:
        continue
    if (src, tgt) not in seen_two and (tgt, src) not in seen_two:
        seen_two.add((src, tgt))
        two.append((src, tgt))
len(two)

469

## Асимметричные

In [11]:
# Pairs outside `both` (names at most 5 characters long) for which some row
# matches exactly one side of the reversed pair: either its target equals
# our source or its source equals our target, but not both. Each pair is
# kept once regardless of direction.
both_set = set(both)
one = []
seen_one = set()  # O(1) duplicate check, mirrors `one`
for src, tgt in zip(s, t):
    if len(src) > 5 or len(tgt) > 5 or (src, tgt) in both_set:
        continue
    if (src, tgt) in seen_one or (tgt, src) in seen_one:
        continue
    # any() stops at the first matching row instead of scanning all of them.
    if any((src == t_j) != (tgt == s_j) for s_j, t_j in zip(s, t)):
        seen_one.add((src, tgt))
        one.append((src, tgt))
len(one)

194

## Комбинации языков (длина названия языков больше 5 символов)

In [12]:
# Pairs in which the source or the target entry is longer than 5 characters.
# A single pass over the rows is enough; no second loop is needed.
long = {
    (src, tgt)
    for src, tgt in zip(s, t)
    if len(src) > 5 or len(tgt) > 5
}
len(long)

198

## Однородные

In [29]:
# Pairs whose source and target are the same entry (at most 5 characters).
repeat = {
    (src, tgt)
    for src, tgt in zip(s, t)
    if len(src) <= 5 and len(tgt) <= 5 and src == tgt
}
len(repeat)

4