### Import libraries and load dataset

In [1]:
# import usual libraries
import time
import os
import gc
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch
import transformers

transformers.logging.set_verbosity_error()
os.environ["TOKENIZERS_PARALLELISM"] = "false"

device = 'gpu'

import cudf

# import lightautoml_gpu
from lightautoml_gpu.automl.presets.text_presets import TabularNLPAutoML
from lightautoml_gpu.automl.presets.gpu.text_gpu_presets import TabularNLPAutoMLGPU
from lightautoml_gpu.tasks import Task
from lightautoml_gpu.dataset.utils import roles_parser

In [2]:
# define nlp constants
N_THREADS = 4
N_FOLDS = 5
RANDOM_STATE = 42
TEST_SIZE = 0.2
TIMEOUT = 300
TARGET_NAME = 'is_good'

torch.set_num_threads(N_THREADS)
torch.cuda.empty_cache()

In [3]:
# load bankiru dataset
DATASET_FULLNAME = '../../data/nlp/bankiru_isgood.csv'

# here only 1000 samples are used for time reasons (for a detailed check, one needs to use larger number:
# 100k-500k)
data = pd.read_csv(DATASET_FULLNAME)[["message", "title", "is_good"]].fillna("")[:1000]

In [4]:
# split data
tr_data, te_data = train_test_split(data,
        test_size=TEST_SIZE,
        stratify=data[TARGET_NAME],
        random_state=RANDOM_STATE
    )
print(data.head())
tr_data = pd.DataFrame(data, index=[i for i in range(tr_data.shape[0])])
te_data = pd.DataFrame(data, index=[i for i in range(te_data.shape[0])])

print(f'Data splitted. Parts sizes: tr_data = {tr_data.shape}, te_data = {te_data.shape}')

                                             message  \
0                                      Здравствуйте.   
1  https://ibb.co/qdvV4Fyhttps://ibb.co/Pzqxd2vht...   
2  Добрый день! Сегодня я обращалась на горячую л...   
3  31 марта 2021 года заключил договор рефинансир...   
4  Заметил, что с моей кредитной карты банка Тинь...   

                                               title  is_good  
0                                   Создание шаблона        0  
1                                             Кизляр        1  
2                               Хорошее обслуживание        1  
3  Пинают меня в страховую, страховая обратно в банк        0  
4               Списание средств сторонними услугами        1  
Data splitted. Parts sizes: tr_data = (800, 3), te_data = (200, 3)


In [5]:
# define task and roles
task = Task('binary', device=device)

roles = {
    'text': ['message', 'title'],
    'target': TARGET_NAME,
}
print(roles_parser(roles))

{'message': 'text', 'title': 'text', 'is_good': 'target'}


In [6]:
def run_automl(automl, tr_data, te_data):
    t0 = time.time()
    oof_pred = automl.fit_predict(tr_data, roles=roles, verbose=1)
    t1 = time.time()
    print('Elapsed time (train): {}'.format(t1 - t0))

    t0 = time.time()
    te_pred = automl.predict(te_data)
    t1 = time.time()
    print('Elapsed time (test): {}'.format(t1 - t0))

    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)
    print(f'OOF score: {roc_auc_score(tr_data[TARGET_NAME].values[not_nan], oof_pred.data[not_nan][:, 0])}')
    print(f'TEST score: {roc_auc_score(te_data[TARGET_NAME].values, te_pred.data[:, 0])}')

### linear_l2 model with different text features

#### tfidf text features

In [7]:
n_components = 100
n_oversample = 0
ngram = (1, 1)

automl = TabularNLPAutoMLGPU(task=task,
            timeout=600,
            cpu_limit=1,
            gpu_ids='0',
            client=None,
            general_params={
                'nested_cv': False,
                'use_algos': [['linear_l2']]
            },
            reader_params={
                'npartitions': 2
            },
            text_params={
                'lang': 'ru',
                'verbose': False,
                'use_stem': False,
            },
            tfidf_params={
                'n_components': n_components,
                'n_oversample': n_oversample,
                'tfidf_params': {'ngram_range': ngram}
            },
            linear_pipeline_params={
                'text_features': "tfidf"
            }
            )

In [8]:
run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

[19:28:40] Stdout logging level is INFO.
[19:28:40] Task: binary

[19:28:40] Start automl preset with listed constraints:
[19:28:40] - time: 600.00 seconds
[19:28:40] - CPU: 1 cores
[19:28:40] - memory: 16 GB

[19:28:40] Train data shape: (800, 3)
[19:28:40] Layer [1m1[0m train process start. Time left 599.96 secs
[19:28:43] Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...
[19:28:43] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m (orig) =====
[19:28:43] Linear model: C = 1e-05 score = 0.8947904706001282
[19:28:43] Linear model: C = 5e-05 score = 0.887622058391571
[19:28:43] Linear model: C = 0.0001 score = 0.8875912427902222
[19:28:43] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m (orig) =====
[19:28:43] Linear model: C = 1e-05 score = 0.8541589379310608
[19:28:43] Linear model: C = 5e-05 score = 0.8557347059249878
[19:28:43] Linear model: C = 0.0001 score = 0.8557347059249878
[19:28:44] Linear model: C = 0.0005 score = 

814

#### tfidf_subword features

The following __text_params__ work only with __tfidf_subword__ text features:   
__vocab_path__ - path to vocabulary .txt file,  
__data_path__ - .txt file (saved pd.Series) for the tokenizer to be trained on (if vocab is not specified)  
__is_hash__ - True means vocab is not raw vocab but was transformed with hash_vocab function from cudf,  
__max_length__ - max number of tokens to leave in one text (exceeding ones would be truncated)  
__tokenizer__ - ["bpe" or "wordpiece"] if vocab is None. Type of tokenizer to be trained  
__vocab_size__ - vocabulary size for trained tokenizer  
__save_path__ - path where trained vocabulary would be saved to  

Overall, there are 3 possible scenarios to run tfidf_subword text features:  
1) __vocab_path__ is defined, __is_hash__ = True. It means that __vocab_path__ contains path to a hashed version of vocabulary. No additional transformation is needed. This is the optimal usage (all vocabulary pre-processing was done in advance).
2) __vocab_path__ is defined, __is_hash__ = False. __vocab_path__ contains path to a vocabulary with raw words, it needs to be transformed to a hash version. This is the second fastest option.
3) __vocab_path__ is not defined, __data_path__ is defined (with additional parameters __tokenizer__, __vocab_size__ and __save_path__). Only .txt file of a dataframe is available. Note, that it works not with a dataframe itself but with its .txt version. One should be careful with tokenizer settings. Recommended way is to study the dataset in advance, tweak tokenizer settings and create the vocabulary aside from LAMA pipeline. The quality of __tfidf_subword__ text features highly depend on the quality of the used tokenizer. 

Prepare data for all scenarios. Imagine that only pd.Series of text data is available.

In [9]:
import codecs

In [10]:
# Note: how to create .txt dataframe (one should save only text corpus)
# This is an example, it is not necessary to run it

# Step 1. Choose your representative text data and save it to .txt file. Here only one column of text dataset 
# is taken but sometimes it might be a good idea to concatenate all text columns instead of choosing one.
data_text = data['message']
file_data_text = 'bankiru_isgood_test.txt'
with codecs.open(file_data_text, 'w+', 'utf-8') as f:
    for i in range(len(data_text)):
        f.write(data_text.iloc[i] + '\n')

In [11]:
# Note: how to use huggingface tokenizer to create vocabulary from .txt dataframe
# This is an example, it is not necessary to run it

# Step 2. Having a text data file, train token vocabulary.
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer, WordPieceTrainer

tokenizer = 'bpe' # or 'wordpiece'
vocab_size = 30000
data_path = file_data_text # path to a .txt pd.Series of text data
vocab_save_path = f"{tokenizer}_{vocab_size // 1000}k_test.txt"

if tokenizer == "bpe":
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(
        vocab_size=vocab_size, special_tokens=["[UNK]", "[SEP]", "[CLS]"]
    )
else:
    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    trainer = WordPieceTrainer(
        vocab_size=vocab_size, special_tokens=["[UNK]", "[SEP]", "[CLS]"]
    )
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer.pre_tokenizer = Whitespace()

tokenizer.train([data_path], trainer) # train tokenizer on out .txt text data
trained_vocab = tokenizer.get_vocab()

# save trained vocabulary to a .txt file
with codecs.open(vocab_save_path, 'w+', 'utf-8') as f:
    for key in trained_vocab.keys():
        f.write(key + '\n')






In [12]:
# Note: how to create hash vocabulary from word .txt vocabulary
# This is an example, it is not necessary to run it

# Step 3. Having .txt vocabulary file, create a hashed version of it which would be used by 
# cudf.SubwordTokenizer
from cudf.utils.hash_vocab_utils import hash_vocab

vocab_save_path_hash = vocab_save_path.split('.')[0]+'_hash.txt'
hash_vocab(vocab_save_path, vocab_save_path_hash)

Attempting to build table using 1.499947n space
Longest bin was 12
Processing bin 0 / 4702 of size = 2


  return ((a * k + b) % PRIME) % size


Processing bin 500 / 4702 of size = 3
Processing bin 1000 / 4702 of size = 3
Processing bin 1500 / 4702 of size = 3
Processing bin 2000 / 4702 of size = 3
Processing bin 2500 / 4702 of size = 6
Processing bin 3000 / 4702 of size = 4
Processing bin 3500 / 4702 of size = 6
Processing bin 4000 / 4702 of size = 2
Processing bin 4500 / 4702 of size = 2
Final table size 18810 elements compared to 18810 for original
Max bin length was 12
All present tokens return correct value.


In [13]:
# Alternative Step 1-2. Download existing vocabulary (one could use data from huggingfsce models).

# Download standard bert English vocabulary
!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt
bert_vocab_en_path = 'bert-base-uncased-vocab.txt'
# Download bert Russian vocabulary
!wget https://s3.amazonaws.com/models.huggingface.co/bert/DeepPavlov/rubert-base-cased/vocab.txt
bert_vocab_ru_path = 'vocab.txt'

--2022-11-21 19:28:53--  https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.132.77, 52.216.136.166, 52.216.140.22, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.132.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 231508 (226K) [text/plain]
Saving to: ‘bert-base-uncased-vocab.txt’


2022-11-21 19:28:55 (506 KB/s) - ‘bert-base-uncased-vocab.txt’ saved [231508/231508]

--2022-11-21 19:28:55--  https://s3.amazonaws.com/models.huggingface.co/bert/DeepPavlov/rubert-base-cased/vocab.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.140.22, 54.231.165.104, 52.216.136.166, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.140.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1649718 (1,6M) [text/plain]
Saving to: ‘vocab.txt’


2022-11-21 19:29:00 (366 KB/s) - ‘vocab.txt’ saved [1649718/1649718]



In [14]:
# True to use data generated in this notebook, False to use data available in zip dataset archive
use_test_data = True 

if use_test_data:
    bankiru_info = {'path': '../../data/nlp/bankiru_isgood.csv',
                    'text_roles': ['message', 'title'],
                    'target': 'is_good',
                    'task': 'binary',
                    'lang': 'ru',
                    'csv2text': file_data_text,
                    'vocab_path': vocab_save_path,
                    'vocab_hash_path': vocab_save_path_hash
    }
else:
    bankiru_info = {'path': '../../data/nlp/bankiru_isgood.csv',
                    'text_roles': ['message', 'title'],
                    'target': 'is_good',
                    'task': 'binary',
                    'lang': 'ru',
                    'csv2text': '../../data/nlp/csv2text/bankiru_isgood.txt',
                    'vocab_path': '../../data/nlp/vocab/bankiru_isgood_vocab.txt',
                    'vocab_hash_path': '../../data/nlp/vocab_hash/bankiru_isgood_vocab_hash.txt'
    }

In [15]:
# scenario 1
automl = TabularNLPAutoMLGPU(task=task, 
                              timeout=600, 
                              cpu_limit=1, 
                              gpu_ids='0', 
                              client=None,
                              general_params={
                                  'nested_cv': False,
                                  'use_algos': [['linear_l2']]
                              },
                              reader_params={
                                  'npartitions': 2
                              },
                              text_params={
                                  'lang': 'ru',
                                  'verbose': False,
                                  'use_stem': False,
                                  'vocab_path': bankiru_info['vocab_hash_path'],
                                  'is_hash': True,
                                  # 'data_path': file_name,
                                  # 'tokenizer': "bpe",
                                  # 'vocab_size': 30000
                              },
                              tfidf_params={
                                  'n_components': n_components,
                                  'n_oversample': n_oversample,
                                  'tfidf_params': {'ngram_range': ngram}
                              },
                              linear_pipeline_params={
                                  'text_features': 'tfidf_subword'
                              },
                              )

In [16]:
run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

[19:29:00] Stdout logging level is INFO.
[19:29:00] Task: binary

[19:29:00] Start automl preset with listed constraints:
[19:29:00] - time: 600.00 seconds
[19:29:00] - CPU: 1 cores
[19:29:00] - memory: 16 GB

[19:29:00] Train data shape: (800, 3)
[19:29:00] Layer [1m1[0m train process start. Time left 599.98 secs
[19:29:02] Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...
[19:29:02] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m (orig) =====
[19:29:02] Linear model: C = 1e-05 score = 0.8910208940505981
[19:29:02] Linear model: C = 5e-05 score = 0.8903102874755859
[19:29:02] Linear model: C = 0.0001 score = 0.8903720378875732
[19:29:02] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m (orig) =====
[19:29:02] Linear model: C = 1e-05 score = 0.8680941462516785
[19:29:02] Linear model: C = 5e-05 score = 0.8635520935058594
[19:29:02] Linear model: C = 0.0001 score = 0.8636138439178467
[19:29:02] ===== Start working with [1mfol

800

In [17]:
# scenario 2
automl = TabularNLPAutoMLGPU(task=task, 
                              timeout=600, 
                              cpu_limit=1, 
                              gpu_ids='0', 
                              client=None,
                              general_params={
                                  'nested_cv': False,
                                  'use_algos': [['linear_l2']]
                              },
                              reader_params={
                                  'npartitions': 2
                              },
                              text_params={
                                  'lang': 'ru',
                                  'verbose': False,
                                  'use_stem': False,
                                  'vocab_path': bankiru_info['vocab_path'],
                                  'is_hash': False,
                                  # 'data_path': file_name,
                                  # 'tokenizer': "bpe",
                                  # 'vocab_size': 30000
                              },
                              tfidf_params={
                                  'n_components': n_components,
                                  'n_oversample': n_oversample,
                                  'tfidf_params': {'ngram_range': ngram}
                              },
                              linear_pipeline_params={
                                  'text_features': 'tfidf_subword'
                              },
                              )

In [18]:
run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

[19:29:03] Stdout logging level is INFO.
[19:29:03] Task: binary

[19:29:03] Start automl preset with listed constraints:
[19:29:03] - time: 600.00 seconds
[19:29:03] - CPU: 1 cores
[19:29:03] - memory: 16 GB

[19:29:03] Train data shape: (800, 3)
[19:29:03] Layer [1m1[0m train process start. Time left 599.98 secs
Attempting to build table using 1.499947n space
Longest bin was 12
Processing bin 0 / 4702 of size = 2


  return ((a * k + b) % PRIME) % size


Processing bin 500 / 4702 of size = 3
Processing bin 1000 / 4702 of size = 3
Processing bin 1500 / 4702 of size = 3
Processing bin 2000 / 4702 of size = 3
Processing bin 2500 / 4702 of size = 6
Processing bin 3000 / 4702 of size = 4
Processing bin 3500 / 4702 of size = 6
Processing bin 4000 / 4702 of size = 2
Processing bin 4500 / 4702 of size = 2
Final table size 18810 elements compared to 18810 for original
Max bin length was 12
All present tokens return correct value.
[19:29:12] Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...
[19:29:12] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m (orig) =====
[19:29:12] Linear model: C = 1e-05 score = 0.8885180354118347
[19:29:12] Linear model: C = 5e-05 score = 0.8906809687614441
[19:29:12] Linear model: C = 0.0001 score = 0.8904646635055542
[19:29:12] Linear model: C = 0.0005 score = 0.8903411030769348
[19:29:12] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m (orig) =====
[19:29:12

788

In [19]:
# scenario 3
automl = TabularNLPAutoMLGPU(task=task, 
                              timeout=600, 
                              cpu_limit=1, 
                              gpu_ids='0', 
                              client=None,
                              general_params={
                                  'nested_cv': False,
                                  'use_algos': [['linear_l2']]
                              },
                              reader_params={
                                  'npartitions': 2
                              },
                              text_params={
                                  'lang': 'ru',
                                  'verbose': False,
                                  'use_stem': False,
                                  'vocab_path': None,
                                  'data_path': bankiru_info['csv2text'],
                                  'tokenizer': "bpe",
                                  'vocab_size': 30000
                              },
                              tfidf_params={
                                  'n_components': n_components,
                                  'n_oversample': n_oversample,
                                  'tfidf_params': {'ngram_range': ngram}
                              },
                              linear_pipeline_params={
                                  'text_features': 'tfidf_subword'
                              },
                              )

In [21]:
run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

[19:29:21] Stdout logging level is INFO.
[19:29:21] Task: binary

[19:29:21] Start automl preset with listed constraints:
[19:29:21] - time: 600.00 seconds
[19:29:21] - CPU: 1 cores
[19:29:21] - memory: 16 GB

[19:29:21] Train data shape: (800, 3)
[19:29:21] Layer [1m1[0m train process start. Time left 599.98 secs



Attempting to build table using 1.499947n space
Longest bin was 12
Processing bin 0 / 4702 of size = 2


  return ((a * k + b) % PRIME) % size


Processing bin 500 / 4702 of size = 3
Processing bin 1000 / 4702 of size = 3
Processing bin 1500 / 4702 of size = 3
Processing bin 2000 / 4702 of size = 3
Processing bin 2500 / 4702 of size = 6
Processing bin 3000 / 4702 of size = 4
Processing bin 3500 / 4702 of size = 6
Processing bin 4000 / 4702 of size = 2
Processing bin 4500 / 4702 of size = 2
Final table size 18810 elements compared to 18810 for original
Max bin length was 12
All present tokens return correct value.
[19:29:30] Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...
[19:29:30] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m (orig) =====
[19:29:31] Linear model: C = 1e-05 score = 0.8909590840339661
[19:29:31] Linear model: C = 5e-05 score = 0.8903101682662964
[19:29:31] Linear model: C = 0.0001 score = 0.8903102874755859
[19:29:31] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m (orig) =====
[19:29:31] Linear model: C = 1e-05 score = 0.8661784529685974
[19:29:31]

1583

#### embed text features

In [None]:
# One should note that gensim package was removed, now only torchnlp embeddings are available of fixed
# dimensionality
model_name = 'random_lstm'

automl = TabularNLPAutoMLGPU(task=task,
                              timeout=600,
                              cpu_limit=1,
                              gpu_ids='0',
                              client=None,
                              general_params={
                                  'nested_cv': False,
                                  'use_algos': [['linear_l2']]
                              },
                              reader_params={
                                  'npartitions': 2
                              },
                              text_params={
                                  'lang': 'ru',
                                  'verbose': False,
                                  'use_stem': False
                              },
                              autonlp_params={
                                  'model_name': model_name,
                                  'sent_scaler': 'l1',
                                  'embedding_model': 'fasttext', # now this has a different meaning
                                  'cache_dir': None
                              },
                              linear_pipeline_params={
                                  'text_features': 'embed'
                              },
                              )
run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

In [None]:
model_name = 'borep'
automl = TabularNLPAutoMLGPU(task=task,
                              timeout=600,
                              cpu_limit=1,
                              gpu_ids='0',
                              client=None,
                              general_params={
                                  'nested_cv': False,
                                  'use_algos': [['linear_l2']]
                              },
                              reader_params={
                                  'npartitions': 2
                              },
                              text_params={
                                  'lang': 'ru',
                                  'verbose': False,
                                  'use_stem': False
                              },
                              autonlp_params={
                                  'model_name': model_name,
                                  'sent_scaler': None,
                                  'embedding_model': 'fasttext', # now this has a different meaning
                                  'cache_dir': None
                              },
                              linear_pipeline_params={
                                  'text_features': 'embed'
                              },
                              )

run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

In [None]:
model_name = 'random_lstm_bert'
automl = TabularNLPAutoMLGPU(task=task,
                              timeout=600,
                              cpu_limit=1,
                              gpu_ids='0',
                              client=None,
                              general_params={
                                  'nested_cv': False,
                                  'use_algos': [['linear_l2']]
                              },
                              reader_params={
                                  'npartitions': 2
                              },
                              text_params={
                                  'lang': 'ru',
                                  'verbose': False,
                                  'use_stem': False
                              },
                              autonlp_params={
                                  'model_name': model_name,
                                  'sent_scaler': 'l2',
                                  'embedding_model': 'fasttext', # now this has a different meaning
                                  'cache_dir': None
                              },
                              linear_pipeline_params={
                                  'text_features': 'embed'
                              },
                              )

run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

In [None]:
model_name = 'pooled_bert'
automl = TabularNLPAutoMLGPU(task=task,
                              timeout=600,
                              cpu_limit=1,
                              gpu_ids='0',
                              client=None,
                              general_params={
                                  'nested_cv': False,
                                  'use_algos': [['linear_l2']]
                              },
                              reader_params={
                                  'npartitions': 2
                              },
                              text_params={
                                  'lang': 'ru',
                                  'verbose': False,
                                  'use_stem': False
                              },
                              autonlp_params={
                                  'model_name': model_name,
                                  'sent_scaler': 'l2',
                                  'embedding_model': 'fasttext', # now this has a different meaning
                                  'cache_dir': None
                              },
                              linear_pipeline_params={
                                  'text_features': 'embed'
                              },
                              )

run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

In [None]:
model_name = 'wat'
automl = TabularNLPAutoMLGPU(task=task,
                              timeout=600,
                              cpu_limit=1,
                              gpu_ids='0',
                              client=None,
                              general_params={
                                  'nested_cv': False,
                                  'use_algos': [['linear_l2']]
                              },
                              reader_params={
                                  'npartitions': 2
                              },
                              text_params={
                                  'lang': 'ru',
                                  'verbose': False,
                                  'use_stem': False
                              },
                              autonlp_params={
                                  'model_name': model_name,
                                  'sent_scaler': None,
                                  'embedding_model': 'fasttext', # now this has a different meaning
                                  'cache_dir': None
                              },
                              linear_pipeline_params={
                                  'text_features': 'embed'
                              },
                              )

run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

### catboost and xgb algos with tfidf text_features

In [None]:
# catboost
n_components = 100
n_oversample = 0
ngram = (1, 1)

automl = TabularNLPAutoMLGPU(task=task,
            timeout=600,
            cpu_limit=1,
            gpu_ids='0',
            client=None,
            general_params={
                'nested_cv': False,
                'use_algos': [['cb']]
            },
            reader_params={
                'npartitions': 2
            },
            text_params={
                'lang': 'ru',
                'verbose': False,
                'use_stem': False,
            },
            tfidf_params={
                'n_components': n_components,
                'n_oversample': n_oversample,
                'tfidf_params': {'ngram_range': ngram}
            },
            linear_pipeline_params={
                'text_features': "tfidf"
            }
            )

run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()

In [None]:
# xgboost
n_components = 100
n_oversample = 0
ngram = (1, 1)

automl = TabularNLPAutoMLGPU(task=task,
            timeout=600,
            cpu_limit=1,
            gpu_ids='0',
            client=None,
            general_params={
                'nested_cv': False,
                'use_algos': [['xgb']]
            },
            reader_params={
                'npartitions': 2
            },
            text_params={
                'lang': 'ru',
                'verbose': False,
                'use_stem': False,
            },
            tfidf_params={
                'n_components': n_components,
                'n_oversample': n_oversample,
                'tfidf_params': {'ngram_range': ngram}
            },
            linear_pipeline_params={
                'text_features': "tfidf"
            }
            )

run_automl(automl, tr_data, te_data)
torch.cuda.empty_cache()
gc.collect()