# Citation project

In [1]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m220.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=db3dda53772080eb93c8cd0439f8461af7a2453144e374884fef459e205f9783
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2
[0m

In [70]:
import pickle
import re
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump

from catboost import CatBoostClassifier, Pool
import torch
import transformers as ppb
import warnings
from typing import Union
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

Original of this notebook: https://www.kaggle.com/shamankovnikolay/classification/edit

## Data Preparation

In [3]:
path = '../input/cndbv13/test1.json'
df = pd.read_json(path, lines=True, chunksize = 450000)
df = next(iter(df))


In [4]:
with open('../input/labels-idx/idx_labels.pickle', 'rb') as idx_f:
    data_idx = pickle.load(idx_f)

In [5]:
data = dict(data_idx)
list(data.keys())[0:10]

[40, 42, 43, 48, 50, 52, 53, 58, 63, 64]

In [6]:
#prepare data
texts_with_empty = np.array(df['abstract'])
print(type(texts_with_empty))
texts = texts_with_empty[list(data.keys())]
assert len(texts) == len(data_idx)

<class 'numpy.ndarray'>


In [7]:
new_df = pd.DataFrame(zip(texts, list(data.values())), columns=['abstracts', 'idxs'])

In [8]:
new_df.head()

Unnamed: 0,abstracts,idxs
0,Drought is the first place in all the natural ...,23
1,As process variations become a significant pro...,22
2,360° represents the concerns that are addresse...,24
3,"""2BTextures"", a two-movement audio/visual expe...",14
4,Constructing a system that can cope with a dyn...,7


## Embedding

In [9]:
# model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
# tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
# model = model_class.from_pretrained(pretrained_weights)

In [10]:
model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [11]:
abstr = list(new_df['abstracts'])

In [12]:
texts_embeddings = model.encode(abstr, batch_size=128, show_progress_bar=True, convert_to_tensor=False)

Batches:   0%|          | 0/2453 [00:00<?, ?it/s]

In [13]:
#tokenized = abstr.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))   # Очень долго!

In [14]:
m = np.matrix(texts_embeddings)

In [15]:
#new_df = pd.DataFrame(zip(texts_embeddings, list(data.values())), columns=['vectors', 'labels'])  #Очень долго страдал, написав эту дичь
new_df = pd.DataFrame(m)
new_df['labels'] = data.values()
assert len(texts_embeddings) == len(list(data.values()))

In [16]:
new_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,labels
0,0.005807,0.026399,0.027092,0.030168,0.045864,0.016163,0.014566,0.056342,0.022835,0.050278,...,0.048605,-0.01259,0.02914,0.01415,0.007106,-0.018236,-0.071822,-0.05743,-0.105029,23
1,-0.00799,-0.0094,0.049066,0.031651,-0.031924,-0.059153,-0.060794,0.024453,0.045883,-0.055194,...,0.039394,0.008107,-0.029349,0.015276,0.058119,-0.098453,-0.003963,-0.064039,-0.035733,22
2,0.047834,0.091496,-0.019142,-0.086522,0.071321,0.009161,-0.036008,0.011389,0.043029,-0.002499,...,0.041924,0.101367,-0.066517,-0.004177,0.05467,-0.004989,-0.008006,-0.108028,-0.013265,24
3,0.045197,-0.067831,0.100727,-0.02453,0.073523,-0.03387,-0.013235,-0.048653,0.048653,-0.049849,...,0.013418,-0.028243,0.066845,-0.015295,0.062122,0.028504,0.021931,0.013035,0.018267,14
4,-0.063706,0.029877,-0.037372,-0.029128,0.018729,-0.05306,0.065565,0.063877,0.025667,-0.003778,...,-0.029619,0.027826,-0.056821,0.051828,0.070746,0.002912,0.08446,0.031658,-0.047956,7


In [17]:
#X_tr, X_eval = train_test_split(new_df, test_size=0.1)
X = new_df.iloc[:, 0:383] 
y = new_df['labels'].squeeze()
assert len(X) == len(y)
train_vec, test_vec, train_lab, test_lab = train_test_split(X, y , test_size=0.1)

In [18]:
assert len(test_vec) == len(test_lab)

In [19]:
# import string
# def preprocessing(line):
#     line = line.lower()
#     line = re.sub(r"[{}]".format(string.punctuation), " ", line)
#     return line

## Education 

In [64]:
SklearnClassifierModel = Union[LogisticRegression, CatBoostClassifier]
def create_model(model_name: str, model_params: dict) -> SklearnClassifierModel:
    """
    :param model_name: name of the scr to be created
    :param model_params: scr parameters according to sklearn documentation
    :return: Sklearn Classifier Model
    """
    if model_name == 'catboost':
        model = CatBoostClassifier(**model_params)
    elif model_name == 'logistic_regression':
        model = LogisticRegression(**model_params)
    else:
        print(f'scr {model_name} is not supported, you can use only knn or log_reg')
        raise ValueError

    return model

In [72]:
def train_model(model_name, model_params: dict, design_matrix, labels):
    """
    :param model_name: name of the scr to be trained (no suitable for catboost!)
    :param model_params: model initial params according to it's documentation
    :param design_matrix: feature vectorized matrix
    :param labels:
    :return: pre-trained Sklearn Classifier Model
    """
    model = create_model(model_name, model_params)
    model.fit(design_matrix, labels)
#     pipe = make_pipeline(StandardScaler(), model) 
#     pipe.fit(design_matrix, labels)
    return model

### Catboost

In [40]:
assert len(train_vec) == len(train_lab)

In [41]:
train_dataset = Pool(train_vec, train_lab)
test_dataset = Pool(test_vec, test_lab)

In [50]:
catboost_params = {
    'iterations': 30, 
    'loss_function': 'MultiClass',
    'train_dir': 'crossentropy',
    'allow_writing_files': False,
    'random_seed': 42,
    'task_type': "GPU",
    'eval_metric': 'Accuracy'
}

In [51]:
model1 = create_model('catboost', catboost_params)

In [52]:
model1.fit(train_vec, 
          train_lab,
          eval_set=(test_vec, test_lab),
          verbose = True,
         )


Learning rate set to 0.5
0:	learn: 0.2436127	test: 0.2452860	best: 0.2452860 (0)	total: 126ms	remaining: 3.66s
1:	learn: 0.3231442	test: 0.3235763	best: 0.3235763 (1)	total: 229ms	remaining: 3.2s
2:	learn: 0.3853692	test: 0.3857816	best: 0.3857816 (2)	total: 338ms	remaining: 3.04s
3:	learn: 0.4420520	test: 0.4379857	best: 0.4379857 (3)	total: 442ms	remaining: 2.88s
4:	learn: 0.4855833	test: 0.4835329	best: 0.4835329 (4)	total: 553ms	remaining: 2.77s
5:	learn: 0.5213109	test: 0.5184737	best: 0.5184737 (5)	total: 689ms	remaining: 2.75s
6:	learn: 0.5579126	test: 0.5542744	best: 0.5542744 (6)	total: 828ms	remaining: 2.72s
7:	learn: 0.5834015	test: 0.5811887	best: 0.5811887 (7)	total: 1s	remaining: 2.76s
8:	learn: 0.6083665	test: 0.6045356	best: 0.6045356 (8)	total: 1.17s	remaining: 2.72s
9:	learn: 0.6321035	test: 0.6260033	best: 0.6260033 (9)	total: 1.32s	remaining: 2.65s
10:	learn: 0.6510945	test: 0.6451140	best: 0.6451140 (10)	total: 1.49s	remaining: 2.57s
11:	learn: 0.6692467	test: 0.66

<catboost.core.CatBoostClassifier at 0x7f1fb4440090>

### Logistic Regression

In [76]:
log_reg_params = {
    'C': 0.85,
    'max_iter': 120,
    'penalty': 'l2'
}
lr_classifier = train_model('logistic_regression', log_reg_params, train_vec, train_lab)

In [77]:
lr_classifier.score(test_vec, test_lab)

0.9759842018091477

In [78]:
dump(lr_classifier, 'classification_model_logreg.joblib')

['classification_model_logreg.joblib']