# Citation project

In [None]:
!pip install sentence_transformers

In [None]:
import pickle
import re
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

## Data Preparation

In [None]:
# Load only the first 450,000 records of the JSON-lines dump.
# `nrows` reads that many lines directly, so no chunked JsonReader is left
# open after the first chunk has been taken.
path = '../input/cndbv13/test1.json'
df = pd.read_json(path, lines=True, nrows=450000)


In [None]:
# Load the pickled label index produced elsewhere.
# NOTE(review): pickle.load can execute arbitrary code during deserialization;
# only load this file if its origin is trusted.
# Presumably a mapping (or a sequence of pairs) from row position to label,
# since it is passed to dict() in the next cell -- confirm against the producer.
with open('../input/labels-idx/idx_labels.pickle', 'rb') as idx_f:
    data_idx = pickle.load(idx_f)

In [None]:
# Normalize the unpickled index into a plain dict, then preview its first
# ten keys (iterating a dict yields its keys in insertion order).
data = dict(data_idx)
list(data)[:10]

In [None]:
# Prepare data: keep only the abstracts whose positions appear as keys of
# `data` (the keys are used as positional indices into the abstract array).
texts_with_empty = df['abstract'].to_numpy()
print(type(texts_with_empty))
texts = texts_with_empty[list(data)]
# Sanity check: one selected abstract per entry of the loaded label index.
assert len(data_idx) == len(texts)

In [None]:
# Pair every selected abstract with its label value, one row per pair.
new_df = pd.DataFrame(
    data=zip(texts, data.values()),
    columns=['abstracts', 'idxs'],
)

In [None]:
# Preview the first five abstract/label rows.
new_df.head(n=5)

## Embedding

In [None]:
# model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
# tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
# model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Sentence encoder used to embed the abstracts.
# NOTE: the name `model` is rebound to the CatBoost classifier further down,
# so this cell must be re-run before encoding again.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [None]:
# Plain Python list of abstracts, the input handed to the encoder.
abstr = new_df['abstracts'].tolist()

In [None]:
# Embed every abstract in batches of 128 with a progress bar; tensor
# conversion of the result is explicitly disabled.
texts_embeddings = model.encode(
    abstr,
    show_progress_bar=True,
    batch_size=128,
    convert_to_tensor=False,
)

In [None]:
#tokenized = abstr.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))   # Very slow!

In [None]:
# Hold the embeddings as a regular ndarray. NumPy discourages np.matrix
# (it may be removed in the future), and pd.DataFrame accepts a plain
# 2-D array just the same.
m = np.asarray(texts_embeddings)

In [None]:
# Build the modelling frame: one column per embedding dimension plus 'labels'.
# NOTE: this rebinds `new_df`, replacing the abstracts/idxs frame built earlier.
#new_df = pd.DataFrame(zip(texts_embeddings, list(data.values())), columns=['vectors', 'labels'])  # I struggled for a long time after writing this mess
labels = list(data.values())
# Validate the lengths BEFORE assigning: previously the assert ran after the
# column assignment, so a mismatch would already have failed (or been masked)
# there and the check could never report it.
assert len(texts_embeddings) == len(labels)
new_df = pd.DataFrame(m)
new_df['labels'] = labels

In [None]:
# Preview the first five rows of the embedding/label frame.
new_df.head(n=5)

In [None]:
# Separate features from the target and hold out 10% of rows for evaluation.
# Features are selected by name rather than by position: the former
# `iloc[:, 0:383]` slice is end-exclusive, so it kept only columns 0..382 and
# silently dropped the last embedding column of a 384-wide frame
# (all-MiniLM-L6-v2 is documented as producing 384-dimensional vectors).
X = new_df.drop(columns='labels')
y = new_df['labels']
assert len(X) == len(y)
# Fixed seed so the split, and therefore the reported scores, are reproducible.
train_vec, test_vec, train_lab, test_lab = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [None]:
# Held-out features and held-out labels must line up row for row.
assert len(test_lab) == len(test_vec)

In [None]:
# import string
# def preprocessing(line):
#     line = line.lower()
#     line = re.sub(r"[{}]".format(string.punctuation), " ", line)
#     return line

## Training

### Catboost

In [None]:
from catboost import CatBoostClassifier, Pool

In [None]:
# Training features and training labels must line up row for row.
assert len(train_lab) == len(train_vec)

In [None]:
# Wrap both splits in CatBoost Pool objects (features + labels).
# NOTE: the fit cell below passes the raw frames, not these pools.
train_dataset = Pool(data=train_vec, label=train_lab)
test_dataset = Pool(data=test_vec, label=test_lab)

In [None]:
# Keyword arguments forwarded to CatBoostClassifier.
# 'train_dir' is set, but 'allow_writing_files' is False -- presumably no
# training artefacts end up on disk; confirm against the CatBoost docs.
model_params = dict(
    iterations=20,
    loss_function='MultiClass',
    train_dir='crossentropy',
    allow_writing_files=False,
    random_seed=42,
    task_type="GPU",
    eval_metric='Accuracy',
)

In [None]:
# Instantiate the CatBoost classifier from the parameter dict defined above.
# NOTE: this rebinds `model`, which previously referred to the
# SentenceTransformer encoder; re-run the encoder cell before embedding again.
model = CatBoostClassifier(**model_params)

In [None]:
# Train on the training split, evaluating on the held-out split as it goes,
# with verbose logging and the interactive plot enabled.
model.fit(
    X=train_vec,
    y=train_lab,
    eval_set=(test_vec, test_lab),
    plot=True,
    verbose=True,
)


### Logistic Regression

In [None]:
# Baseline classifier with scikit-learn's default hyperparameters.
# NOTE(review): warnings are globally silenced at the top of the notebook, so
# a convergence warning from the default iteration budget would not be shown
# -- consider checking convergence explicitly.
lr_clf = LogisticRegression()

In [None]:
# Fit the logistic-regression baseline on the training split.
lr_clf.fit(X=train_vec, y=train_lab)

In [None]:
# Score the fitted baseline on the held-out split.
lr_clf.score(X=test_vec, y=test_lab)