In [6]:
import numpy as np 
import pandas as pd 
from tqdm.notebook import tqdm
import json
from bertopic import BERTopic
from umap import UMAP
import random 
# custom scripts
from text_preprocessing import text_preprocessing
%matplotlib inline

# Number of lines per JSON chunk:
* num of lines in test5.json is 1354309
* num of lines in test4.json is 1000000
* num of lines in test3.json is 1000000
* num of lines in test1.json is 1000000
* num of lines in test2.json is 1000000


In [10]:
# One-off scan of the raw data chunks, kept commented out for reference.
# For every file under `path` it parses each line as JSON, adds the parsed
# object's keys to `all_keys` (assuming each line is a JSON object), and
# prints the number of lines once the file is exhausted. The printed counts
# are the ones listed in the markdown cell above.
# The range(10**7) bound caps the scan at ten million lines per file; a longer
# file would end without printing its count.
# NOTE(review): re-enabling this cell requires `import os`, which the import
# cell does not provide.
# path = 'data/articles_data/'
# chunks = os.listdir(path)
# all_keys = set()
# count = 0
# for elem in chunks:
#     with open(path + elem, 'rb') as f:
#         cnt = 0
#         for _ in range(10**7):
#             try:
#                 all_keys.update(json.loads(next(f)))
#                 cnt += 1
#             except StopIteration:
#                 print(f'num of lines in {elem} is {cnt}')
#                 break
# print(all_keys)

# Abstract preprocessing

In [11]:
# Disabled preprocessing pass: it would replace the second field of every
# entry in `abstracts` with the output of text_preprocessing, in place.
# The training section below loads already-preprocessed abstracts from disk
# instead, so this cell is kept only as a record of how they were produced.
# NOTE(review): this indexes `abstracts` by position, while the cells below
# use it as a dict — presumably it refers to an earlier, list-like version.
# for idx, paper in enumerate(tqdm(abstracts)):
#     abstracts[idx][1] = text_preprocessing(paper[1])

# Model training

In [2]:
# Load the abstracts that were preprocessed and saved earlier.
# The file stores one pickled Python object, so np.load needs allow_pickle
# and returns a 0-d object array; .item() unwraps it.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# were produced locally.
abstracts_archive = np.load('all_data/preprocessed_data_2015_abstracts.npy', allow_pickle = True)
abstracts = abstracts_archive.item()

In [14]:
# Shuffle the abstract ids with a fixed seed so the train/test split derived
# from this order is identical on every run (the UMAP step further down is
# seeded for the same reason). A dedicated Random instance is used so the
# module-level RNG state is left untouched.
idx = list(abstracts)
random.Random(42).shuffle(idx)

In [16]:
# 50/50 split of the shuffled ids: the first half trains the model, the rest
# is held out (the held-out half gets the extra id when the count is odd).
half = len(idx) // 2
train_idx, test_idx = idx[:half], idx[half:]

In [20]:
# Collect the training abstracts in the same order as train_idx.
train_data = [abstracts[key] for key in train_idx]

In [None]:
# UMAP is stochastic; pinning random_state makes the topic model reproducible:
# https://maartengr.github.io/BERTopic/faq.html#why-are-the-results-not-consistent-between-runs
umap_settings = dict(
    n_neighbors = 15,
    n_components = 5,
    min_dist = 0.0,
    metric = 'cosine',
    random_state = 42,
)
umap_model = UMAP(**umap_settings)

model = BERTopic(language = "english", umap_model = umap_model, verbose = True)

# Fit on the training abstracts and keep the two values fit_transform returns,
# then fetch the per-topic summary table for inspection in the next cell.
topics, probs = model.fit_transform(train_data)
topics_info = model.get_topic_info()

Batches:   0%|          | 0/12156 [00:00<?, ?it/s]

In [None]:
# Display the topic summary table fetched with get_topic_info() in the previous cell.
topics_info

In [None]:
# Persist the fitted model to disk.
# NOTE(review): the inference section below loads 'bert_model_350k_2015_new',
# not this file — confirm which saved model is the intended one.
model.save('bert_model_100k_new')

In [None]:
# Store both halves of the split so the same ids can be reloaded later
# (the inference section reads test_idx.npy back).
# NOTE(review): np.save converts the id lists to arrays; presumably a missing
# id ends up as the string "nan" this way, which would explain the "nan"
# checks in later cells — confirm.
np.save("all_data/train_idx.npy", train_idx)
np.save("all_data/test_idx.npy", test_idx)

# Example of model inference

In [3]:
# Reload the held-out ids saved at the end of the training section. The train
# ids are not used in this inference example, so that load stays disabled.
# train_idx = np.load("all_data/train_idx.npy")
test_idx = np.load("all_data/test_idx.npy")

In [4]:
# Convert the loaded array back to a plain Python list.
test_idx = test_idx.tolist()

In [5]:
# Map every held-out id to its preprocessed abstract, skipping the "nan"
# placeholder id. Relies on `abstracts` loaded in the training section.
test_data = {key: abstracts[key] for key in test_idx if key != "nan"}

In [7]:
# Save the {id: abstract} dict. np.save pickles a dict into an object array,
# so it must be read back with allow_pickle=True and .item(), as the loading
# cell below does.
np.save('all_data/test_articles.npy', test_data)

In [12]:
# Inspect the scored training data written by the "Save scored ids and their
# labels" section further down.
# NOTE(review): `tt` is not used by the following cells and is later rebound
# to a small dict — this looks like a leftover inspection cell.
tt = np.load('all_data/scored_train_data_2015.npy', allow_pickle=True)

In [None]:
# Peek at one (id, abstract) pair. dict.items() returns a view that does not
# support indexing (`items()[0]` raises TypeError), so take the first element
# through an iterator instead.
next(iter(test_data.items()))

In [7]:
# Load a previously saved BERTopic model for inference.
# NOTE(review): the training section above saves to 'bert_model_100k_new';
# this path points at a different saved model — confirm that is intended.
bert_model = BERTopic.load('bert_model_350k_2015_new')

In [9]:
# Read back the {id: abstract} dict saved to test_articles.npy above;
# .item() unwraps the 0-d object array that np.load returns for it.
data = np.load("all_data/test_articles.npy", allow_pickle=True).item()

In [14]:
# Split the dict into two parallel lists: keys[i] is the id of values[i].
keys = list(data)
values = list(data.values())

In [16]:
# Score the first ten abstracts. As the output below shows, the result is a
# pair: a list of topic ids and an array of matching probabilities.
res = bert_model.transform(values[:10])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
# Raw transform output: (topic ids, probabilities).
res

([85, -1, -1, 130, -1, 0, -1, 294, -1, 302],
 array([1.        , 0.        , 0.        , 0.65964381, 0.        ,
        1.        , 0.        , 0.55053165, 0.        , 0.72066497]))

In [25]:
# Pair each predicted topic id with its probability for easier reading.
list(zip(res[0], res[1]))

[(85, 1.0),
 (-1, 0.0),
 (-1, 0.0),
 (130, 0.6596438064660028),
 (-1, 0.0),
 (0, 1.0),
 (-1, 0.0),
 (294, 0.5505316454281876),
 (-1, 0.0),
 (302, 0.7206649735701612)]

In [3]:
# Topic summary table of the loaded model (columns Topic, Count, Name, as the
# output below shows); used later to turn topic ids into names.
topics_info = bert_model.get_topic_info()

In [4]:
# Display the topic summary table of the loaded model.
topics_info

Unnamed: 0,Topic,Count,Name
0,-1,191566,-1_student_social_datum_user
1,0,3039,0_dnn_cnn_cifar_deep
2,1,2054,1_indoor_localization_positioning_rss
3,2,1745,2_quantum_qubit_entanglement_classical
4,3,1411,3_convex_gradient_proximal_convergence
...,...,...,...
2517,2516,10,2516_rem_kriging_krige_interpolation
2518,2517,10,2517_evapotranspiration_seb_et_arid
2519,2518,10,2518_tensor_nonnegative_odeco_symmetric
2520,2519,10,2519_carton_sheet_backordere_rework


In [7]:
# upload raw abstracts
test_data = np.load('all_data/abstracts.npy', allow_pickle = True)[10**6 : 10**6 + 20]
# preprocess and score on the go
preds = {}
for idx, paper in tqdm(enumerate(test_data)):
    preprocesed_abstract = text_preprocessing(paper[1])
    topics, prob = bert_model.transform(preprocesed_abstract)
    print(prob)
    preds[paper[0]] = topics_info[topics_info['Topic'] == topics[0]]['Name'].iloc[0]
    
for key, value in preds.items():
    print(key, ' : ', value)

0it [00:00, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[1.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.8893339]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.91513623]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.18314955]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[1.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.80538597]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.97741641]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.67504664]
53e9b06fb7602d9703ad307e  :  -1_graph_problem_design_application
53e9b06fb7602d9703ad3097  :  -1_graph_problem_design_application
53e9b06fb7602d9703ad30c6  :  143_ecg_fetal_heart_qrs
53e9b06fb7602d9703ad30dc  :  -1_graph_problem_design_application
53e9b06fb7602d9703ad3112  :  -1_graph_problem_design_application
53e9b06fb7602d9703ad3125  :  -1_graph_problem_design_application
53e9b06fb7602d9703ad3155  :  -1_graph_problem_design_application
53e9b06fb7602d9703ad31ac  :  -1_graph_problem_design_application
53e9b06fb7602d9703ad31b7  :  604_option_price_american_martingale
53e9b06fb7602d9703ad31e4  :  74_painting_sketch_stroke_paint
53e9b06fb7602d9703ad31f5  :  -1_graph_problem_design_application
53e9b06fb7602d9703ad320f  :  143_ecg_fetal_heart_qrs
53e9b06fb7602d9703ad322c  :  19_grid_resource_job_scheduling
53e9b06fb7602d9703ad326a  :  46_testing_test_suite_coverage
53e9b06fb7602d9703ad326f  :  -1_graph_problem_design_application
53e9b06fb7602d9703ad3280  :  -1_graph_problem_de

P.S. The number at the start of each topic name (e.g. `143` in `143_ecg_fetal_heart_qrs`) is the topic (class) id, so the class can be read directly from the name.

# Save scored ids and their labels

In [10]:
# Print the position and topic id of the first ten entries in the model's
# topics_ attribute.
for position, topic_id in enumerate(bert_model.topics_[:10]):
    print(position, topic_id)

0 2040
1 -1
2 1266
3 -1
4 2191
5 218
6 -1
7 -1
8 -1
9 -1


In [13]:
# Topic id -> topic name lookup built from the model's topic summary table.
topic_table = bert_model.get_topic_info()
topics_dict = dict(zip(topic_table['Topic'], topic_table['Name']))


In [1]:
# Scratch example used by the next cell to show what dict.items() yields.
# NOTE(review): this rebinds `tt`, which an earlier cell used for the loaded
# scored-train array.
tt = {1: 1, 2: 3}

In [4]:
# Each item of dict.items() is a (key, value) tuple.
for pair in tt.items():
    print(pair)

(1, 1)
(2, 3)


In [75]:
# Label every training document with the name of the topic the model assigned
# to it, and save the (id, topic name) pairs.
# Position i of bert_model.topics_ is paired with train_idx[i]; entries whose
# id is the "nan" placeholder are dropped.
# NOTE(review): this assumes `train_idx` is the id order the loaded model was
# fitted on — confirm it matches the model loaded above.
topic_table = bert_model.get_topic_info()
topics_dict = dict(zip(topic_table['Topic'], topic_table['Name']))
result = np.array([
    [train_idx[position], topics_dict[topic_id]]
    for position, topic_id in enumerate(bert_model.topics_)
    if train_idx[position] != "nan"
])
np.save('all_data/scored_train_data_2015.npy', result)

In [76]:
# Read the file written in the previous cell back in as a sanity check.
# NOTE(review): `t` is not used by any later cell shown here.
t = np.load("all_data/scored_train_data_2015.npy")

# Finally save valid papers' ids

In [47]:
# Load the two arrays whose first column is exported to JSON below.
# NOTE(review): `scored_abst` is read from 'preprocessed_abstracts_100k.npy' —
# presumably the 100k abstracts that were scored; confirm.
abst = np.load('all_data/abstracts.npy', allow_pickle = True)
scored_abst = np.load('all_data/preprocessed_abstracts_100k.npy', allow_pickle = True)

In [48]:
# Export the first column of `abst` (presumably the paper ids) to JSON under
# the key 'id', then read the file back to confirm it round-trips.
valid_id = abst[:, 0].tolist()

with open('all_data/valid_id.json', 'w') as f:
    json.dump({'id': valid_id}, f)

# json.load parses the whole file; the earlier json.loads(next(f)) read only
# the first line and worked solely because the dump is written on one line.
with open('all_data/valid_id.json', 'r') as f:
    data = json.load(f)

In [49]:
# Number of ids read back from valid_id.json.
len(data['id'])

3010729

In [50]:
# Export the first column of `scored_abst` (presumably the ids of the scored
# papers) to JSON under the key 'id', then read the file back to confirm it
# round-trips.
scores_id = scored_abst[:, 0].tolist()

with open('all_data/scored_id.json', 'w') as f:
    json.dump({'id': scores_id}, f)

# json.load parses the whole file; the earlier json.loads(next(f)) read only
# the first line and worked solely because the dump is written on one line.
with open('all_data/scored_id.json', 'r') as f:
    scored_data = json.load(f)

In [51]:
# Number of ids read back from scored_id.json.
len(scored_data['id'])

100000