In [None]:
!pip install -q top2vec
!pip install -q umap-learn[plot]
!pip install -q sentence_transformers
!pip install -q top2vec[sentence_encoders]


In [None]:
#load the needed packages and modules 
import pandas as pd
import numpy as np
from datetime import date
import datetime as dt
import IPython.display as display

In [None]:
# Load the source tables into pandas.
# The paths below are absolute; change them to match your own directory layout.
train_data_path = "/content/drive/MyDrive/Books recommendations system /train_transactions_extended.csv"
users_data_path = "/content/drive/MyDrive/Books recommendations system /users.csv"
items_data_path = "/content/drive/MyDrive/Books recommendations system /items.csv"

# All three files are semicolon-separated.
train_data = pd.read_csv(train_data_path, sep=";")
users_data = pd.read_csv(users_data_path, sep=";")
items_data = pd.read_csv(items_data_path, sep=";")

In [None]:
# Show the first five rows of each table.
nl = '\n'
for label, frame in (("train", train_data), ("users", users_data), ("items", items_data)):
    print(f"{label} data sample {nl}{frame.head()}")

In [None]:
# Print a concise summary of the transactions table (columns, dtypes, non-null counts).
train_data.info()

In [None]:
# List the distinct values that occur in the `type` column.
train_data['type'].unique()

In [None]:
# Merge the two spellings of the same `type` value into a single label.
type_aliases = {"скачка": "скачивание"}
train_data['type'] = train_data['type'].replace(type_aliases)

In [None]:
# Data-quality check on the transactions table.
# Summing the boolean mask gives the number of missing values per column;
# printing the mask itself would only dump a frame of True/False cells.
print(train_data.isnull().sum())
# Number of rows that are exact duplicates of an earlier row.
print(train_data.duplicated().sum())

In [None]:
# Aggregate transactions per library card (`chb`): collect the unique document
# ids (`sys_numb`) each reader interacted with.
agg_func_unique = {'sys_numb': ['unique']}
# Create `maped_df` directly from the aggregation (it did not exist before this
# cell, so assigning a column into it raised NameError).
maped_df = train_data.groupby(["chb"]).agg(agg_func_unique)
# The dict-of-lists aggregation yields MultiIndex columns ('sys_numb', 'unique');
# flatten them to the single column name used by the rest of the notebook.
maped_df.columns = ["maped_id"]
maped_df.head(10)

In [None]:
# Prepare the training data: document texts (titles) and their ids.
items_subset = items_data[["sys_numb", "title", "author"]]
# NOTE(review): `train_subset` was never defined in the notebook; it is assumed
# here to be the item catalogue itself - confirm this is the intended corpus.
# Top2Vec requires every document to be a string and every document id to be
# unique, so rows without a title and repeated ids are removed.
train_subset = (
    items_subset
    .dropna(subset=["title"])
    .drop_duplicates(subset="sys_numb")
)
train_title_list = train_subset['title'].astype(str).tolist()
train_title_id_list = train_subset['sys_numb'].tolist()

In [None]:
#Тренировка doc2vec модели
%%time
from top2vec import Top2Vec
model = Top2Vec(documents=train_title_list,document_ids=train_title_id_list, min_count=15,ngram_vocab=True,embedding_model='doc2vec', speed="deep-learn",keep_documents=True,use_corpus_file=True, workers=2, verbose=True)
display.Audio(url="https://ssl.gstatic.com/dictionary/static/pronunciation/2019-10-21/audio/do/done_en_us_1.mp3", autoplay=True)

In [None]:
#Тренировка universal-sentence-encoder-multilingual модели
%%time
from top2vec import Top2Vec
model = Top2Vec(documents=train_title_list,document_ids=train_title_id_list, min_count=15,ngram_vocab=True,embedding_model='universal-sentence-encoder-multilingual', speed="deep-learn",keep_documents=True,use_corpus_file=True, workers=2, verbose=True)
display.Audio(url="https://ssl.gstatic.com/dictionary/static/pronunciation/2019-10-21/audio/do/done_en_us_1.mp3", autoplay=True)

In [None]:
# Save the trained model; the number of discovered topics is embedded in the file name.
num_topics = model.get_num_topics()
model_dump_path = f"/content/drive/MyDrive/teploteh_corpus/top2vec_biblio_lern_doc2vec_t{num_topics}.dump"
model.save(model_dump_path)

In [None]:
#Редуцирование модели
%%time
print (model.get_num_topics())
#reduce_num_topics=input()
reduce_num_topics=1000
print(f"Количество топиков до редукции {model.get_num_topics()}")
model.hierarchical_topic_reduction(int(reduce_num_topics))
print(f"Koличество топиков после редукции {model.get_num_topics(reduced=True)}")
model.save(f"/content/drive/MyDrive/teploteh_corpus/top2vec_biblio_doc2vec_reduced_tr{int(reduce_num_topics)}.dump")
print ("reduced model saved")
display.Audio(url="https://ssl.gstatic.com/dictionary/static/pronunciation/2019-10-21/audio/do/done_en_us_1.mp3", autoplay=True)

In [None]:
# Load a previously saved (topic-reduced) model from disk.
from top2vec import Top2Vec

reduced_model_path = "/content/drive/MyDrive/teploteh_corpus/top2vec_biblio_doc2vec_reduced_tr1000.dump"
model = Top2Vec.load(reduced_model_path)

In [None]:
# Find the documents that are semantically closest to a set of seed documents.
seed_doc_ids = ["RSL01004206702", "RSL01000769304", "RSL01004211574"]
model.search_documents_by_documents(
    doc_ids=seed_doc_ids,
    num_docs=20,
    doc_ids_neg=None,
    return_documents=True,
    use_index=False,
    ef=None,
)

In [None]:
# Search topics by keywords and draw a word cloud for every topic found.
topic_search_result = model.search_topics(
    keywords=["медицина", "зоология"],
    num_topics=5,
    reduced=False,
)
topic_words, word_scores, topic_scores, topic_nums = topic_search_result
for topic_num in topic_nums:
    model.generate_topic_wordcloud(topic_num)

In [None]:
# Look up the words most similar to a keyword and print each with its score.
words, word_scores = model.similar_words(
    keywords=["космос"],
    keywords_neg=[],
    num_words=20,
)
for similar_word, similarity in zip(words, word_scores):
    print(f"{similar_word} {similarity}")

In [None]:
# Build the list of predictions: one [chb, prediction] pair per row of `maped_df`.
# NOTE(review): `pred_by_vector` is not defined in the visible part of the
# notebook - it must exist before this cell runs; confirm where it comes from.
solution = []
for chb, row in maped_df.iterrows():
  #print(chb)
  # Every value stored in the row is passed to `pred_by_vector`; `pr` is
  # overwritten on each pass, so only the result for the last value survives.
  for sys_numb in row:
    pr = pred_by_vector (sys_numb)
  # The append sits outside the inner loop: one entry per `chb`, using the
  # third element of the last `pr`.
  # NOTE(review): presumably pr[2] holds the recommended document ids - verify.
  solution.append([chb,pr[2]])
# Prints the complete list at once (one entry per row of `maped_df`).
print (solution)

Изучение модели


In [None]:
# UMAP (Uniform Manifold Approximation and Projection) for dimensionality reduction.
import pandas as pd
import umap

# NOTE(review): `vecdf` was never defined earlier in the notebook. It is assumed
# here to be the model's document vectors indexed by document id - confirm that
# this is the intended input.
vecdf = pd.DataFrame(model.document_vectors, index=model.document_ids)

_umap = umap.UMAP(n_components=15, n_neighbors=5, min_dist=0.1, metric='cosine',verbose=True)
umapdf = pd.DataFrame(_umap.fit_transform(vecdf), index=vecdf.index)
# `DataFrame.info()` prints its report itself and returns None, so wrapping it
# in `print()` only added a stray "None" line.
umapdf.info()

In [None]:
# HDBSCAN (density-based clustering) applied to the UMAP-reduced vectors.
import hdbscan

_hdbscan = hdbscan.HDBSCAN(min_cluster_size=5)
clusters = _hdbscan.fit_predict(umapdf)
unique_clusters = np.unique(clusters)

# Report how many distinct labels were produced and show up to the first 100.
print(f"{len(unique_clusters)} clusters...")
print(f"Clusters: {unique_clusters[:100]}...")

In [None]:
# Histogram of the cluster labels assigned to the documents.
from plotly import graph_objs

data = [{"type": "histogram", "x": clusters}]
layout = {
    "width": 1000,
    "height": 300,
    "margin": {"l": 0, "t": 0, "r": 0, "b": 0},
}
figure = graph_objs.Figure(data=data, layout=layout)
figure.show()

In [None]:
# Cluster visualisation: scatter the first two UMAP components per cluster.
# Left panel shows every label, right panel leaves out label -1.
from plotly.subplots import make_subplots
ALPHA = 0.5


def _cluster_trace(points, label):
  """Return a scattergl trace spec for one cluster's points."""
  return dict(
      type='scattergl',
      mode='markers',
      x=points[0],
      y=points[1],
      marker=dict(opacity=ALPHA),
      name='cluster#%d' % label,
      text=points.index,
  )


figure = make_subplots(rows=1, cols=2)
for cluster in unique_clusters:
  plotdf = umapdf[clusters == cluster]
  figure.add_trace(_cluster_trace(plotdf, cluster), row=1, col=1)
  if cluster != -1:
    figure.add_trace(_cluster_trace(plotdf, cluster), row=1, col=2)
figure.update_layout(width=1000, height=400, showlegend=False, margin=dict(l=0, t=0, r=0, b=0))
figure.show()