In [17]:
import pandas as pd, os
from tqdm import tqdm
import numpy as np

In [18]:
# считаем данные

df = pd.read_csv("sample-data.csv")

In [19]:
df

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."
...,...,...
495,496,Cap 2 bottoms - Cut loose from the maddening c...
496,497,Cap 2 crew - This crew takes the edge off fick...
497,498,All-time shell - No need to use that morning T...
498,499,All-wear cargo shorts - All-Wear Cargo Shorts ...


In [68]:
# выделим текст из колонки 'description'
train_text_df = pd.DataFrame({'text': df['description']})
train_text_df.head()

Unnamed: 0,text
0,Active classic boxers - There's a reason why o...
1,Active sport boxer briefs - Skinning up Glory ...
2,Active sport briefs - These superbreathable no...
3,"Alpine guide pants - Skin in, climb ice, switc..."
4,"Alpine wind jkt - On high ridges, steep ice an..."


In [21]:
# представим массив текстов в виде векторов с помощью TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english', binary=True, max_features=25_000)
text_embeddings = tfidf.fit_transform( train_text_df['text'] ).toarray()

In [22]:
text_embeddings 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [78]:
result_dict = {}

In [79]:
# Посчитаем косинусное расстояние между векторами
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarities = cosine_similarity(text_embeddings, text_embeddings)

# Определим морог значимости
threshold = 0.9

# Найдем похожие тексты
similar_texts_indices = np.where(cosine_similarities >= threshold)

# Выводим результат
for idx1, idx2 in zip(*similar_texts_indices):
    if idx1 != idx2:
        # with open(r'D:\python\collab\skillfactory\output.txt', 'a') as f:
        #    f.write(f"Text 1: {idx1} Text 2: {idx2} Cosine similarity: {cosine_similarities[idx1, idx2]}\n")
        
        if idx1 in result_dict:
            result_dict[idx1].append(idx2)
        else:
            result_dict[idx1] = [idx2]

In [80]:
# Индексы товаров с похожими описаниями, для описаний, векторизированных с помощью TF-IDF
result_dict

{3: [158],
 4: [307],
 26: [27, 451],
 27: [26, 451],
 31: [461, 462],
 34: [281],
 36: [480, 481],
 76: [360],
 81: [379],
 84: [196],
 106: [350],
 107: [108, 389],
 108: [107, 389],
 112: [113, 419],
 113: [112],
 117: [118, 349],
 118: [117, 349],
 140: [274],
 152: [153, 354, 366],
 153: [152, 354, 366],
 158: [3],
 178: [472],
 183: [437],
 187: [304],
 196: [84],
 199: [259, 264],
 203: [415],
 209: [363],
 211: [381],
 218: [290],
 221: [241],
 241: [221],
 242: [251],
 243: [252],
 251: [242],
 252: [243],
 254: [261],
 259: [199, 264],
 261: [254],
 262: [377],
 264: [199, 259],
 266: [385],
 267: [386],
 273: [326],
 274: [140],
 281: [34],
 285: [491],
 290: [218],
 292: [404],
 304: [187],
 307: [4],
 326: [273],
 349: [117, 118],
 350: [106],
 354: [152, 153, 366],
 360: [76],
 363: [209],
 366: [152, 153, 354],
 368: [369],
 369: [368],
 377: [262],
 378: [383],
 379: [81],
 381: [211],
 383: [378],
 385: [266],
 386: [267],
 389: [107, 108],
 390: [391],
 391: [390],
 4

In [29]:
# Получим векторы embeddings с помощью word2vec

import nltk
import gensim
import gensim.downloader

embeddings = gensim.downloader.load('word2vec-google-news-300')
nltk.download('stopwords')

docs_vectors = pd.DataFrame()
stopwords = nltk.corpus.stopwords.words('english')




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [69]:
docs_vectors = pd.DataFrame()
temp = pd.DataFrame()

In [70]:
# Исключим стоп слова, для каждого слова, для которого есть embedding - добавим вектор значений в коллекцию

for doc in train_text_df['text'].str.lower().str.replace('[^a-z ]', ''):
    temp = pd.DataFrame()
    for word in doc.split(' '):
        if word not in stopwords and word in embeddings:
            try:
                word_vec = embeddings[word]
                temp = temp.append(pd.Series(word_vec), ignore_index = True)
            except:
                pass
    # усредним значение векторов для одного документа
    doc_vector = temp.mean()
    if not doc_vector.isnull().values.any():
        docs_vectors = docs_vectors.append(doc_vector, ignore_index = True)


  for doc in train_text_df['text'].str.lower().str.replace('[^a-z ]', ''):
  temp = temp.append(pd.Series(word_vec), ignore_index = True)
  docs_vectors = docs_vectors.append(doc_vector, ignore_index = True)


In [73]:
text_embeddings_w2v = docs_vectors.values

In [81]:
result_dict_w2v = {}

In [82]:
# Рассчитаем косинусное расстояние
cosine_similarities = cosine_similarity(text_embeddings_w2v, text_embeddings_w2v)

# Определим порог значимости
threshold = 0.97

# Найдем похожие тексты
similar_texts_indices = np.where(cosine_similarities >= threshold)

# Выводим результат
for idx1, idx2 in zip(*similar_texts_indices):
    if idx1 != idx2:
       # with open(r'D:\python\collab\skillfactory\output_w2v.txt', 'a') as f:
       #    f.write(f"Text 1: {idx1} Text 2: {idx2} Cosine similarity: {cosine_similarities[idx1, idx2]}\n")
       
        if idx1 in result_dict_w2v:
            result_dict_w2v[idx1].append(idx2)
        else:
            result_dict_w2v[idx1] = [idx2]


In [83]:
# Индексы товаров с похожими описаниями, для описаний, векторизированных с помощью Word2Vec
result_dict_w2v

{3: [158],
 4: [307],
 7: [219],
 14: [15, 479, 483],
 15: [14],
 17: [170],
 18: [493],
 19: [339, 487],
 20: [171],
 21: [22, 173, 174, 358, 359, 496],
 22: [21, 174, 358, 359],
 23: [440, 442],
 24: [175],
 26: [27, 451],
 27: [26, 451],
 28: [453],
 31: [461, 462],
 34: [281],
 36: [480, 481],
 41: [420],
 45: [408],
 48: [133],
 49: [438],
 51: [443],
 52: [341],
 57: [62, 63, 64, 431, 432],
 62: [57, 63, 64, 431, 432],
 63: [57, 62, 64, 431, 432],
 64: [57, 62, 63, 431, 432],
 68: [237, 318],
 71: [333],
 72: [238, 334],
 74: [76, 360],
 75: [124],
 76: [74, 360],
 81: [379],
 83: [393],
 84: [196],
 86: [87, 88, 89, 199, 200, 201, 259, 264, 265, 268, 406],
 87: [86, 88, 89, 199, 200, 259, 264, 265, 268, 405, 406],
 88: [86, 87, 89, 201, 265, 268, 405, 406],
 89: [86, 87, 88, 201, 265, 268, 405, 406],
 95: [209, 280],
 96: [208],
 105: [263],
 106: [350],
 107: [108, 389],
 108: [107, 389],
 109: [390, 391],
 112: [113, 397, 419],
 113: [112, 397, 419],
 116: [244, 476],
 117: [1