In [107]:
%pip install numpy scikit-learn spacy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [108]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from pathlib import Path

In [152]:
class Document:
    text = ""
    tokens = ""
    tokens_tagged = []
    _stopwords = []

    def __init__(self, text="", user_stopwords=stopwords.words("english")):
        self.text = text

        self._stopwords = user_stopwords

        self.tokens = word_tokenize(text)
        self.tokens = [w for w in self.tokens if not w in self._stopwords]
        self.tokens = [w for w in self.tokens if w.isalpha()]


    def vectorize(self):
        count_vectorizer = CountVectorizer(stop_words=self._stopwords)
        word_matrix = count_vectorizer.fit_transform(self.tokens)
        return word_matrix.todense()


    def cos_similarity(self, *args):
        count_vectorizer = CountVectorizer(stop_words=self._stopwords)
        documents = [self.text, *args]
        documents_matrix_sp = count_vectorizer.fit_transform(documents)
        documents_matrix = documents_matrix_sp.todense()

        return cosine_similarity(X=np.asarray(documents_matrix), Y=None)





In [153]:
stopwords_ru = stopwords.words("russian")
stopwords_ru.extend(["это"])

In [154]:
text = \
"""
Ехал Грека через реку. 
Видит Грека в реке рак. 
Сунул Грека руку в реку. 
Рак за руку Греку цап!
"""

text2 = \
"""
Ехал грека через реку
Видит грека в реке гроб
И коли век у человека
Лишь один, судьбины рок
Сунул грека руку в реку
Реку боли, реку зла
Бафомета видит грека
То скелета, то козла
"""
# ​pyrokinesis - докучные сказки докучные сказки докучные

In [155]:
dt = Document(text, user_stopwords=stopwords_ru)

In [156]:
dt.vectorize()

matrix([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int64)

In [157]:
display(dt.cos_similarity(text2))

array([[1.        , 0.72057669],
       [0.72057669, 1.        ]])

In [178]:
p = Path("./")
book_paths = p.glob("**/*.txt")

books = {}

for bp in book_paths:
    books[bp.name] = bp.read_text(encoding="utf-8-sig")


# Косинусное сходство между текстами Пелевина

In [179]:
ft = list(books.keys())[0]
d = Document(books.pop(ft))

cos_similiarity_df = pd.DataFrame(
    d.cos_similarity(*books.values()),
    columns=[ft, *books.keys()],
    index=[ft, *books.keys()]
)

display(cos_similiarity_df)

Unnamed: 0,Chapaev_and_the_Void.txt,Empire_V.txt,Generation_P.txt,KGBT_plus.txt,Omon_Ra.txt,Secret_Views_of_Mount_Fuji.txt,Yellow_Arrow.txt
Chapaev_and_the_Void.txt,1.0,0.953716,0.926302,0.93007,0.947532,0.918311,0.885132
Empire_V.txt,0.953716,1.0,0.91053,0.96154,0.931368,0.943551,0.859742
Generation_P.txt,0.926302,0.91053,1.0,0.890949,0.911681,0.884061,0.857258
KGBT_plus.txt,0.93007,0.96154,0.890949,1.0,0.925478,0.952613,0.833585
Omon_Ra.txt,0.947532,0.931368,0.911681,0.925478,1.0,0.905192,0.864902
Secret_Views_of_Mount_Fuji.txt,0.918311,0.943551,0.884061,0.952613,0.905192,1.0,0.834168
Yellow_Arrow.txt,0.885132,0.859742,0.857258,0.833585,0.864902,0.834168,1.0


Самое большое сходство — между "Ампир V" и "КГБТ+"

In [225]:
biggest_sim = {"titles" : ("", ""), "value" : 0}


for index, r in cos_similiarity_df.iterrows():
    for column, v in r.items():
        if biggest_sim["value"] < v and v < 0.9999999999: # Not 1.0 to work around raounding errors
            biggest_sim["value"] = v
            biggest_sim["titles"] = (index, column)

display(biggest_sim)

{'titles': ('Empire_V.txt', 'KGBT_plus.txt'), 'value': 0.9615402691661951}