In [7]:
!pip install spacy scipy deep_translator
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [8]:
import spacy
from deep_translator import GoogleTranslator
from scipy.spatial.distance import cosine
import sys

# Mount Google Drive so that the corpus file under /content/drive is readable.
from google.colab import drive
drive.mount('/content/drive')

# --- Configuration ---------------------------------------------------------
MODEL_NAME = "en_core_web_lg"
FILE_PATH = '/content/drive/MyDrive/Project/Texts.txt'
TOP_N = 5  # number of nearest entities printed for each query
# Entity labels that are skipped when the entity index is built.
NOISE_LABELS = [
    "CARDINAL",
    "DATE",
    "TIME",
    "PERCENT",
    "MONEY",
    "QUANTITY",
    "ORDINAL",
]

# --- Load the spaCy pipeline -----------------------------------------------
# spacy.load raises OSError when the named package cannot be found/loaded;
# in that case report it and stop the cell.
try:
    nlp = spacy.load(MODEL_NAME)
    print(f"Модель '{MODEL_NAME}' загружена.")
except OSError:
    print("Неправильное имя модели.")
    sys.exit(1)

# --- Read the corpus -------------------------------------------------------
try:
    with open(FILE_PATH, 'r', encoding='utf-8') as corpus_file:
        TEST_TEXT = corpus_file.read()
except FileNotFoundError:
    print(f"Файл {FILE_PATH} не найден.")
    sys.exit(1)
else:
    print(f"Файл прочитан. Длина текста: {len(TEST_TEXT)} символов.")
    # Raise the pipeline's input-length limit just above the corpus size so
    # the whole text can be passed to `nlp` in a single call.
    nlp.max_length = len(TEST_TEXT) + 100

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Модель 'en_core_web_lg' загружена.
Файл прочитан. Длина текста: 1879551 символов.


In [None]:

print("Анализ текста...")
# NOTE(review): the whole corpus is parsed as one Doc; this is presumably
# memory-heavy for large files — consider chunking if it becomes a problem.
doc = nlp(TEST_TEXT)

# Index of distinct entity surface forms -> {"vector", "label"}.
# The first occurrence of a given text wins; later duplicates are ignored.
entities_db = {}
for span in doc.ents:
    # Numeric / temporal entity types are treated as noise.
    if span.label_ in NOISE_LABELS:
        continue

    surface = span.text
    # Keep only entities longer than two characters that have a vector
    # and have not been recorded yet.
    if len(surface) <= 2 or surface in entities_db or not span.has_vector:
        continue

    entities_db[surface] = {"vector": span.vector, "label": span.label_}

print(f"Анализ завершен. Уникальных сущностей: {len(entities_db)}")
print("-" * 50)

# Interactive nearest-entity search.
#
# Repeatedly reads an English word/phrase from the user, embeds it with the
# loaded spaCy pipeline, ranks every entry of `entities_db` by cosine
# similarity to that embedding and prints the TOP_N best matches together
# with a Russian translation. Typing 'exit' (any case) ends the loop.

# Build the translator once instead of once per printed result. Translation is
# best-effort: if the translator cannot be created, results are still shown
# with the "[ошибка]" placeholder.
try:
    translator = GoogleTranslator(source='en', target='ru')
except Exception:
    translator = None

while True:
    try:
        user_input = input("\nВведите английское слово для поиска (или 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the kernel was interrupted while waiting for
        # input: finish the search cleanly instead of leaving a traceback.
        print("Поиск окончен")
        break

    if user_input.lower() == 'exit':
        print("Поиск окончен")
        break

    # Ignore empty submissions and ask again.
    if not user_input:
        continue

    user_doc = nlp(user_input)
    # A zero vector norm means there is no usable embedding for the query.
    if not user_doc.vector_norm:
        print(f"В базе модели отсутствует слово'{user_input}'")
        continue

    user_vector = user_doc.vector

    # scipy's `cosine` returns a distance, so similarity is 1 - distance.
    results = [
        {
            "text": text,
            "label": data["label"],
            "score": 1 - cosine(user_vector, data["vector"]),
        }
        for text, data in entities_db.items()
    ]

    # Most similar first.
    results.sort(key=lambda entry: entry["score"], reverse=True)

    print(f"\nБлижайшие сущности к слову '{user_input}':")
    for item in results[:TOP_N]:
        if translator is None:
            translation = "[ошибка]"
        else:
            try:
                translation = translator.translate(item['text'])
            except Exception:
                # Only ordinary errors (network, API, bad input) fall back to
                # the placeholder; KeyboardInterrupt/SystemExit propagate so
                # the cell can still be interrupted.
                translation = "[ошибка]"

        print(f"  - {item['text']} ({translation}) | Тип: {item['label']} | Сходство: {item['score']:.4f}")

Анализ текста...
Анализ завершен. Уникальных сущностей: 2001
--------------------------------------------------
