In [None]:
!pip install nltk spacy
!python -m spacy download en_core_web_sm
!python -m spacy download ru_core_news_sm

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import spacy
import pandas as pd
from collections import Counter

import re

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m110.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting ru-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.8.0/ru_core_news_sm-3.8.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m112.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installatio

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Paths to the two review corpora. They live in their own names so that
# `text_en` / `text_ru` only ever hold file contents, never a file name.
path_en = "ОТЗЫВЫ ПАРИЖ.txt"
path_ru = "ОТЗЫВЫ ВЛАДИВОСТОК.txt"


def read_text_file(path):
    """Read a UTF-8 text file, print its length and return its contents.

    Args:
        path: Path of the file to read.

    Returns:
        The whole file as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as file:
        text = file.read()

    # The message is user-facing output and intentionally stays in Russian.
    print(f"Файл успешно прочитан. Длина текста: {len(text)} символов")
    return text


text_en = read_text_file(path_en)
text_ru = read_text_file(path_ru)

Файл успешно прочитан. Длина текста: 31131 символов
Файл успешно прочитан. Длина текста: 27708 символов


In [None]:
# Stop-word lists from the NLTK corpus, converted to one set per language.
russian_stopwords, english_stopwords = (
    set(stopwords.words(language)) for language in ('russian', 'english')
)

def normalize_text(text):
    """Clean raw review text and return it as a single lower-cased line.

    Steps, in order:
      1. Replace HTML/XML-like tags, URLs (``http(s)://...`` and ``www....``)
         and parenthesised fragments with a space.
      2. Lower-case the text and fold 'ё' into 'е'.
      3. Replace every character that is neither a word character nor
         whitespace with a space. A hyphen survives only when both of its
         neighbours are word characters (``well-known``, ``a-b-c``).
      4. Collapse whitespace runs into one space and strip the ends.

    Args:
        text: The raw text (str).

    Returns:
        The normalized text (str).
    """
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'\([^)]*\)', ' ', text)
    text = text.lower()
    text = text.replace('ё', 'е')

    # One pass removes punctuation and stray hyphens. Lookarounds are used
    # because they do not consume the neighbouring characters: a consuming
    # pattern such as (\w)-(\w) skips the second hyphen in "a-b-c", since the
    # "b" was already eaten by the first match.
    text = re.sub(r'[^\w\s-]|(?<!\w)-|-(?!\w)', ' ', text)

    text = re.sub(r'\s+', ' ', text).strip()

    return text


# Run both corpora through the same cleaning routine (English first, then Russian).
normalized_text_en, normalized_text_ru = map(normalize_text, (text_en, text_ru))

In [None]:
# Load the small spaCy pipelines: English first, then Russian.
nlp_en, nlp_ru = [
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "ru_core_news_sm")
]

In [None]:
def extract_entities(text, nlp, lang, labels=None):
    """Return the named entities of ``text`` whose label is in ``labels``.

    Args:
        text: Text to analyse.
        nlp: Callable that takes the text and returns an object with an
            ``ents`` iterable; each entity must expose ``text`` and ``label_``.
        lang: Language tag of the text. Not used by the function; kept so
            existing calls keep working.
        labels: Optional iterable of entity labels to keep. Defaults to
            ORG, GPE, LOC, DATE, TIME, MONEY and FAC.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in document order,
        duplicates included.
    """
    if labels is None:
        wanted = {"ORG", "GPE", "LOC", "DATE", "TIME", "MONEY", "FAC"}
    else:
        # Accept any iterable (list, tuple, generator) and make lookups cheap.
        wanted = set(labels)

    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in wanted]

# NOTE(review): NER runs on the normalized text, which has been lower-cased and
# stripped of punctuation. spaCy's NER models presumably rely on both cues, so
# running on the raw `text_en` / `text_ru` may give better entities - confirm.
en_entities = extract_entities(normalized_text_en, nlp_en, "en")
ru_entities = extract_entities(normalized_text_ru, nlp_ru, "ru")

# Unique (text, label) pairs. They are sorted because plain set iteration order
# depends on string hashing, which changes between kernel sessions.
only_en = sorted(set(en_entities))
only_ru = sorted(set(ru_entities))

# Column names are user-facing output and intentionally stay in Russian.
en_entities_df = pd.DataFrame(only_en, columns=["Слово", "Сущность"])
ru_entities_df = pd.DataFrame(only_ru, columns=["Слово", "Сущность"])

# How often each entity label occurs (duplicates counted).
freq_en = Counter(label for _, label in en_entities)
freq_ru = Counter(label for _, label in ru_entities)

# most_common() orders the rows from the most to the least frequent label.
freq_en_df = pd.DataFrame(freq_en.most_common(), columns=["Сущность", "Частотность"])
freq_ru_df = pd.DataFrame(freq_ru.most_common(), columns=["Сущность", "Частотность"])

print("Английские сущности:")
print(en_entities_df)
print("\nРусские сущности:")
print(ru_entities_df)
print("\nЧастотность английских сущностей:")
print(freq_en_df)
print("\nЧастотность русских сущностей:")
print(freq_ru_df)

Английские сущности:
                             Слово Сущность
0             the next four nights     DATE
1                       8 year old     DATE
2             the christmas season     DATE
3                         6 second     TIME
4               luxembourg gardens      ORG
5                            paris      GPE
6                 bristol paris 23      ORG
7                  rue de faubourg      FAC
8             about a three minute     TIME
9                     the last day     DATE
10                       six-night     TIME
11                    only minutes     TIME
12                     last minute     TIME
13                           a day     DATE
14                     the morning     TIME
15                  9 30 both days     DATE
16        foundation louis vuitton      ORG
17                          nights     TIME
18                            2025     DATE
19                          hilton      GPE
20                            2020     DATE
21         