In [None]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from pyspark import SparkConf, SparkContext
# SparkSession is provided by the pyspark.sql subpackage; importing it from
# the top-level pyspark package raises ImportError.
from pyspark.sql import SparkSession

# Download the NLTK stop-word corpus, then build the English stop-word set
# and the Porter stemmer shared by the cells below.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Create (or reuse) a Spark session running against the local master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("TextAnalyzer")
    .getOrCreate()
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the text through the SparkContext that belongs to the session
sc = spark.sparkContext
text_file = sc.textFile(
    "/content/drive/MyDrive/Colab Notebooks/pg1399.txt"
)

# 1. Text cleaning
def clean_text(line, stopword_set=None):
    """Split one line of text into lower-cased tokens with stop words removed.

    Punctuation is replaced by a space rather than deleted, so words joined
    by a dash or slash (for example "trademark/copyright") become separate
    tokens instead of being fused into one. Apostrophes are kept until after
    the stop-word check so that contractions such as "don't" can still match
    the stop-word collection; they are removed from the tokens that survive.

    Args:
        line: A single line of raw text.
        stopword_set: Optional collection of lower-case words to drop. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        A list of cleaned tokens in their original order.
    """
    excluded = stop_words if stopword_set is None else stopword_set

    # Map the typographic apostrophe to the ASCII one so both spellings of a
    # contraction are compared against the stop words in the same form.
    text = line.lower().replace("\u2019", "'")
    # Everything that is not a word character, whitespace or an apostrophe
    # becomes a token separator.
    text = re.sub(r"[^\w\s']", " ", text)

    tokens = []
    for raw in text.split():
        # Leading/trailing apostrophes are quotation marks, not part of the word.
        candidate = raw.strip("'")
        if not candidate or candidate in excluded:
            continue
        tokens.append(candidate.replace("'", ""))
    return tokens

# Tokenise every line and flatten the per-line token lists into one RDD
cleaned_rdd = text_file.flatMap(clean_text)

# 2. Count occurrences: pair each word with 1, then add the ones up per word
word_pairs = cleaned_rdd.map(lambda token: (token, 1))
word_counts = word_pairs.reduceByKey(lambda left, right: left + right)

# Collect the (word, count) pairs and order them by descending frequency
sorted_word_counts = sorted(
    word_counts.collect(),
    key=lambda pair: pair[1],
    reverse=True,
)

# 3. Print the 50 most common words (head of the descending list)
print("Top 50 most common words:")
most_common = sorted_word_counts[:50]
for word, count in most_common:
    print(f"{word}: {count}")

# 3. Print the 50 least common words (tail of the descending list)
print("\nTop 50 least common words:")
least_common = sorted_word_counts[-50:]
for word, count in least_common:
    print(f"{word}: {count}")


Top 50 most common words:
said: 2725
levin: 1512
one: 1157
would: 1044
could: 971
vronsky: 769
anna: 739
go: 682
well: 676
know: 671
come: 667
went: 661
alexey: 627
see: 612
kitty: 596
time: 564
thought: 563
felt: 553
dont: 550
stepan: 548
eyes: 547
face: 539
yes: 536
nothing: 530
alexandrovitch: 524
man: 522
though: 521
like: 516
say: 513
arkadyevitch: 510
little: 460
life: 451
without: 429
something: 427
away: 427
im: 425
love: 425
must: 425
still: 425
came: 421
saw: 421
going: 408
never: 407
made: 402
old: 394
knew: 390
hand: 385
long: 369
began: 365
wife: 364

Top 50 least common words:
processing: 1
hypertext: 1
exporting: 1
periodic: 1
notifies: 1
discontinue: 1
1f1: 1
employees: 1
infringement: 1
damaged: 1
virus: 1
codes: 1
disclaim: 1
distributor: 1
1f4: 1
merchantability: 1
fitness: 1
exclusion: 1
maximum: 1
invalidity: 1
void: 1
indemnity: 1
employee: 1
readable: 1
gutenbergtms: 1
goals: 1
ensuring: 1
2001: 1
identification: 1
deductible: 1
809: 1
north: 1
1500: 1
ut: 1
8411

In [None]:
# Helper that stems a single word
def stem_word(word):
    """Return the stem of ``word`` produced by the module-level ``stemmer``."""
    stem = stemmer.stem(word)
    return stem

# 4. Stem every cleaned word
stemmed_rdd = cleaned_rdd.map(stem_word)

# Count the words again, this time keyed by stem
stem_pairs = stemmed_rdd.map(lambda stem: (stem, 1))
stemmed_word_counts = stem_pairs.reduceByKey(lambda left, right: left + right)

# Collect the (stem, count) pairs and order them by descending frequency
sorted_stemmed_word_counts = sorted(
    stemmed_word_counts.collect(),
    key=lambda pair: pair[1],
    reverse=True,
)

# 5. Print the 50 most common stems (head of the descending list)
print("\nTop 50 most common stemmed words:")
most_common_stems = sorted_stemmed_word_counts[:50]
for word, count in most_common_stems:
    print(f"{word}: {count}")

# 5. Print the 50 least common stems (tail of the descending list)
print("\nTop 50 least common stemmed words:")
least_common_stems = sorted_stemmed_word_counts[-50:]
for word, count in least_common_stems:
    print(f"{word}: {count}")

# Shut down the Spark context
sc.stop()



Top 50 most common stemmed words:
said: 2725
levin: 1624
one: 1239
go: 1090
would: 1044
look: 999
could: 971
come: 887
vronski: 857
anna: 821
know: 809
see: 783
say: 714
like: 696
well: 678
kitti: 670
went: 661
thought: 658
time: 650
hand: 645
smile: 631
alexey: 630
face: 588
love: 586
alexandrovitch: 571
eye: 567
feel: 557
man: 556
felt: 553
dont: 550
stepan: 548
arkadyevitch: 547
ye: 536
noth: 532
though: 521
ask: 491
think: 488
get: 485
even: 481
talk: 481
littl: 460
life: 453
want: 453
answer: 432
still: 431
long: 429
without: 429
someth: 427
away: 427
day: 427

Top 50 least common stemmed words:
buddhist: 1
whichi: 1
buddhistswhat: 1
birchtre: 1
meridian: 1
starlight: 1
tactlessli: 1
renam: 1
trademarkcopyright: 1
1d: 1
download: 1
reus: 1
1e2: 1
unlink: 1
detach: 1
1e5: 1
1e6: 1
binari: 1
nonproprietari: 1
proprietari: 1
hypertext: 1
discontinu: 1
1f1: 1
identifi: 1
inaccur: 1
transcript: 1
infring: 1
1f4: 1
violat: 1
maximum: 1
unenforc: 1
void: 1
indemn: 1
modif: 1
readabl: 1
