In [1]:
import re
from math import log

In [2]:
def bigram(x):
    """Yield every pair of adjacent items of ``x`` joined by an underscore.

    ``x`` is a sequence of strings such as a list of words: for
    ``['a', 'b', 'c']`` the generator produces ``'a_b'`` and then ``'b_c'``.
    A sequence with fewer than two items yields nothing.
    """
    # Pair each item with its right-hand neighbour.
    for left, right in zip(x, x[1:]):
        yield left + '_' + right

## Read

In [3]:
# Load the raw article dump as an RDD of text lines.
ARTICLES_PATH = "/data/wiki/en_articles_part"
en_articles_part = sc.textFile(ARTICLES_PATH)

In [4]:
# Turn every raw line into a (key, text) pair.
def _split_article(line):
    """Split ``line`` at its first tab into ``(key, text)``.

    Only the first tab acts as the separator, so the line is scanned once and
    tabs occurring later in the line stay inside the text instead of cutting
    it short.  A line that contains no tab raises ``IndexError``.
    """
    fields = line.split('\t', 1)
    return fields[0], fields[1]

splitted_articles = en_articles_part.map(_split_article)

In [5]:
# Load the English stop-word list as an RDD of text lines.
STOP_WORDS_PATH = "/data/wiki/stop_words_en-xpo6.txt"
stop_words = sc.textFile(STOP_WORDS_PATH)

In [6]:
# Pull the stop words to the driver as a plain Python list.
# NOTE(review): this rebinds `stop_words` from an RDD to a list, so the cell is
# not idempotent - running it a second time fails because a list has no
# collect() method.
stop_words = stop_words.collect()

In [7]:
# Peek at the first five stop words.
stop_words[:5]

[u'a', u'about', u'above', u'across', u'after']

## Задача #1: народные биграммы.

In [46]:
# Lowercase each article's text and replace every character that is not a
# lowercase ASCII letter or a digit with a space.  Only the text (second field
# of the pair) is kept; the key is dropped at this point.
def _normalize_text(article):
    """Return the text of a (key, text) pair, lowercased, with non-[a-z0-9] characters turned into spaces."""
    lowered = article[1].lower()
    return re.sub('[^a-z0-9]', ' ', lowered)

cleaned_articles = splitted_articles.map(_normalize_text)

In [47]:
# Tokenize each article: split on single spaces, trim every piece and keep
# only the non-empty ones - done in one pass over the article.
articles_to_list = cleaned_articles.map(
    lambda text: [token for token in (piece.strip() for piece in text.split(' ')) if token]
)

In [48]:
# Build one list of adjacent-word bigrams ("first_second") per article.
bigrams = articles_to_list.map(lambda words: [pair for pair in bigram(words)])

In [50]:
# Keep only the bigrams whose FIRST word is exactly "narodnaya".
# The trailing underscore in the prefix is essential: bigrams are encoded as
# "first_second", so a bare 'narodnaya' prefix would also accept any first word
# that merely begins with those letters.
filtered_bigrams = bigrams.flatMap(
    lambda pairs: [pair for pair in pairs if pair.startswith('narodnaya_')]
)

In [51]:
# Sample five of the matching bigrams as a sanity check.
filtered_bigrams.take(5)

[u'narodnaya_volya',
 u'narodnaya_volya',
 u'narodnaya_volya',
 u'narodnaya_volya',
 u'narodnaya_volya']

In [52]:
# Count the occurrences of every matching bigram and bring the resulting
# (bigram, count) pairs to the driver as a list.
grouped_bigrams = (
    filtered_bigrams
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda total, increment: total + increment)
    .collect()
)

In [53]:
# Show the collected (bigram, count) pairs.
grouped_bigrams

[(u'narodnaya_gazeta', 1), (u'narodnaya_volya', 9)]

In [54]:
# вывод в лексикографическом порядке
for key, value in sorted(grouped_bigrams, key=lambda x: x[0]):
    print key, '\t', value 

narodnaya_gazeta 	1
narodnaya_volya 	9


## Задача #2: коллокации

### 2.1 Подготовка RDD

In [55]:
# Broadcast the stop words to the executors.  They are shipped as a frozenset
# rather than a list so that a membership test against the broadcast value is
# a hash lookup instead of a linear scan of the whole list.
br_stop_words = sc.broadcast(frozenset(stop_words))

In [56]:
# Drop the stop words from each article's word list.
# The lookup goes through the broadcast handle: referencing the driver-side
# `stop_words` list directly would pickle that list into the task closure and
# leave the broadcast variable unused.
filtered_lists = articles_to_list.map(
    lambda words: [word for word in words if word not in br_stop_words.value]
)

In [57]:
# Flatten the stop-word-free word lists into a single stream of bigrams;
# flatMap consumes the generator returned by bigram() directly.
upd_bigrams = filtered_lists.flatMap(bigram)

In [58]:
# Count how many times each bigram occurs; the result stays an RDD of
# (bigram, count) pairs.
bigram_ones = upd_bigrams.map(lambda pair: (pair, 1))
grouped_bigrams = bigram_ones.reduceByKey(lambda total, increment: total + increment)

In [59]:
# Keep only the bigrams that occur at least MIN_BIGRAM_COUNT times.
# The bound is inclusive: a bigram seen exactly MIN_BIGRAM_COUNT times is kept.
MIN_BIGRAM_COUNT = 500
frequent_bigrams = grouped_bigrams.filter(
    lambda pair_count: pair_count[1] >= MIN_BIGRAM_COUNT
)

In [60]:
# Sample of (bigram, count) pairs that passed the frequency filter.
frequent_bigrams.take(5)

[(u'best_known', 507),
 (u'soviet_union', 905),
 (u'catholic_church', 596),
 (u'1_1', 1180),
 (u'science_fiction', 509)]

### 2.2. NPMI

Общее кол-во слов в тексте

In [61]:
# Flatten the per-article word lists into one RDD of individual words.
flat_words = filtered_lists.flatMap(lambda words: words)

In [62]:
# First five words of the flattened word stream.
flat_words.take(5)

[u'anarchism', u'anarchism', u'defined', u'political', u'philosophy']

In [63]:
# Total number of words that remain after stop-word removal (all articles).
total_number_of_words = flat_words.count()

In [64]:
# Display the total word count.
total_number_of_words

7258563

In [65]:
# Wrap the total word count in a broadcast variable; later cells read it
# through `.value`.
# NOTE(review): this rebinds `total_number_of_words` from an int to a Broadcast
# handle, so the cell is not idempotent - running it twice broadcasts the
# previous handle instead of the number.
total_number_of_words = sc.broadcast(total_number_of_words)

In [66]:
# Number of occurrences of every word, as an RDD of (word, count) pairs.
word_ones = flat_words.map(lambda word: (word, 1))
word_count = word_ones.reduceByKey(lambda total, increment: total + increment)

In [67]:
# Sample of (word, count) pairs.
word_count.take(5)

[(u'fawn', 7),
 (u'biennials', 10),
 (u'vexillifera', 1),
 (u'gai', 3),
 (u'tripolitan', 2)]

In [68]:
# P(a): probability of seeing word "a" in the dataset, i.e. its count divided
# by the total number of words.  Cached because it is joined twice further down.
p_a = word_count.map(
    lambda word_n: (word_n[0], float(word_n[1]) / total_number_of_words.value)
).cache()

In [69]:
# Sample of (word, P(word)) pairs.
p_a.take(5)

[(u'biennials', 1.377683158498452e-06),
 (u'tripolitan', 2.755366316996904e-07),
 (u'vexillifera', 1.377683158498452e-07),
 (u'refreshable', 1.2399148426486068e-06),
 (u'nunnery', 6.88841579249226e-07)]

Общее кол-во пар слов в тексте

In [70]:
# Total number of bigrams built from the stop-word-free word lists
# (all bigrams, not just the frequent ones).
total_number_of_pairs = upd_bigrams.count()

In [71]:
# Display the total bigram count.
total_number_of_pairs

7254463

In [72]:
# Wrap the total bigram count in a broadcast variable; later cells read it
# through `.value`.
# NOTE(review): this rebinds `total_number_of_pairs` from an int to a Broadcast
# handle, so the cell is not idempotent - running it twice broadcasts the
# previous handle instead of the number.
total_number_of_pairs = sc.broadcast(total_number_of_pairs)

In [77]:
# P(ab): probability of seeing words "a" and "b" next to each other, computed
# for the frequent bigrams only as count / total number of bigrams.
p_ab = frequent_bigrams.map(
    lambda pair_n: (pair_n[0], float(pair_n[1]) / total_number_of_pairs.value)
).cache()

In [78]:
# Sample of (bigram, P(ab)) pairs.
p_ab.take(5)

[(u'best_known', 6.988801238630619e-05),
 (u'soviet_union', 0.00012475079134044795),
 (u'catholic_church', 8.215632225293589e-05),
 (u'1_1', 0.0001626584903665509),
 (u'science_fiction', 7.016370474285968e-05)]

In [79]:
# Number of frequent bigrams that go into the NPMI computation.
p_ab.count()

74

Расчет по формуле:
$$ PMI(a,b) = \ln\left(\dfrac{P(ab)}{P(a) \cdot P(b)}\right) $$

$$ NPMI(a,b) = \dfrac{PMI(a,b)}{-\ln P(ab)} $$

In [80]:
# Key every frequent bigram by its first word and join with the word
# probabilities to attach P(a).
# Resulting records: (a, ((bigram, P(ab)), P(a))).
keyed_by_first_word = p_ab.map(lambda entry: (entry[0].split('_')[0], entry))
part_1 = keyed_by_first_word.join(p_a)

In [81]:
# Sample of records shaped (a, ((bigram, P(ab)), P(a))).
part_1.take(5)

[(u'references',
  ((u'references_reading', 7.140432034735031e-05), 0.000561681423719819)),
 (u'references',
  ((u'references_external', 0.00017216987666764582), 0.000561681423719819)),
 (u'u', ((u'u_s', 0.0006227890334543026), 0.0008949429797605945)),
 (u'soviet',
  ((u'soviet_union', 0.00012475079134044795), 0.00034841607078425855)),
 (u'e', ((u'e_g', 0.0002716948173834507), 0.0011101370891180528))]

In [82]:
# Re-key the records by the bigram's second word and join again to attach P(b).
# Resulting records: (b, (((bigram, P(ab)), P(a)), P(b))).
keyed_by_second_word = part_1.map(
    lambda record: (record[1][0][0].split('_')[1], record[1])
)
part_2 = keyed_by_second_word.join(p_a)

In [83]:
# Sample of records shaped (b, (((bigram, P(ab)), P(a)), P(b))).
part_2.take(5)

[(u'songwriter',
  (((u'singer_songwriter', 0.0001564554123440977), 0.00037831179532367496),
   0.00018901812934598764)),
 (u'singer',
  (((u'american_singer', 0.00012516432987527814), 0.0025011837742539397),
   0.00037831179532367496)),
 (u'union',
  (((u'soviet_union', 0.00012475079134044795), 0.00034841607078425855),
   0.0005062985607481811)),
 (u'zealand',
  (((u'new_zealand', 0.00012006402127903884), 0.0026268284783089986),
   0.000123715947633161)),
 (u'africa',
  (((u'south_africa', 8.574032288813107e-05), 0.0008939786015496456),
   0.0002873847068627771))]

In [84]:
def _npmi_record(joined):
    """Turn one ``part_2`` record into a ``(bigram, NPMI)`` pair.

    ``joined`` has the shape ``(b, (((bigram, P(ab)), P(a)), P(b)))``.
    NPMI is ``ln(P(ab) / (P(a) * P(b))) / -ln(P(ab))``.
    """
    ((pair, p_pair), p_first), p_second = joined[1]
    pmi = log(p_pair / (p_first * p_second))
    return pair, pmi / (-log(p_pair))

# Compute NPMI for every frequent bigram and collect the results on the driver.
part_3 = part_2.map(_npmi_record).collect()

In [85]:
# First five (bigram, NPMI) pairs.
part_3[:5]

[(u'singer_songwriter', 0.8776616015819709),
 (u'american_singer', 0.5436193557102699),
 (u'soviet_union', 0.7299103610286892),
 (u'new_zealand', 0.6548905578791357),
 (u'south_africa', 0.6204841894171089)]

In [86]:
TOP = 39

for key, value in sorted(part_3, key=lambda x: x[1], reverse=True)[:TOP]:
    print key, '\t', round(value, 3)

los_angeles 	0.974
external_links 	0.949
united_states 	0.884
prime_minister 	0.882
singer_songwriter 	0.878
san_francisco 	0.855
isbn_978 	0.845
new_york 	0.792
supreme_court 	0.776
19th_century 	0.767
20th_century 	0.76
references_external 	0.734
soviet_union 	0.73
science_fiction 	0.726
isbn_0 	0.707
air_force 	0.699
united_kingdom 	0.689
e_g 	0.689
university_press 	0.686
baseball_player 	0.682
roman_catholic 	0.68
18th_century 	0.68
don_t 	0.674
references_reading 	0.669
notes_references 	0.663
new_zealand 	0.655
978_0 	0.652
award_best 	0.648
north_america 	0.641
civil_war 	0.639
catholic_church 	0.625
south_africa 	0.62
took_place 	0.613
war_ii 	0.613
world_war 	0.61
united_nations 	0.609
roman_empire 	0.607
x_y 	0.606
u_s 	0.561
