In [1]:
from pyspark import SparkConf, SparkContext
sc = SparkContext(conf=SparkConf().setAppName("MyApp").setMaster("local"))

import re
from math import log
from __future__ import division

def parse_article(line):
    try:
        article_id, text = unicode(line.rstrip()).split('\t', 1)
        text = re.sub("^\W+|\W+$", "", text, flags=re.UNICODE)
        words = re.split("\W*\s+\W*", text, flags=re.UNICODE)
        return words
    except ValueError as e:
        return []
    
def make_bigrams(words):
    bigrams = []
    
    for i, word in enumerate(words[:-1]):
        bigrams.append((u'{}_{}'.format(word, words[i + 1]), 1))
    
    return bigrams

def words_from_bigram(bigram_struct):
    words = []
    parts = bigram_struct[0].split('_')
    
    for part in parts:
        words.append((u'{}'.format(part), 1))
        
    return words

def npmi(bigram_struct, word_count, wc, bc):
    bigram = bigram_struct[0]
    bigram_count = bigram_struct[1]
    
    p_bigram = bigram_count / bc
    
    first, second = bigram.split('_')
    p_a = word_count[first] / wc
    p_b = word_count[second] / wc
    
    pmi = log(p_bigram / (p_a * p_b))
    
    return (((-1) * pmi / log(p_bigram)), bigram)
        
with open('/datasets/stop_words_en.txt', 'r') as f:
    stop_words = set(f.read().split())
    
wiki = sc.textFile('/data/wiki/en_articles_part/articles-part', 16).map(parse_article)

In [2]:
source = wiki.map(lambda article: [word.lower() for word in article if word.lower() not in stop_words])

In [3]:
bigrams = source\
    .flatMap(make_bigrams)\
    .reduceByKey(lambda x, y: x + y)\
    .filter(lambda (bigram, count): count >= 500)

In [4]:
needle_words = bigrams.flatMap(words_from_bigram).reduceByKey(lambda x, y: 1).collect()

words = []
for word, _ in needle_words:
    words.append(word)

In [5]:
needle_words = source\
    .flatMap(lambda article: [(word, 1) for word in article if word in words])\
    .reduceByKey(lambda x, y: x + y)\
    .collect()
    
word_count = dict()
for word, count in needle_words:
    word_count[word] = count
    
wc = wiki.map(lambda article: len(article)).reduce(lambda x, y: x + y)
bc = wc - wiki.count();

In [6]:
result = bigrams\
    .map(lambda bigram_struct: npmi(bigram_struct, word_count, wc, bc))\
    .sortByKey(False)\
    .take(39)

los_angeles
external_links
united_states
prime_minister
san_francisco
et_al
new_york
supreme_court
19th_century
20th_century
references_external
soviet_union
air_force
baseball_player
university_press
united_kingdom
roman_catholic
references_reading
notes_references
award_best
north_america
new_zealand
civil_war
world_war
catholic_church
war_ii
south_africa
took_place
roman_empire
united_nations
american_singer-songwriter
high_school
american_actor
american_actress
american_baseball
york_city
american_football
years_later
north_american


In [None]:
for _, bigram in result:
    print(bigram)