In [1]:
from pyspark import SparkConf, SparkContext
sc = SparkContext(conf=SparkConf().setAppName("MyApp").setMaster("local"))

import re

def parse_article(line):
    try:
        article_id, text = unicode(line.rstrip()).split('\t', 1)
        text = re.sub("^\W+|\W+$", "", text, flags=re.UNICODE)
        words = re.split("\W*\s+\W*", text, flags=re.UNICODE)
        return words
    except ValueError as e:
        return []

In [None]:
wiki = sc.textFile("/data/wiki/en_articles_part/articles-part", 16).map(parse_article)
result = wiki.take(1)[0]

In [2]:
for word in result[:10]:
    print word

Anarchism
Anarchism
is
often
defined
as
a
political
philosophy
which


In [14]:
#count the words
num_words = wiki.count()
print(num_words)

4100


In [3]:
#transform them to lowercase
wiki_flat = wiki.flatMap(lambda x: [x[i].lower() for i in range(0,len(x))])
result = wiki_flat.take(10)
for word in result[:10]:
    print word

anarchism
anarchism
is
often
defined
as
a
political
philosophy
which


In [7]:
#filter out stopwords
stop_words = sc.textFile("/datasets/stop_words_en.txt", 16)
stop_words_data = stop_words.collect()
broadcast_var = sc.broadcast(stop_words_data)
wiki_filtered = wiki_flat.filter(lambda x: x not in broadcast_var.value)
result = wiki_filtered.take(10)
for word in result[:10]:
    print word

anarchism
anarchism
defined
political
philosophy
holds
state
undesirable
unnecessary
harmful


In [11]:
#compute the bigrams
bigrams = wiki_filtered.flatMap(lambda x: [((x[i]+'_'+x[i+1]),1) for i in range(0,len(x)-1)]) \
.reduceByKey(lambda x,y:x+y)
result = bigrams.take(100)
for word in result[:100]:
    print word

(u'0_.', 5963)
(u'p_\xee', 2)
(u'\u1f30_\u03be', 1)
(u'\u6d59_\u6c5f', 1)
(u'-_\u043b', 2)
(u'\u0430_\u043e', 4)
(u'\u056b_\u0579', 2)
(u'\u1f05_\u03b3', 1)
(u'\u9577_\u58fd', 1)
(u'\u9999_\u69df', 1)
(u'c_\u0101', 15)
(u'2_x', 106)
(u'\u043a_\u0440', 24)
(u'\u0938_\u0926', 1)
(u'\u516b_\u8109', 2)
(u'\u03b5_\u03b3', 3)
(u'\xf2_h', 1)
(u'0_n', 34)
(u'\u05e1_\u05d7', 1)
(u'\u0259_/', 1)
(u'a_\u1e37', 2)
(u'\u6247_\u8d1d', 1)
(u'\u516d_\u6deb', 1)
(u'6_\u2154', 5)
(u'\u03bf_\u03b5', 2)
(u'?_e', 1)
(u'\u0902_\u0938', 1)
(u'+_9', 4)
(u'x_\u03b6', 1)
(u'a_\u0107', 15)
(u'\u03c3_a', 2)
(u'2_\u2308', 2)
(u'\u0623_\u0631', 1)
(u'\u0f58_\u0f66', 1)
(u'\u0161_\u0137', 1)
(u'z_\xe0', 35)
(u'p_>', 1)
(u'\u03b2_x', 6)
(u'\u6483_\u3061', 1)
(u'0_^', 111)
(u'\u0643_\u0631', 5)
(u'\u5638_\u8766', 1)
(u'\u016b_i', 1)
(u'\u03c4_z', 1)
(u'\u5c71_\u5927', 2)
(u'\u0937_\u094d', 1)
(u'\u30e3_\u30d1', 1)
(u'\u03bf_\u03d5', 1)
(u'\u2192_(', 1)
(u'\u0445_\u0443', 2)
(u'p_~', 1)
(u's_\u1ed1', 1)
(u'v_\u03b4', 1

In [13]:
#count the bigrams
num_bigrams = bigrams.values().sum()
print(num_bigrams)

40086090
