In [11]:
import findspark
findspark.init('/home/ek/spark-2.4.4-bin-hadoop2.7')
import pyspark
import os
java8_location= '/usr/lib/jvm/java-8-openjdk-amd64' # Set your own
os.environ['JAVA_HOME'] = java8_location

In [26]:
import re
import time

In [46]:
# remove some of the punctuation and the linebreaks, lower the text
def preprocess(text):
    text = re.sub('(-\n)|,|\.|@','',text).lower()
    text = re.sub('\n|\t',' ',text).strip()
    if text[-1]=='-':
        text = text[:-1]
    return text

In [80]:
def add_postfix(p):
    (idx , (line , postfix )) = p
    if postfix is not None:
        return line + ' '+postfix
    return line

In [81]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('HW').getOrCreate()
spark.conf.set('spark.sql.shuffle.partitions',6)

In [82]:
data = spark.sparkContext.textFile ('text.txt') \
    .filter ( lambda x: len(x) > 0) \
    .zipWithIndex()\
    .map(lambda x: (x[1],x[0])) # swap the index and the text

In [83]:
data.take(5)

[(0, 'When a distinguished but elderly scientist states that'),
 (1, 'something is possible, he is almost certainly right.'),
 (2, 'When he states that something is impossible, he is very pro-'),
 (3, 'bably wrong.')]

In [84]:
data = data.map(lambda x: (x[0],preprocess(x[1])))

In [85]:
data.take(10)

[(0, 'when a distinguished but elderly scientist states that'),
 (1, 'something is possible he is almost certainly right'),
 (2, 'when he states that something is impossible he is very pro'),
 (3, 'bably wrong')]

In [86]:
k = 9
postfix = data.map( lambda x: (x[0] - 1, x [1][0:k - 2]))
data = data. leftOuterJoin ( postfix ). map( add_postfix )

In [87]:
data.take(5)

['when a distinguished but elderly scientist states that somethi',
 'something is possible he is almost certainly right when he',
 'when he states that something is impossible he is very pro bably w',
 'bably wrong']

In [89]:
shingles = data.flatMap ( lambda l: [l[i:i+k] for i in range(len(l) - k+1)])

In [90]:
shingles.collect()

['when a di',
 'hen a dis',
 'en a dist',
 'n a disti',
 ' a distin',
 'a disting',
 ' distingu',
 'distingui',
 'istinguis',
 'stinguish',
 'tinguishe',
 'inguished',
 'nguished ',
 'guished b',
 'uished bu',
 'ished but',
 'shed but ',
 'hed but e',
 'ed but el',
 'd but eld',
 ' but elde',
 'but elder',
 'ut elderl',
 't elderly',
 ' elderly ',
 'elderly s',
 'lderly sc',
 'derly sci',
 'erly scie',
 'rly scien',
 'ly scient',
 'y scienti',
 ' scientis',
 'scientist',
 'cientist ',
 'ientist s',
 'entist st',
 'ntist sta',
 'tist stat',
 'ist state',
 'st states',
 't states ',
 ' states t',
 'states th',
 'tates tha',
 'ates that',
 'tes that ',
 'es that s',
 's that so',
 ' that som',
 'that some',
 'hat somet',
 'at someth',
 't somethi',
 'something',
 'omething ',
 'mething i',
 'ething is',
 'thing is ',
 'hing is p',
 'ing is po',
 'ng is pos',
 'g is poss',
 ' is possi',
 'is possib',
 's possibl',
 ' possible',
 'possible ',
 'ossible h',
 'ssible he',
 'sible he ',
 'ible

In [20]:
dataset = spark.sparkContext.wholeTextFiles('text.txt')

##### Define function to preprocess the texts

In [22]:
dataset  = dataset.map(lambda x: (x[0],preprocess(x[1])))

###### Define function to return the set of shingles

In [23]:
def shingle(text, n):
    shingles = []
    i = 0
    while (i+n<len(text)):
        shingle = text[i:i+n]
        shingles.append(shingle)
        i+=1
    return set(shingles)

In [24]:
dataset = dataset.map(lambda x: (x[0],shingle(x[1],9)))

In [25]:
dataset.take(1) # check if the shingles are as required

[('file:/home/ek/text.txt',
  {' a distin',
   ' almost c',
   ' but elde',
   ' certainl',
   ' distingu',
   ' elderly ',
   ' he is al',
   ' he is ve',
   ' he state',
   ' impossib',
   ' is almos',
   ' is impos',
   ' is possi',
   ' is very ',
   ' possible',
   ' probably',
   ' right wh',
   ' scientis',
   ' somethin',
   ' states t',
   ' that som',
   ' very pro',
   ' when he ',
   'a disting',
   'ably wron',
   'ainly rig',
   'almost ce',
   'at someth',
   'ates that',
   'bably wro',
   'ble he is',
   'but elder',
   'certainly',
   'cientist ',
   'd but eld',
   'derly sci',
   'distingui',
   'e he is a',
   'e he is v',
   'e is almo',
   'e is very',
   'e states ',
   'ed but el',
   'elderly s',
   'en a dist',
   'en he sta',
   'entist st',
   'erly scie',
   'ertainly ',
   'ery proba',
   'es that s',
   'ething is',
   'g is impo',
   'g is poss',
   'ght when ',
   'guished b',
   'hat somet',
   'he is alm',
   'he is ver',
   'he states',
   'hed but 

In [33]:
now = time.time()
dataset = spark.sparkContext.wholeTextFiles('brd_grundgesetz_63_2019-04-03.txt')
dataset  = dataset.map(lambda x: (x[0],preprocess(x[1])))
dataset = dataset.map(lambda x: (x[0],shingle(x[1],9)))
stop = time.time()
print('total elapsed time: ', stop-now)

total elapsed time:  0.029866695404052734


In [35]:
dataset.collect()

[('file:/home/ek/brd_grundgesetz_63_2019-04-03.txt',
  {'s alter e',
   'e der jäh',
   'ehen völk',
   'ihre unab',
   '29 durch ',
   '6 luftver',
   'nachzuhol',
   'nd födera',
   'i des jah',
   ' als beme',
   'schriftst',
   'ndesrate;',
   'sgerichte',
   'nd gegen ',
   'ungsfrist',
   ' satz 1 e',
   'tragen wi',
   'ses zur ü',
   '(9) als e',
   'ben für d',
   'gebung vo',
   'feinander',
   'em dienst',
   'esvorlage',
   'er und ih',
   'im auftra',
   ' der eink',
   'tziehunge',
   'ikel 26 a',
   ' zu legen',
   ' von sech',
   'timmt sin',
   'andere be',
   'eht den g',
   'isses sin',
   'en petiti',
   'schuß die',
   'g von hau',
   'mai 1945 ',
   'esetz fes',
   ' das saar',
   'hte auf z',
   'änder auf',
   'derwahl i',
   'tz genann',
   't mehr be',
   'o ist dur',
   'zogene re',
   'ch dem 8 ',
   'an dem au',
   'deren ant',
   'treten sp',
   'aren bedü',
   'hoben art',
   ' kulturel',
   'em 1 augu',
   'die polit',
   'b dem jah',
   'straße da',
   

In [36]:
now = time.time()
dataset = spark.sparkContext.wholeTextFiles('brd_grundgesetz_63_2019-04-03.txt')
dataset  = dataset.map(lambda x: (x[0],preprocess(x[1])))
dataset = dataset.map(lambda x: (x[0],shingle(x[1],5)))
stop = time.time()
print('total elapsed time: ', stop-now)

total elapsed time:  0.03359198570251465


In [37]:
dataset.getNumPartitions()

1