# Ngrams with pyspark

### Create a Spark context

In [5]:
from operator import add

from pyspark import SparkConf, SparkContext

filename = "wiki429MB"

# getOrCreate() hands back the already-running context when this cell is
# executed again in the same kernel; calling SparkContext(...) a second time
# would raise "ValueError: Cannot run multiple SparkContexts at once".
# Note: the application name is only applied when a new context is created.
spark_conf = SparkConf().setAppName("Ngrams with pyspark " + filename)
sc = SparkContext.getOrCreate(spark_conf)

### View Spark context

In [6]:
# A bare expression on the last line of a cell is rendered as the cell's
# output, so this displays the Spark context created above.
sc

### Check that data is there

We are going to use the file `/data/wiki429MB` that has been previously uploaded to HDFS. The file has size $429$MB.

In [7]:
# List the input file in HDFS; -h prints the size in human-readable units.
!hdfs dfs -ls -h /data/wiki429MB

-rw-r--r--   3 datalab hdfs    428.8 M 2020-02-14 08:54 /data/wiki429MB


### Create RDD from file

The second parameter (`minPartitions`, here $80$) is the suggested minimum number of partitions for the resulting RDD.

In [8]:
# Read the HDFS file as an RDD of lines, asking Spark for (at least) 80 partitions.
textFile = sc.textFile("/data/wiki429MB", minPartitions=80)

# Report what was created and how it was actually partitioned.
rdd_type = type(textFile)
partition_count = textFile.getNumPartitions()
print("textFile is of type: {}\nNumber of partitions: {}".format(rdd_type, partition_count))

textFile is of type: <class 'pyspark.rdd.RDD'>
Number of partitions: 80


### Generate trigrams

In [36]:
n = 3


def line_ngrams(line, size=n):
    """Return all n-grams of `size` consecutive tokens found in one line.

    The line is split on whitespace, and the token list is zipped against
    copies of itself shifted by 1 .. size-1 positions, so each resulting
    tuple holds `size` adjacent tokens. Lines with fewer than `size` tokens
    yield an empty list. N-grams never span two lines because every line is
    processed on its own.
    """
    tokens = line.split()
    # zip() already produces tuples, which are hashable and can serve as keys.
    return list(zip(*(tokens[offset:] for offset in range(size))))


ngrams = (
    textFile
    .flatMap(line_ngrams)                           # one record per n-gram
    .map(lambda ngram: (ngram, 1))                  # pair each n-gram with a count of 1
    .reduceByKey(add)                               # sum the counts per distinct n-gram
    .sortBy(lambda pair: pair[1], ascending=False)  # most frequent first
)

In [37]:
# Bring only the ten most frequent n-grams back to the driver, then print
# each one together with its number of occurrences.
top_ten = ngrams.take(10)
for ngram, count in top_ten:
    line = "%s: %i" % (ngram, count)
    print(line)

('one', 'of', 'the'): 27795
('as', 'well', 'as'): 25145
('part', 'of', 'the'): 17984
('the', 'United', 'States'): 17224
('such', 'as', 'the'): 13886
('the', 'end', 'of'): 13878
('a', 'number', 'of'): 12986
('in', 'the', 'United'): 11760
('known', 'as', 'the'): 10172
('end', 'of', 'the'): 9842


### Stop context

Stop the current Spark context to free resources.

In [4]:
# Shut down the Spark context so the resources it holds are released.
sc.stop()