In this python notebook, a 'WordCounter' class is defined and tested with prepared short text sample.

### 1. Defining the wordCount class

In [1]:
import pandas as pd
from operator import add
import logging

import findspark
findspark.init()

import pyspark

class WordCounter:

    def __init__(self,file_name):
        self.fn = file_name
        
        # create spark context
        self.sc = pyspark.SparkContext.getOrCreate()
        
        logging.info("Word Counter got initialized!")
        
    def process(self):
        f = open(self.fn,"r")
        raw_text = f.readlines()[0]
        
        striped_text = raw_text.strip()
        striped_text_lower_case = striped_text.lower()
        words_list = striped_text_lower_case.split(" ")
        
        # attach number 1 to each word and form a list
        word_1_list= [word_1 for word_1 in map(lambda x: (x,1), words_list)]
        
        # reduce operation with pandas group by
        word_count_df = pd.DataFrame(word_1_list).groupby(0).sum()
        
        # turn pandas data frame into list
        word_count_list = [(word,count[0]) for word, count in zip(word_count_df.index,
                                                                  word_count_df.values.tolist())]
        return word_count_list
    
    def process_with_spark(self):
        data = self.sc.textFile(self.fn)
        word_count_list = data.flatMap(lambda x: x.split(' '))\
                              .map(lambda x: (x.lower(),1))\
                              .reduceByKey(add).collect()
        return word_count_list
        
        
    

## 2. Testing

### 2.1 Create a WordCounter object

In [2]:
file_name = "cv000_tok-11609.txt"
# create a word count class
word_counter = WordCounter(file_name)

### 2.2 Do word count with pandas and native library

In [3]:
word_count_list = word_counter.process()
word_count_list

[('"', 14),
 ('(', 12),
 (')', 12),
 (',', 61),
 ('--', 1),
 ('.', 33),
 ('1974', 1),
 (':', 9),
 (';', 2),
 ('a', 22),
 ('accompanying', 1),
 ('accuses', 1),
 ('achieves', 1),
 ('acted', 2),
 ('action', 1),
 ('actors', 2),
 ('affair', 3),
 ('after', 1),
 ('again', 2),
 ('against', 1),
 ('all', 2),
 ('always', 1),
 ('amongst', 1),
 ('an', 5),
 ('and', 23),
 ('anonymous', 1),
 ('antolek-oresek', 1),
 ('any', 1),
 ('are', 4),
 ('armor', 2),
 ('arms', 1),
 ('arrive', 1),
 ('arthur', 4),
 ("arthur's", 2),
 ('arthurian', 3),
 ('artist', 1),
 ('as', 6),
 ('at', 2),
 ('attempts', 1),
 ('back', 1),
 ('background', 1),
 ('bagpipe', 1),
 ('bagpipes', 1),
 ('balsan', 2),
 ('based', 1),
 ('battle', 3),
 ('battling', 1),
 ('be', 3),
 ('beating', 1),
 ('been', 1),
 ('before', 1),
 ('begins', 1),
 ('bernhard', 3),
 ('better', 1),
 ('between', 1),
 ('bickering', 1),
 ('blood', 1),
 ('bloodshed', 1),
 ('bodies', 1),
 ('boldly', 1),
 ('bothersome', 1),
 ('bresson', 10),
 ("bresson's", 3),
 ('burning', 1

### 2.3 Do word count with Apache Spark

In [4]:
word_count_list_2 = word_counter.process_with_spark()
word_count_list_2

[(u'', 1),
 (u'emotion', 1),
 (u'all', 2),
 (u'shot', 1),
 (u'anonymous', 1),
 (u'tales', 1),
 (u'over', 1),
 (u'thomas', 1),
 (u"film's", 1),
 (u'laura', 2),
 (u'against', 1),
 (u'attempts', 1),
 (u'including', 1),
 (u'battle', 3),
 (u'chivalry', 1),
 (u'questionable', 1),
 (u'based', 1),
 (u'style', 2),
 (u'group', 1),
 (u'burning', 1),
 (u'personal', 2),
 (u'had', 1),
 (u'emphasis', 1),
 (u'better', 1),
 (u'only', 5),
 (u'creaking', 1),
 (u'rhythm', 1),
 (u'master', 1),
 (u'passion', 1),
 (u'might', 1),
 (u'acted', 2),
 (u'kingdom', 2),
 (u'them', 3),
 (u"arthur's", 2),
 (u'sums', 1),
 (u'trees', 1),
 (u'rest', 1),
 (u'they', 2),
 (u'not', 3),
 (u'exact', 1),
 (u';', 2),
 (u'vladimir', 1),
 (u'--', 1),
 (u'notes', 1),
 (u'background', 1),
 (u'arthur', 4),
 (u'through', 1),
 (u'roads', 1),
 (u'ruins', 1),
 (u'heavy', 2),
 (u'troyes', 1),
 (u'retrieve', 1),
 (u'fascinating', 1),
 (u'camelot', 4),
 (u'ench', 1),
 (u'achieves', 1),
 (u'whispering', 1),
 (u'rate', 1),
 (u'value', 1),
 (u