In [1]:
!pip install -q pyspark

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Build ONE configuration and create the session from it. Previously the
# SparkContext was created first, so the master / app name / UI port passed to
# SparkSession.builder afterwards were never applied: the builder reuses an
# existing SparkContext and does not update its configuration.
conf = (
    SparkConf()
    .setMaster("local[*]")            # run locally on all available cores
    .setAppName("SparkWordCount")
    .set("spark.ui.port", "4050")     # UI port the notebook intended to expose
)

# `sqlContext` is really a SparkSession; the name is kept so later cells that
# refer to it keep working.
sqlContext = SparkSession.builder.config(conf=conf).getOrCreate()

# The RDD entry point used by the rest of the notebook.
sc = sqlContext.sparkContext

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer

# Fetch the stop-word corpus that `stopwords.words(...)` reads later on.
nltk.download('stopwords')

# `st` is the stemmer applied to every token further down; `wl` is created
# alongside it but is not used in the visible part of this notebook.
st, wl = LancasterStemmer(), WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Each element of the resulting RDD is one line of the (gzip-compressed) text file.
wiki_path = "tiny_wikipedia.txt.gz"
input_file = sc.textFile(wiki_path)

In [5]:
#input_file = input_file.sample(withReplacement=False, fraction=0.0001, seed=42)

In [6]:
#input_file.collect()[:2]

In [7]:
import re
def remove_special_chars(word):
  """Return `word` with every character that is neither a word character
  (letter, digit, underscore) nor whitespace removed.

  Purely alphabetic input is returned as-is without touching the regex engine.
  """
  if word.isalpha():
    # Nothing to strip: every character is a letter.
    return word
  return re.sub(r'[^\w\s]', '', word)

In [8]:
# English stop words plus the empty string, so that tokens which become empty
# after special-character removal are filtered out together with the stop words.
stop_words = [*stopwords.words("english"), '']

In [9]:
# Snapshot the stop words into a frozenset: the filter below runs once per
# token, and a set lookup is O(1) whereas `in` on the list scans every entry.
_stop_word_set = frozenset(stop_words)


def _clean_tokens(line):
    """Turn one input line into its list of cleaned, non-stop-word tokens.

    The first whitespace-separated field of the line is skipped (presumably a
    document id -- confirm against the data file). Every remaining token is
    lower-cased and passed through `remove_special_chars`; tokens that end up
    in the stop-word set (which includes the empty string) are dropped.
    """
    cleaned = (remove_special_chars(raw.lower()) for raw in line.split()[1:])
    return [token for token in cleaned if token not in _stop_word_set]


# One list of cleaned tokens per input line.
all_stems = input_file.map(_clean_tokens)

# Same shape, but each token is replaced by a (stem, 1) pair ready for counting.
all_stems_kv = all_stems.map(lambda doc: [(st.stem(word), 1) for word in doc])

In [10]:
# Corpus-wide count per stem: flatten the per-line lists of (stem, 1) pairs
# into one stream of pairs, then sum the ones for each stem.
global_freq = (
    all_stems_kv
    .flatMap(lambda pairs: pairs)
    .reduceByKey(lambda left, right: left + right)
)

In [11]:
# Stems ordered by their string value; the position of a stem in this ordering
# is what a later cell uses as its index.
alphabetized_glob_freq = global_freq.sortBy(lambda word: word[0])
alphabetized_words = alphabetized_glob_freq.keys().collect()

# Write one stem per line. The encoding is pinned to UTF-8 because the file is
# read back with sc.textFile, which decodes UTF-8; relying on the platform
# default encoding could corrupt (or fail to write) non-ASCII stems and make
# the later per-word lookup fail.
with open("dictionary.txt", "w", encoding="utf-8") as file:
    file.writelines(f"{word}\n" for word in alphabetized_words)

In [12]:
# Order the (stem, count) pairs from most to least frequent by sorting on the
# negated count, then pull the whole ordering into driver memory as a list.
decreasing_glob_freq = global_freq.sortBy(lambda stem_count: -stem_count[1])
decreasing_tf = decreasing_glob_freq.collect()

In [13]:
# Document frequency per stem: de-duplicate the (stem, 1) pairs within each
# line so a stem counts at most once per line, then sum across all lines.
doc_freq = (
    all_stems_kv
    .flatMap(lambda pairs: set(pairs))
    .reduceByKey(lambda left, right: left + right)
)

#doc_freq.collect()[:10]

In [14]:
dictionary = sc.textFile("dictionary.txt")

# Per-stem statistics, keyed by stem. Each value is a list that ends up as
#   [global frequency, index in dictionary.txt, document frequency].
# Keys are inserted in decreasing order of global frequency.
# NOTE: the name `dict` shadows the builtin; it is kept because the cell that
# writes unigrams.txt reads this variable.
dict = {stem: [total] for stem, total in decreasing_tf}

# Slot 1: the stem's zero-based line number in dictionary.txt.
for idx, word in enumerate(dictionary.collect()):
  dict[word].append(idx)

# Slot 2: the number of input lines in which the stem occurs.
for key, value in doc_freq.collect():
  dict[key].append(value)

#print(dict)

In [15]:
# Emit one line per stem, in the iteration order of `dict`:
#   <dictionary index> |<stem>| <document frequency> <global frequency>
# The encoding is pinned to UTF-8 so non-ASCII stems are written the same way
# on every platform instead of depending on the locale's default encoding.
with open("unigrams.txt", "w", encoding="utf-8") as file:
  for key, stats in dict.items():
    global_count, index, doc_count = stats[0], stats[1], stats[2]
    file.write(f'{index} |{key}| {doc_count} {global_count}\n')