In [1]:
!pip install -q pyspark

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Create (or reuse) the SparkContext that backs every RDD in this notebook.
conf = SparkConf().setAppName('SparkWordCount')
sc = SparkContext.getOrCreate(conf=conf)

# NOTE(review): a SparkContext already exists by the time this builder runs,
# so the master / app name / UI port requested here presumably do not
# reconfigure it -- confirm in the Spark UI if those settings matter.
sqlContext = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer

# Fetch the stop-word corpus used to build `stop_words` (no-op when already present).
nltk.download('stopwords')

# Single Lancaster stemmer instance used by the stemming step.
st = LancasterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Gzipped corpus, loaded as an RDD with one element per line of text.
CORPUS_PATH = "tiny_wikipedia.txt.gz"
input_file = sc.textFile(CORPUS_PATH)

In [5]:
# Optional: uncomment to iterate on a small random sample (with replacement, fraction 0.0001).
#input_file = input_file.sample(True, .0001)

In [6]:
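# Debug aid: peek at the first two lines (input_file.take(2) gives the same result without collecting the whole RDD).
#input_file.collect()[:2]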

In [7]:
from dateutil.parser import parse
import datetime

def _fix_date(date):
  """Rewrite a date string as ``"Month DD YYYY"``, or ``"Month DD"`` if it has no year.

  Args:
    date: Text believed to contain a single date (e.g. ``"march 5th, 1999"``
      or ``"12/25/2020"``).

  Returns:
    The reformatted date, or ``date`` unchanged when it cannot be parsed or
    formatted (best effort: this function never raises for ordinary errors).
  """
  # dateutil fills every field missing from the text from `default`, so a
  # parsed year equal to the sentinel's means the text carried no year.
  # Limitation: an explicit year 300 is indistinguishable from "no year".
  sentinel = datetime.datetime(300, 1, 1)
  try:
    # Parse once, against the fixed sentinel only. A second parse against
    # dateutil's implicit "today" default would make the outcome depend on
    # the day the notebook is run.
    dt = parse(date, default=sentinel)
    if dt.year != sentinel.year:
      return dt.strftime("%B %d %Y")
    return dt.strftime("%B %d")
  except Exception:
    # Deliberately best-effort, but no longer a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate.
    return date

In [8]:
import re
def remove_special_chars(word):
  """Return ``word`` with every character that is neither a word character
  nor whitespace removed.

  Purely alphabetic input is returned as-is without touching the regex.
  """
  if word.isalpha():
    return word
  return re.sub(r'[^\w\s]', '', word)

def identify_dates(expression, doc):
  """Split ``doc`` on whitespace and append a normalised copy of each date found.

  Args:
    expression: Regular expression (string or compiled) that matches dates.
    doc: One document as a single string.

  Returns:
    A list holding the whitespace-separated tokens of ``doc`` followed by one
    ``_fix_date``-normalised string per date matched by ``expression``. The
    original date tokens are kept; the normalised dates are additional.
  """
  # Take the whole match (group 0) instead of capture group 1. With an
  # alternation such as "(month date)|(slashed date)", group 1 is empty
  # whenever the second alternative matched, so those dates used to be
  # dropped. Group 0 also works for patterns with fewer than two groups.
  dates = [m.group(0) for m in re.finditer(expression, doc) if m.group(0)]

  tokens = doc.split()
  tokens.extend(_fix_date(date) for date in dates)
  return tokens

In [9]:
# Recognising dates.
# `month` matches a full English month name with either capitalisation of the
# first letter; the name itself is one capture group.
month = r"([Jj]anuary|[Ff]ebruary|[Mm]arch|[Aa]pril|[Mm]ay|[Jj]une|[Jj]uly|[Aa]ugust|[Ss]eptember|[Oo]ctober|[Nn]ovember|[Dd]ecember)"
# Follows the month: one whitespace character, 1-4 digits with an optional
# ordinal suffix (st/nd/rd/th), an optional comma, optional whitespace and an
# optional 4-digit year. Because the first number allows up to 4 digits,
# "May 1999" also matches, with 1999 landing in the first numeric group.
day_and_year = r"\s(\d{1,4})(?:st|nd|rd|th)?,?\s?(\d{4})?"
# Earlier variant without the ordinal-suffix group:
#day_and_year = r"\s(\d{1,4}),?\s?(\d{4})?"

# Month-name dates. The outer parentheses make the whole date capture group 1;
# month, first number and year are groups 2-4.
expression = f'({month}{day_and_year})'

# Numeric dates written with slashes, e.g. 12/25/2020 or 1/2/99.
slashed_dates = r"\b(\d{1,2})/(0?\d{1,2}|1[0-2])/(?:\d{2}|\d{4})\b"
# The outer parentheses capture the whole slashed date (group 5 of the
# combined pattern below).
slashed_expression = f'({slashed_dates})'

# Either kind of date. Only one alternative matches at a time, so the capture
# groups belonging to the other alternative are empty for that match.
date_expression = expression + r"|" + slashed_expression


In [10]:
# English stop words plus the empty string, so that tokens reduced to nothing
# by punctuation stripping are filtered out together with the stop words.
stop_words = stopwords.words("english") + ['']

In [11]:
# Frozen-set snapshot of the stop words: membership tests are O(1), whereas
# the list form was scanned linearly once for every token in the corpus.
_stop_word_set = frozenset(stop_words)


def _clean_tokens(line):
  """Turn one input line into its list of cleaned, non-stop-word tokens.

  The line is tokenised by ``identify_dates`` (whitespace tokens followed by
  normalised dates), each token is lower-cased and stripped of special
  characters, and stop words / empty strings are dropped.
  """
  # NOTE(review): the first whitespace token of every line is skipped --
  # presumably a document id or title; confirm against the corpus format.
  tokens = identify_dates(date_expression, line)[1:]
  cleaned = (remove_special_chars(token.lower()) for token in tokens)
  return [token for token in cleaned if token not in _stop_word_set]


# One list of cleaned tokens per input line.
all_stems = input_file.map(_clean_tokens)

# Same shape, with every token replaced by a (Lancaster stem, 1) pair so the
# pairs can be summed per stem.
all_stems_kv = all_stems.map(lambda doc: [(st.stem(word), 1) for word in doc])

In [12]:
# Earlier variant of the previous cell without date recognition (plain whitespace split); kept for reference only.
#all_stems = input_file.map(lambda line: \
# [word_lower for word in line.split()[1:] \
#  if (word_lower := remove_special_chars(word.lower())) not in stop_words])

#all_stems_kv = all_stems.map(lambda doc: [(st.stem(word), 1) for word in doc])

In [13]:
# Flatten the per-document lists of (stem, 1) pairs into one stream of pairs,
# then add up the ones per stem: the result is (stem, total occurrences).
global_freq = (
    all_stems_kv
    .flatMap(lambda doc_pairs: doc_pairs)
    .reduceByKey(lambda left, right: left + right)
)

In [14]:
# Sort the (stem, count) pairs by stem and bring just the stems to the driver.
alphabetized_glob_freq = global_freq.sortBy(lambda word: word[0])
alphabetized_words = alphabetized_glob_freq.keys().collect()

# Write one stem per line. The encoding is pinned to UTF-8 instead of the
# platform default: stems can contain non-ASCII characters, and the file is
# read back later with sc.textFile, which expects UTF-8 text.
with open("dictionary.txt", "w", encoding="utf-8") as file:
    file.writelines(word + "\n" for word in alphabetized_words)

In [15]:
# Order the (stem, count) pairs from the most to the least frequent stem;
# negating the count turns the default ascending sort into a descending one.
decreasing_glob_freq = global_freq.sortBy(lambda pair: -pair[1])

# Materialise the ordered pairs on the driver as a list of (stem, count) tuples.
decreasing_tf = list(decreasing_glob_freq.collect())

In [16]:
# Document frequency: de-duplicate the (stem, 1) pairs inside each document so
# a stem counts at most once per document, then sum the ones per stem.
doc_freq = (
    all_stems_kv
    .map(lambda doc_pairs: list(set(doc_pairs)))
    .flatMap(lambda unique_pairs: unique_pairs)
    .reduceByKey(lambda left, right: left + right)
)

# Mark the RDD for caching; as the cell's last expression this also displays it.
doc_freq.cache()

PythonRDD[12] at RDD at PythonRDD.scala:53

In [19]:
# Re-read the alphabetised stem list written to dictionary.txt, one stem per line.
dictionary = sc.textFile("dictionary.txt")

# Per-stem record keyed by stem:
#   [global count, alphabetical index, document frequency]
# NOTE(review): this name shadows the built-in `dict` for the rest of the
# session; it is kept because the cell that writes unigrams.txt reads the
# table under this name.
dict = {}

# Slot 0: total number of occurrences. Insertion order follows decreasing_tf,
# i.e. most frequent stem first. Slots 1 and 2 are filled in below.
for stem, total in decreasing_tf:
  dict[stem] = [total, None, None]

# Slot 1: 0-based line number of the stem in dictionary.txt, which is its
# position in alphabetical order.
for position, stem in enumerate(dictionary.collect()):
  dict[stem][1] = position

# Slot 2: number of documents (input lines) in which the stem occurs.
for stem, n_docs in doc_freq.collect():
  dict[stem][2] = n_docs

#print(dict)

In [20]:
# One line per stem, in the table's insertion order:
#   "<alphabetical index> |<stem>| <document frequency> <global count>"
# UTF-8 is pinned so that stems with non-ASCII characters are written the same
# way regardless of the platform's default encoding.
with open("unigrams.txt", "w", encoding="utf-8") as file:
  for key, (global_count, index, doc_count) in dict.items():
    # A single write per record instead of four partial writes.
    file.write(f'{index} |{key}| {doc_count} {global_count}\n')