In [None]:
# Shell out to pip to install PySpark; -q keeps the installer output quiet.
!pip install -q pyspark

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Spark entry points used by the notebook: `sc` for the RDD API and
# `sqlContext`, which is a SparkSession.
conf = SparkConf().setAppName('SparkWordCount')
sc = SparkContext.getOrCreate(conf=conf)

# NOTE(review): `sc` already exists when the builder runs, so the master,
# app name and UI port requested below may not all take effect -- confirm.
sqlContext = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [None]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK stop-word corpus (NLTK reports when it is already up to date).
nltk.download('stopwords')
from nltk.corpus import stopwords

# Shared Lancaster stemmer; `st.stem(...)` is applied to the filtered tokens
# when the (stem, 1) pairs are built.
st = LancasterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read the gzipped corpus through the SparkContext; each element is one line
# of text (presumably one document per line -- confirm against the data).
input_file = sc.textFile("tiny_wikipedia.txt.gz")

In [None]:
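# Optional: uncomment to run the pipeline on a small random sample of the
# corpus (sampling with replacement, fraction 0.0001) while iterating.
#input_file = input_file.sample(True, .0001)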

In [None]:
from dateutil.parser import parse
import datetime

def _fix_date(date):
  try:
    bool(parse(date))
    dt = parse(date, default=datetime.datetime(300, 1, 1))
    if dt.year != 300:
        date = dt.strftime("%B %d %Y")
    else:
        date = dt.strftime("%B %d")
    return date
  except:
    return date

In [None]:
import re
def remove_special_chars(word):
    """Strip punctuation from ``word``.

    Purely alphabetic input is returned untouched. Anything else has every
    character removed that is neither a regex word character (letter, digit,
    underscore) nor whitespace.
    """
    if word.isalpha():
        return word
    return re.sub(r'[^\w\s]', '', word)

def identify_dates(expression, doc):
    """Tokenize ``doc`` and append a normalized token for every date found.

    Args:
        expression: Regex pattern matching date mentions. The whole match is
            used, so a match of any alternative of the pattern counts.
        doc: Raw document text.

    Returns:
        list[str]: the whitespace-split tokens of ``doc`` followed by one
        ``_fix_date``-normalized string per date match, in match order.
    """
    # Take the full match (group 0) instead of findall()'s first capture
    # group: with an alternation such as "(month date)|(slashed date)" the
    # first group is empty whenever a later alternative matched, which
    # silently dropped those dates.
    dates = [m.group(0) for m in re.finditer(expression, doc) if m.group(0)]

    tokens = doc.split()
    tokens.extend(_fix_date(date) for date in dates)
    return tokens

In [None]:
# Date recognition patterns.

# Month name with an upper- or lower-case initial (capture group).
month = (
    r"([Jj]anuary|[Ff]ebruary|[Mm]arch|[Aa]pril"
    r"|[Mm]ay|[Jj]une|[Jj]uly|[Aa]ugust"
    r"|[Ss]eptember|[Oo]ctober|[Nn]ovember|[Dd]ecember)"
)
# A 1-4 digit number with an optional ordinal suffix, then an optional
# 4-digit year.
day_and_year = r"\s(\d{1,4})(?:st|nd|rd|th)?,?\s?(\d{4})?"
# Variant without ordinal-suffix support, kept for reference:
#day_and_year = r"\s(\d{1,4}),?\s?(\d{4})?"

expression = "(" + month + day_and_year + ")"

# Numeric dates written with slashes, e.g. 3/4/2020 or 03/04/20.
slashed_dates = r"\b(\d{1,2})/(0?\d{1,2}|1[0-2])/(?:\d{2}|\d{4})\b"
slashed_expression = "(" + slashed_dates + ")"

# Either form; each alternative is wrapped in its own outer capture group.
date_expression = "|".join((expression, slashed_expression))



In [None]:
# NLTK's English stop words, extended with the empty string (what a
# punctuation-only token becomes after cleaning) and 'also'.
stop_words = stopwords.words("english") + ['', 'also']

In [None]:
def _content_words(line):
    """Return the cleaned, stop-word-filtered tokens of one input line.

    Tokens come from ``identify_dates`` (so normalized dates are included);
    the first token is skipped (presumably a document id or title -- confirm).
    Each remaining token is lower-cased and stripped of punctuation before
    the stop-word check.
    """
    kept = []
    for token in identify_dates(date_expression, line)[1:]:
        cleaned = remove_special_chars(token.lower())
        if cleaned not in stop_words:
            kept.append(cleaned)
    return kept

# One list of cleaned tokens per input line.
all_stems = input_file.map(_content_words)

# Same documents, with every token stemmed and paired with a count of 1.
all_stems_kv = all_stems.map(lambda words: [(st.stem(w), 1) for w in words])

In [None]:
# Corpus-wide counts: flatten the per-document (stem, 1) pairs and sum the
# ones for each stem.
global_freq = (
    all_stems_kv
    .flatMap(lambda pairs: pairs)
    .reduceByKey(lambda left, right: left + right)
)

In [None]:
# Document frequency: de-duplicate the (stem, 1) pairs inside each document
# so a stem counts once per document, sum per stem, then sort by stem.
doc_freq = (
    all_stems_kv
    .flatMap(lambda pairs: set(pairs))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda entry: entry[0])
)

In [None]:
# Pair every stem with its position in doc_freq's key-sorted order,
# giving (stem, index) records.
alphabetized = doc_freq.keys().zipWithIndex()

In [None]:
# Write the vocabulary: one stem per output line, in index order.
dictionary = alphabetized.map(lambda entry: str(entry[0]))
dictionary.saveAsTextFile("dictionary.txt")

In [None]:
# Join the three per-stem tables on the stem, giving records shaped
# (stem, ((index, doc_freq), global_freq)).
unigram = alphabetized.join(doc_freq).join(global_freq)

In [None]:
def _format_unigram(record):
    """Render one joined record as ``"index stem doc_freq global_freq"``."""
    stem, ((index, doc_count), total_count) = record
    return f"{index} {stem} {doc_count} {total_count}"

# Order by descending corpus-wide count, then serialize each record.
# Note: this rebinds `unigram`, so the cell cannot be re-run on its own;
# re-run the join that defines `unigram` first.
unigram = unigram.sortBy(lambda record: -record[1][1]).map(_format_unigram)

unigram.saveAsTextFile("unigrams.txt")