In [1]:
!pip install -q pyspark

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Settings for the Spark context used by the indexing job.
conf = (
    SparkConf()
    .setAppName('InvertedIndex')
    .set("spark.dynamicAllocation.enabled", "true")
    .set("spark.executor.memory", "40g")
    .set("spark.executor.cores", "2")
)

# Returns the already-running context when there is one, otherwise starts one.
sc = SparkContext.getOrCreate(conf=conf)

# NOTE(review): a SparkContext exists by the time this builder runs, so the
# master / app name / UI port given here may not take effect -- confirm
# against the PySpark documentation.
sqlContext = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [3]:
from nltk.stem.lancaster import LancasterStemmer

# Shared Lancaster stemmer instance; preprocessing() calls st.stem() on every token.
st = LancasterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive; the input directory read later lives under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os

# Directory (on the mounted Drive) that holds the chunk files to index.
path_to_chunks = '/content/drive/MyDrive/Colab Notebooks/wiki_testing'

# One (file path, whole file content) pair per file in the directory.
text_files_rdd = sc.wholeTextFiles(path_to_chunks)

# File names with the directory part stripped, collected on the driver.
file_names_list = (
    text_files_rdd
    .map(lambda path_and_text: os.path.basename(path_and_text[0]))
    .collect()
)

['wiki2022_small.000008', 'wiki2022_small.000007', 'wiki2022_small.000009']


In [6]:
from dateutil.parser import parse
import datetime
import re

# getting dates in consistent format

def _fix_date(date):
  """Normalise a date string to "<Month> <DD> <YYYY>" or "<Month> <DD>".

  The text is parsed once with ``dateutil.parser.parse`` using a default
  datetime whose year is 300. A parsed year of 300 is therefore taken to mean
  "the input carried no year", and the year is left out of the result.

  Args:
    date: the date text to normalise.

  Returns:
    The reformatted date, or ``date`` unchanged when it cannot be parsed or
    formatted.
  """
  try:
    # A single parse is enough: the earlier extra parse without `default`
    # filled missing fields from today's date, which made the outcome depend
    # on the day the notebook was run.
    dt = parse(date, default=datetime.datetime(300, 1, 1))
    # Year 300 is the sentinel injected through `default` above.
    fmt = "%B %d %Y" if dt.year != 300 else "%B %d"
    return dt.strftime(fmt)
  except Exception:
    # Best effort: keep the raw text on any parsing/formatting failure, but
    # let KeyboardInterrupt / SystemExit propagate (a bare except hid them).
    return date

# getting rid of special characters

def remove_special_chars(word):
  """Delete every character that is neither a word character nor whitespace.

  Input made up only of alphabetic characters is returned as-is without
  running the regular expression.
  """
  if word.isalpha():
    return word
  return re.sub(r'[^\w\s]', '', word)

# creating document ids

def extract_id(doc):
  """Split a text blob into ``(text, id, tag)`` records, one per ``curid=`` entry.

  ``doc[1]`` is scanned for ``curid=<digits>`` followed by text that runs up
  to the next ``https://`` or the end of the string. ``doc[0]`` is copied
  unchanged into every record as its last element.

  Returns:
    A list of ``(stripped text, id digits as a string, doc[0])`` tuples.
  """
  tag, blob = doc[0], doc[1]
  records = []
  for found in re.finditer(r"curid=(\d+)\s+(.+?)(?=\s*https://|\s*$)", blob):
    records.append((found.group(2).strip(), found.group(1), tag))
  return records

# chunk id

def extract_chunk_id(file_path):
  """Return the chunk number given by the trailing digits of ``file_path``.

  The whole run of digits at the end of the path is used, so
  ``'wiki2022_small.000008'`` gives 8 and ``'wiki2022_small.000123'`` gives
  123. Reading only the last two characters would truncate the latter to 23.

  Args:
    file_path: a path or bare file name that ends with the chunk number.

  Returns:
    The chunk number as an int.

  Raises:
    ValueError: if ``file_path`` does not end with a digit.
  """
  # Trailing whitespace is ignored, matching what int() tolerated before.
  trimmed = file_path.rstrip()
  prefix = trimmed.rstrip('0123456789')
  digits = trimmed[len(prefix):]
  if not digits:
    raise ValueError(f"no trailing chunk number in {file_path!r}")
  return int(digits)

# recognizing dates

# Full English month name; the first letter may be upper or lower case.
month = r"([Jj]anuary|[Ff]ebruary|[Mm]arch|[Aa]pril|[Mm]ay|[Jj]une|[Jj]uly|[Aa]ugust|[Ss]eptember|[Oo]ctober|[Nn]ovember|[Dd]ecember)"
# One whitespace character, a 1-4 digit number with an optional ordinal suffix
# (st/nd/rd/th), then an optional comma, optional whitespace and an optional
# 4-digit year.
day_and_year = r"\s(\d{1,4})(?:st|nd|rd|th)?,?\s?(\d{4})?"
#day_and_year = r"\s(\d{1,4}),?\s?(\d{4})?"

# Textual date (month followed by day/year), wrapped in an outer capture group.
expression = f'({month}{day_and_year})'

# Slash-separated numeric date whose last field has 2 or 4 digits.
slashed_dates = r"\b(\d{1,2})/(0?\d{1,2}|1[0-2])/(?:\d{2}|\d{4})\b"
slashed_expression = f'({slashed_dates})'

# Either form. The combined pattern has 7 capture groups, so re.findall yields
# 7-tuples: index 0 is the whole textual date, index 4 the whole slashed date,
# and the groups of the alternative that did not match are empty strings.
date_expression = expression + r"|" + slashed_expression

# doc freq

from math import log2

def inverse_doc_freq(doc_num, doc_freq):
  """Return the base-2 logarithm of ``doc_num / doc_freq``."""
  ratio = doc_num / doc_freq
  return log2(ratio)

In [7]:
def preprocessing(doc):
  """Turn one document into ``((stem, docID, chunk), 1)`` pairs.

  Args:
    doc: a ``(text, docID, chunk)`` triple.

  Returns:
    A list with one ``((stemmed token, docID, chunk), 1)`` entry per token.
    The tokens are the whitespace-separated words of the text minus the first
    one, followed by one normalised token for every date found in the text.
  """
  text, docID, chunk = doc

  # group(0) is the full matched date whichever alternative of
  # date_expression matched. Reading only the first capture group returned ''
  # for slash-separated dates, so those were silently dropped.
  dates = [found.group(0) for found in re.finditer(date_expression, text)]

  # The first token is skipped, as before -- presumably a leading title or
  # marker word; confirm against the input format.
  words = text.split()[1:]
  words.extend(_fix_date(date) for date in dates)

  return [((st.stem(remove_special_chars(word)), docID, chunk), 1)
          for word in words]

In [22]:
def local_inverted_index(input_file):
    """Build a per-chunk inverted index from an RDD of text files.

    Args:
        input_file: RDD of ``(file path, file content)`` pairs, e.g. the
            result of ``sc.wholeTextFiles``. The argument is now actually
            used; previously the global ``text_files_rdd`` was read and the
            parameter was ignored.

    Returns:
        An RDD of ``(term, document frequency, postings, chunk id)`` rows,
        sorted by ``(term, chunk id)``, where ``postings`` is a list of
        ``(docID, occurrences)`` pairs. Terms that occur in only one document
        of a chunk are left out.
    """
    # (chunk id, content) -> one (text, docID, chunk id) record per document.
    documents = input_file.map(lambda x: (extract_chunk_id(x[0]), x[1])) \
                          .flatMap(extract_id)

    # ((term, docID, chunk), occurrences); tokens that stem to '' are dropped.
    # The former intermediate sort and the second reduce over the same key
    # did not affect the result and have been removed.
    term_counts = documents.flatMap(preprocessing) \
                           .filter(lambda x: x[0][0] != '') \
                           .reduceByKey(lambda x, y: x + y)

    inverted_index = term_counts \
        .map(lambda x: ((x[0][0], x[0][2]), [(x[0][1], x[1])])) \
        .reduceByKey(lambda x, y: x + y) \
        .sortBy(lambda x: x) \
        .map(lambda x: (x[0][0], len(x[1]), x[1], x[0][1])) \
        .filter(lambda x: x[1] > 1)
    return inverted_index

In [None]:
# Index rows of the form (term, document frequency, postings, chunk id).
without_word_codes = local_inverted_index(text_files_rdd)

# Only the rows of chunk 7 are used to decide which terms are unimportant.
filtered_rdd = without_word_codes.filter(lambda row: row[3] == 7)

# The document frequency of 'the' stands in for the number of documents.
doc_num = filtered_rdd.filter(lambda row: row[0] == 'the').collect()[0][1]

# (term, idf) for every term whose idf is at most 1.
filtered_rdd = (
    filtered_rdd
    .map(lambda row: (doc_num, sum([term[1] for term in row[2]]), row[1], row[0]))
    .map(lambda row: (row[3], inverse_doc_freq(row[0], row[2])))
    .filter(lambda row: row[1] <= 1)
)

unimportant_words = filtered_rdd.keys().collect()

In [120]:
import numpy as np

# Drop the low-idf terms collected in `unimportant_words`. Membership is tested
# against a frozenset so each record costs one hash lookup instead of a scan
# of the whole list.
unimportant_lookup = frozenset(unimportant_words)
without_word_codes = without_word_codes.filter(lambda x: x[0] not in unimportant_lookup)

# Sorted, de-duplicated vocabulary; a term's position in `arr` is its code.
arr = np.unique(without_word_codes.keys().collect())

word_codes_dict = {element: index for index, element in enumerate(arr)}

# (term, ((document frequency, postings, chunk id), code))
joined_rdd = without_word_codes.map(lambda x: (x[0], (x[1:], word_codes_dict.get(x[0]))))

# (code, term, document frequency, postings, chunk id)
final_rdd = joined_rdd.map(lambda x: (x[1][1], x[0], x[1][0][0], x[1][0][1], x[1][0][2]))

In [141]:
# One term per line, in `arr` order, so the zero-based line number of a term
# equals the code assigned to it in `word_codes_dict`.
array_str = '\n'.join(arr)

# Explicit UTF-8: terms may contain non-ASCII characters, and the platform
# default encoding is not guaranteed to be able to represent them.
with open('dictionary.txt', 'w', encoding='utf-8') as f:
    f.write(array_str)

In [121]:
def produce_output_file(file_path, rdd):
  """Save the index rows of one chunk as text under ``index<chunk number>``.

  Args:
    file_path: name or path of the chunk file; its chunk number is obtained
      with ``extract_chunk_id``.
    rdd: RDD of 5-element rows whose last element is the chunk number.

  Each kept row is written as its first four elements separated by single
  spaces.
  """
  chunk_num = extract_chunk_id(file_path)
  output_dir = f'index{chunk_num}'

  chunk_rows = rdd.filter(lambda x: x[4] == chunk_num)
  lines = chunk_rows.map(lambda x: f'{x[0]} {x[1]} {x[2]} {x[3]}')

  # Collapse to one partition before saving so the output has a single part.
  lines.coalesce(1).saveAsTextFile(output_dir)

In [122]:
# Write one index output per input chunk file.
for file_name in file_names_list:
  produce_output_file(file_name, final_rdd)