In [None]:
## **Imports and setup**


In [None]:
!gcloud dataproc clusters list --region us-central1
!pip install -q google-cloud-storage==1.43.0
!pip install -q graphframes
!pip install nltk==3.7

In [None]:
import pyspark
import sys
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby
import pandas as pd
import os
import re
from operator import itemgetter
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from time import time
from timeit import timeit
from pathlib import Path
import pickle
import numpy as np
from google.cloud import storage
import math
import hashlib
import builtins

def _hash(s):
    return hashlib.blake2b(bytes(s, encoding='utf8'), digest_size=5).hexdigest()

# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english')
# can be loaded later in the notebook.
nltk.download('stopwords')

In [None]:
# List the graph* jars in Spark's jar directory.
# NOTE(review): presumably a check that the GraphFrames jar was installed on
# the cluster — confirm against the cluster initialization setup.
!ls -l /usr/lib/spark/jars/graph*

In [None]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext
from graphframes import *

In [None]:
# Display the `spark` object. It is not created anywhere in this notebook;
# presumably the kernel provides it — TODO confirm.
spark

In [None]:
# Bucket holding the corpus. Every object in it is treated as an input file,
# except the posting/dictionary output folders and the graphframes.sh script.
bucket_name = '318940913'
full_path = f"gs://{bucket_name}/"

client = storage.Client()
blobs = client.list_blobs(bucket_name)

# str.startswith accepts a tuple, so one call covers all excluded prefixes.
excluded_prefixes = ('Postings_title/', 'Postings_body/', 'Dict_folder/')
paths = [
    full_path + b.name
    for b in blobs
    if b.name != 'graphframes.sh' and not b.name.startswith(excluded_prefixes)
]



In [None]:
# Load every collected GCS path into a single Spark DataFrame; the list is
# unpacked because the paths are passed as separate positional arguments.
parquetFile = spark.read.parquet(*paths)

In [None]:
# Change into /home/dataproc (-q suppresses the directory echo) and list
# inverted_index_gcp.py there; `ls` reports an error if the file is missing.
%cd -q /home/dataproc
!ls inverted_index_gcp.py

In [None]:
# Register the local module file with the SparkContext, then put the
# SparkFiles root directory first on sys.path so that `inverted_index_gcp`
# can be imported in the next cell.
# NOTE(review): `sc` is not defined in this notebook; presumably the kernel
# provides it — confirm.
sc.addFile("/home/dataproc/inverted_index_gcp.py")
sys.path.insert(0,SparkFiles.getRootDirectory())

In [None]:
from inverted_index_gcp import *

In [None]:
# Project the DataFrame down to two-column RDDs of Rows.
# NOTE: the column order is (text, id) and (title, id), so in every Row the
# text/title is the FIRST field (x[0]) and the document id is the SECOND
# field (x[1]). Key/value helpers such as mapValues() and collectAsMap()
# therefore treat the text/title as the key and the id as the value.
doc_text_pairs = parquetFile.select("text", "id").rdd
doc_title_pairs = parquetFile.select("title", "id").rdd

## Helper functions:


In [None]:
# Stopwords removed by tokenize(): NLTK's English list plus a hand-picked
# list of additional corpus-specific words.
english_stopwords = frozenset(stopwords.words('english'))
corpus_stopwords = ["category", "references", "also", "external", "links",
                    "may", "first", "see", "history", "people", "one", "two",
                    "part", "thumb", "including", "second", "following",
                    "many", "however", "would", "became"]

all_stopwords = english_stopwords.union(corpus_stopwords)

# A token starts with '#', '@' or a word character, followed by 2 to 24 more
# word characters, each optionally preceded by an apostrophe or a hyphen.
RE_WORD = re.compile(r"""[\#\@\w](['\-]?\w){2,24}""", re.UNICODE)

def tokenize(query):
    """
    Split text into stemmed tokens, dropping stopwords.

    The text is lower-cased and scanned with RE_WORD. Each match is checked
    against all_stopwords (on the raw token, before stemming), and every
    surviving token is reduced with a Porter stemmer.
    Parameters:
        query (str): The input text to be tokenized.
    Returns:
        list: Stemmed tokens in order of appearance, stopwords removed.
    """
    stemmer = PorterStemmer()
    stemmed_tokens = []
    for match in RE_WORD.finditer(query.lower()):
        word = match.group()
        if word in all_stopwords:
            continue
        stemmed_tokens.append(stemmer.stem(word))
    return stemmed_tokens

NUM_BUCKETS = 124
def token2bucket_id(token):
    """
    Assign a token to one of NUM_BUCKETS buckets used for partitioning postings.

    The token's hex digest (from _hash) is read as a base-16 integer and
    reduced modulo NUM_BUCKETS.
    Parameters:
        token (str): The input token.
    Returns:
        int: Bucket ID in the range [0, NUM_BUCKETS).
    """
    hex_digest = _hash(token)
    return int(hex_digest, 16) % NUM_BUCKETS

def build_norm_doc(tok_text):
    """
    Compute the reciprocal of the L2 norm of a document's term-frequency vector.

    Parameters:
        tok_text (iterable of str): Tokens of the document.
    Returns:
        float: 1 / sqrt(sum of tf**2 over the distinct terms), or 0.0 when
        the document has no tokens.
    """
    term_counts = Counter(tok_text)
    # Accumulate by hand rather than calling sum(): this notebook star-imports
    # pyspark.sql.functions, which rebinds the name `sum` at module level.
    squared_norm = 0
    for tf in term_counts.values():
        squared_norm += tf * tf
    if squared_norm == 0:
        # Empty document: return a float so the result type is consistent.
        return 0.0
    return 1 / math.sqrt(squared_norm)


## Create a dictionary that includes (doc_id : doc_len)


In [None]:
# Build {doc_id: number of body tokens} on the driver.
# Rows are (text, id): the id is row[1] and the text is row[0].
Docs_len_dict = (
    doc_text_pairs
    .map(lambda row: (row[1], len(tokenize(row[0]))))
    .collectAsMap()
)

x = "Docs_len_Body_Dict"
folder_name = "Dict_folder"
file_name = f"{x}.pickle"
print("file name : ",file_name)

# Pickle the dictionary to a local file first ...
with open(file_name, "wb") as out_file:
    pickle.dump(Docs_len_dict, out_file)

# ... then upload that file to the bucket under Dict_folder/.
client = storage.Client()
bucket = client.bucket(bucket_name)

blob_Docs_len_dict = bucket.blob(f"{folder_name}/{file_name}")
blob_Docs_len_dict.upload_from_filename(file_name)

## Create a dictionary that includes (doc_id : title)

In [None]:
# Map document ID to document title and save to a dictionary.
# doc_title_pairs holds (title, id) rows and collectAsMap() uses the first
# element of each pair as the key, so the pairs are swapped first to obtain
# {doc_id: title} rather than {title: doc_id}.
id_title_dict = doc_title_pairs.map(lambda row: (row[1], row[0])).collectAsMap()

x="id_title_dict"
file_name = f"{x}.pickle"
folder_name = "Dict_folder"

# Pickle locally, then upload the file to the bucket under Dict_folder/.
with open(file_name, "wb") as f:
    pickle.dump(id_title_dict, f)

client = storage.Client()
bucket = client.bucket(bucket_name)

blob_id_title_dict = bucket.blob(f"{folder_name}/{file_name}")
blob_id_title_dict.upload_from_filename(file_name)

## Compute the average document length of the corpus and save it to a file

In [None]:
# Average number of tokens per document: total token count over all
# documents divided by the number of documents.
total_docs = doc_text_pairs.count()
doc_lengths = doc_text_pairs.map(lambda row: len(tokenize(row[0])))
total_len_docs = doc_lengths.reduce(lambda acc, n: acc + n)
avg_doc_len = total_len_docs / total_docs

x = "docs_avg_len"
folder_name = "Dict_folder"
file_name = f"{x}.pickle"
print("file name : ",file_name)

# Pickle the value to a local file first ...
with open(file_name, "wb") as out_file:
    pickle.dump(avg_doc_len, out_file)

# ... then upload that file to the bucket under Dict_folder/.
client = storage.Client()
bucket = client.bucket(bucket_name)

blob_avg_doc_len = bucket.blob(f"{folder_name}/{file_name}")
blob_avg_doc_len.upload_from_filename(file_name)

## Create a dictionary that includes (doc_id : norm)

In [None]:
# Map document ID to document norm and save to a dictionary.
# doc_text_pairs holds (text, id) rows, so mapValues() would pass the id
# (not the text) to tokenize() and key the result by the text. The
# (doc_id, norm) pairs are therefore built explicitly.
doc_text_pairs_new = doc_text_pairs.map(
    lambda row: (row[1], build_norm_doc(tokenize(row[0])))
)
norm_dict = doc_text_pairs_new.collectAsMap()

x="norm_dict"
file_name = f"{x}.pickle"
folder_name = "Dict_folder"

# Pickle locally, then upload the file to the bucket under Dict_folder/.
with open(file_name, "wb") as f:
    pickle.dump(norm_dict, f)

client = storage.Client()
bucket = client.bucket(bucket_name)

# No separate "folder" object is created: uploading to the object name
# "Dict_folder/<file>" is sufficient. An empty object named "Dict_folder"
# would only be a stray entry in the bucket listing.
blob_norm_dict = bucket.blob(f"{folder_name}/{file_name}")
blob_norm_dict.upload_from_filename(file_name)