## **Imports and setup:**


In [None]:
# List the Dataproc clusters in the us-central1 region.
!gcloud dataproc clusters list --region us-central1
# Install the packages this notebook imports (-q reduces pip's output).
!pip install -q google-cloud-storage==1.43.0
!pip install -q graphframes
!pip install nltk==3.7

In [None]:
import pyspark
import sys
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby
import pandas as pd
import os
import re
from operator import itemgetter
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from time import time
from timeit import timeit
from pathlib import Path
import pickle
import numpy as np
from google.cloud import storage
import math
import hashlib
import builtins

def _hash(s):
    """Return a short hexadecimal fingerprint of the string ``s``.

    The string is encoded as UTF-8 and hashed with BLAKE2b using a 5-byte
    digest, so the result is always 10 hexadecimal characters.
    """
    encoded = bytes(s, 'utf8')
    hasher = hashlib.blake2b(encoded, digest_size=5)
    return hasher.hexdigest()

# Download the NLTK 'stopwords' resource (backs the nltk.corpus.stopwords import above).
nltk.download('stopwords')

In [None]:
# List the files named graph* in Spark's jar directory
# (presumably to confirm the GraphFrames jar is installed -- verify).
!ls -l /usr/lib/spark/jars/graph*

In [None]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext
from graphframes import *

In [None]:
# Evaluate `spark` so the notebook displays it.
# NOTE(review): `spark` is not created in this notebook; presumably it is the
# SparkSession pre-defined by the Dataproc kernel -- confirm.
spark

In [None]:
# Define the bucket name and collect the gs:// paths of the input data files.
bucket_name = '318940913'
full_path = f"gs://{bucket_name}/"

# Objects in the bucket that must NOT be passed to spark.read.parquet:
#   * Postings_title/, Dict_folder/, Postings_body/ -- non-input folders
#     (presumably output written by other indexing steps; verify).
#   * Page_Rank/ -- the folder this notebook uploads page_rank.pickle to.
#     Without this entry, re-running the notebook after a successful upload
#     would hand the pickle to the Parquet reader.
#   * graphframes.sh -- a shell script stored at the bucket root.
excluded_prefixes = ('Postings_title/', 'Dict_folder/', 'Postings_body/', 'Page_Rank/')
excluded_names = ('graphframes.sh',)

client = storage.Client()
blobs = client.list_blobs(bucket_name)
# str.startswith accepts a tuple, so one call checks every excluded prefix.
paths = [
    full_path + b.name
    for b in blobs
    if not b.name.startswith(excluded_prefixes) and b.name not in excluded_names
]

In [None]:
# Read all collected paths as Parquet into one DataFrame; `paths` is unpacked
# so each gs:// path is passed as a separate argument.
parquetFile = spark.read.parquet(*paths)

In [None]:
# Change into /home/dataproc (-q: without printing the new directory) and list
# inverted_index_gcp.py there; `ls` prints an error if the file is missing.
%cd -q /home/dataproc
!ls inverted_index_gcp.py

In [None]:
# Add our Python module to the cluster via sc.addFile, then put the SparkFiles
# root directory first on sys.path so the module can be imported in this
# notebook as well.
# NOTE(review): `sc` is not created in this notebook; presumably it is the
# SparkContext pre-defined by the kernel -- confirm.
sc.addFile("/home/dataproc/inverted_index_gcp.py")
sys.path.insert(0,SparkFiles.getRootDirectory())

In [None]:
from inverted_index_gcp import *

In [None]:
# Keep only the "id" and "anchor_text" columns and expose them as an RDD,
# one two-field (id, anchor_text) row per record in the Parquet data.
doc_anchor_pairs = parquetFile.select("id","anchor_text").rdd

In [None]:
def generate_graph(pages):
    """Build the edge and vertex RDDs of a link graph.

    Args:
        pages: a pair RDD of (source_id, links) records. Each item in
            ``links`` is indexable and its first element is taken as the
            id of a linked page (presumably the (id, anchor_text) rows
            read from the Parquet data -- verify against the caller).

    Returns:
        A tuple ``(edges, vertices)``:
        edges is an RDD of (source_id, target_id) tuples, with repeated
        targets within a single input record collapsed to one edge;
        vertices is an RDD of distinct 1-tuples ``(page_id,)`` covering
        every id that appears as either end of an edge.
    """
    def unique_targets(links):
        # Collapse repeated links to the same target within one record.
        return list({link[0] for link in links})

    def fan_out(record):
        # Emit one (source, target) edge per unique target of the record.
        return [(record[0], target) for target in record[1]]

    def endpoints(edge):
        # Wrap each end of the edge in a 1-tuple (a single-column row).
        return [(node,) for node in edge]

    targets_by_source = pages.mapValues(unique_targets)
    edges = targets_by_source.flatMap(fan_out)
    vertices = edges.flatMap(endpoints).distinct()
    return edges, vertices


In [None]:
# Build the link graph (edge and vertex RDDs) from the (id, anchor_text) rows.
edges, vertices = generate_graph(doc_anchor_pairs)

# GraphFrame is constructed from DataFrames: name the columns and repartition
# each frame into 124 partitions by its key column.
verticesDF = vertices.toDF(['id']).repartition(124, 'id')
edgesDF = edges.toDF(['src', 'dst']).repartition(124, 'src')
g = GraphFrame(verticesDF, edgesDF)

# Run PageRank for 6 iterations with reset probability 0.15 and keep only the
# id and score columns of the resulting vertices.
pr_results = g.pageRank(resetProbability=0.15, maxIter=6)
pr = pr_results.vertices.select("id", "pagerank")

# Bring the scores to the driver and turn them into {id: pagerank}.
pr_pandas = pr.toPandas()
pr_dict = pr_pandas.set_index('id')['pagerank'].to_dict()

# Write the dictionary to a local pickle file.
x="page_rank"
folder_name = "Page_Rank"
file_name = f"{x}.pickle"
with open(file_name, "wb") as f:
    pickle.dump(pr_dict, f)

# Upload the pickle to <bucket>/Page_Rank/page_rank.pickle.
client = storage.Client()
bucket = client.bucket(bucket_name)
blob_page_rank = bucket.blob(f"{folder_name}/{file_name}")
blob_page_rank.upload_from_filename(file_name)