## **Install packages if not yet installed**

In [1]:
import sys

!{sys.executable} -m pip install bs4 # BeautifulSoup
!{sys.executable} -m pip install opendatasets # OpenDatasets
!{sys.executable} -m pip install pyspark # PySpark

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25l[?25hdone
  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1256 sha256=00cd76fa2dbb1d77fccdb4ecb76869dffec0d9cb3abeb3e13688a285838e2557
  Stored in directory: /root/.cache/pip/wheels/25/42/45/b773edc52acb16cd2db4cf1a0b47117e2f69bb4eb300ed0e70
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1
Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22
Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packa

## **Reading the dataset**

**1.** Create a file `kaggle.json` containing your Kaggle username and API key. This will be used to download the dataset from Kaggle. Keep this file (and the key itself) out of version control.

**2.** The URL of the dataset is [https://www.kaggle.com/datasets/ashishjangra27/geeksforgeeks-articles](https://www.kaggle.com/datasets/ashishjangra27/geeksforgeeks-articles "GeeksForGeeks Articles Dataset"). Using `opendatasets` package, download the dataset. Step 1 is required in order for this to automatically take in your username and API key.

**3.** Create a Spark Session to start working with PySpark.

**4.** Read the downloaded dataset.

In [2]:
import getpass
import json
import os

import opendatasets as od
from pyspark.sql import SparkSession

In [3]:
# Creating kaggle.json file.
# SECURITY: credentials must never be hardcoded in the notebook. They are read
# from the KAGGLE_USERNAME / KAGGLE_KEY environment variables, falling back to
# an interactive prompt (the key is not echoed). Any API key that was previously
# committed in this cell should be revoked in the Kaggle account settings.
kaggleUsername=os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: ")
kaggleKey=os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: ")

with open("kaggle.json", "w") as kaggleFile:
    json.dump({"username": kaggleUsername, "key": kaggleKey}, kaggleFile)

# The file holds a secret: restrict it to the current user.
os.chmod("kaggle.json", 0o600)

In [4]:
# Fetch the GeeksForGeeks articles dataset from Kaggle.
datasetUrl="https://www.kaggle.com/datasets/ashishjangra27/geeksforgeeks-articles"
od.download(datasetUrl)

Downloading geeksforgeeks-articles.zip to ./geeksforgeeks-articles


100%|██████████| 1.31M/1.31M [00:00<00:00, 2.46MB/s]







In [5]:
# Start (or reuse) a Spark session named after this project.
sessionBuilder=SparkSession.builder.appName('geeks_for_geeks_articles')
spark=sessionBuilder.getOrCreate()

In [6]:
# Load the downloaded CSV: the first line is the header and column types are inferred.
articles=spark.read.csv(
    r"geeksforgeeks-articles/articles.csv",
    header=True,
    inferSchema=True,
)
# Preview the first five rows without truncating long values.
articles.show(5, truncate=False)

+--------------------------------------------+----------------+------------+---------------------------------------------------------------------------+--------+
|title                                       |author_id       |last_updated|link                                                                       |category|
+--------------------------------------------+----------------+------------+---------------------------------------------------------------------------+--------+
|5 Best Practices For Writing SQL Joins      |priyankab14     |21 Feb, 2022|https://www.geeksforgeeks.org/5-best-practices-for-writing-sql-joins/      |easy    |
|Foundation CSS Dropdown Menu                |ishankhandelwals|20 Feb, 2022|https://www.geeksforgeeks.org/foundation-css-dropdown-menu/                |easy    |
|Top 20 Excel Shortcuts That You Need To Know|priyankab14     |17 Feb, 2022|https://www.geeksforgeeks.org/top-20-excel-shortcuts-that-you-need-to-know/|easy    |
|Servlet – Fetching Result  

## **Dropping rows with null values**

In [7]:
# Keep only the rows in which every column has a value.
articles=articles.na.drop()

## **Creating a pandas-on-Spark Dataframe**

In [None]:
import pyspark.pandas as ps

# Expose the Spark DataFrame through the pandas API on Spark.
articles=articles.pandas_api()
# Preview the rows labelled 1 through 5.
articles.loc[1:5]

## **Scrape text from the URLs to get article content**

In [9]:
from bs4 import BeautifulSoup
from pyspark.sql.functions import lit, col, udf
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

In [10]:
# Add an empty "text" column that will hold the text scraped from each URL.
articles=articles.assign(text="")
# Preview the rows labelled 1 through 5.
articles.loc[1:5]

+--------------------------------------------+----------------+------------+---------------------------------------------------------------------------+--------+----+
|title                                       |author_id       |last_updated|link                                                                       |category|text|
+--------------------------------------------+----------------+------------+---------------------------------------------------------------------------+--------+----+
|5 Best Practices For Writing SQL Joins      |priyankab14     |21 Feb, 2022|https://www.geeksforgeeks.org/5-best-practices-for-writing-sql-joins/      |easy    |    |
|Foundation CSS Dropdown Menu                |ishankhandelwals|20 Feb, 2022|https://www.geeksforgeeks.org/foundation-css-dropdown-menu/                |easy    |    |
|Top 20 Excel Shortcuts That You Need To Know|priyankab14     |17 Feb, 2022|https://www.geeksforgeeks.org/top-20-excel-shortcuts-that-you-need-to-know/|easy    |    

In [None]:
# Errors raised while scraping text are collected in this dictionary.
scrapTextErrors = dict()

In [None]:
# Timeout, in seconds, used by the scraping cells below.
TIMEOUT_SECS = 60

In [11]:
# Define a function to scrap text.
def scrapText(i, link):
    """Download one article page and return its main text.

    Parameters
    ----------
    i
        Row label of the article. It is returned unchanged so the caller can
        match the result to its row, and it is the key used in
        ``scrapTextErrors`` when scraping fails.
    link
        URL of the article page.

    Returns
    -------
    tuple
        ``(i, text)``. ``text`` is the article content with one line per
        top-level child of ``<div class="text">`` (nested ``<div>`` children
        are skipped). On any failure ``text`` is ``""`` and the failure is
        recorded in the module-level ``scrapTextErrors`` dictionary.
    """
    try:
        # Bound the request: without a timeout a stalled connection would
        # block its worker thread forever.
        response=requests.get(link, timeout=TIMEOUT_SECS)
        # Fail early on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        parser=BeautifulSoup(response.text, "html.parser")

        # Path from the document root down to <div class="text">, which holds the main content.
        # Each step looks only at direct children (recursive=False) instead of searching
        # the whole tree, to avoid max recursion errors.
        pathToContent=(
            ("html", {}),
            ("body", {}),
            ("div", {"id": "main"}),
            ("div", {"id": "home-page"}),
            ("div", {"class_": "article-page_flex"}),
            ("div", {"class_": "leftBar"}),
            ("div", {"class_": "article--viewer"}),
            ("div", {"class_": "article--viewer_content"}),
            ("div", {"class_": "a-wrapper"}),
            ("article", {}),
            ("div", {"class_": "text"}),
        )
        for tagName, attrs in pathToContent:
            parser=parser.find(tagName, recursive=False, **attrs)
            if parser is None:
                # Report which element is missing rather than failing later
                # with an opaque "'NoneType' object has no attribute" error.
                raise ValueError(f"Element not found: <{tagName}> {attrs}")

        # Ignore all the <div> tags inside <div class="text"></div> as they do not have any
        # main content.
        text=[
            " ".join(tag.stripped_strings)
            for tag in parser.contents
            if tag.name!="div"
        ]
        # Return the main content.
        return i, "\n".join(text).strip("\n")

    except Exception as err:
        # Store the error as text: exception objects are not JSON serializable,
        # and this dictionary is written to a JSON file later.
        scrapTextErrors[i]={"link": link, "error": repr(err)}
    return i, ""

In [None]:
# Run the above function for all the links using multithreading.
%%time
future_to_url={}
futureResultErrors=[]
with ThreadPoolExecutor(max_workers=1000) as executor:
    for i in range(1, articles.shape[0]):
        future_to_url[executor.submit(scrapText, i, articles.loc[i, "link"])]=i
        
    for future in as_completed(future_to_url):
        try:
            i, text=future.result(timeout=TIMEOUT_SECS)
            articles.loc[i, "text"]=text
        except Exception as err:
            # print(f"Future result error ({i}) : {err}")
            futureResultErrors.append(err)

In [None]:
# Show the scraped text of one row (here the row labelled 0).
articles["text"].loc[0]

In [None]:
# Number of articles with empty values in the "text" column.
# A boolean mask is summed here because DataFrame.filter() in the pandas API
# selects row/column *labels*; it does not filter rows by a condition.
(articles["text"]=="").sum()

## **Save errors to a file**

In [None]:
# Add the futureResultErrors to the scrapTextErrors and save to a file.
scrapTextErrors["futureResult"]=futureResultErrors
with open("ScrapTextErrors.json", "w") as errorsFile:
    # default=repr turns values json cannot encode (such as exception objects)
    # into text instead of raising TypeError.
    json.dump(scrapTextErrors, errorsFile, default=repr)

## **Write to .parquet file**

In [15]:
# TODO: write `articles` to a .parquet file (possibly on Azure Blob Storage?).

## **Stop the Spark session**

In [16]:
# Stop the Spark session now that the notebook no longer needs it.
spark.stop()