## **Install packages if not yet installed**

In [None]:
import sys

!{sys.executable} -m pip install bs4 # BeautifulSoup
!{sys.executable} -m pip install opendatasets # OpenDatasets
!{sys.executable} -m pip install cassandra-driver # Cassandra

## **Reading the dataset**

**1.** Create a file `kaggle.json` and save your Kaggle username and API key. This will be used to download the dataset from Kaggle.

**2.** The URL of the dataset is [https://www.kaggle.com/datasets/ashishjangra27/geeksforgeeks-articles](https://www.kaggle.com/datasets/ashishjangra27/geeksforgeeks-articles "GeeksForGeeks Articles Dataset"). Using `opendatasets` package, download the dataset. Step 1 is required in order for this to automatically take in your username and API key.

**3.** Read the downloaded dataset.

In [1]:
import json
import opendatasets as od
import pandas as pd
import os

In [2]:
# Creating kaggle.json file. I previous saved my Kaggle username and API key as environment variables.
with open("kaggle.json", "w") as kaggleFile:
    kaggleFile.write(json.dumps({"username": os.environ["kaggleUsername"], "key": os.environ["kaggleAPIKey"]}))

In [3]:
# Downloading the dataset.
od.download("https://www.kaggle.com/datasets/ashishjangra27/geeksforgeeks-articles")

Downloading geeksforgeeks-articles.zip to ./geeksforgeeks-articles


100%|██████████| 1.31M/1.31M [00:00<00:00, 10.8MB/s]







In [4]:
# Reading the dataset.
articles=pd.read_csv(r"geeksforgeeks-articles/articles.csv")
articles.head()

Unnamed: 0,title,author_id,last_updated,link,category
0,5 Best Practices For Writing SQL Joins,priyankab14,"21 Feb, 2022",https://www.geeksforgeeks.org/5-best-practices...,easy
1,Foundation CSS Dropdown Menu,ishankhandelwals,"20 Feb, 2022",https://www.geeksforgeeks.org/foundation-css-d...,easy
2,Top 20 Excel Shortcuts That You Need To Know,priyankab14,"17 Feb, 2022",https://www.geeksforgeeks.org/top-20-excel-sho...,easy
3,Servlet – Fetching Result,nishatiwari1719,"17 Feb, 2022",https://www.geeksforgeeks.org/servlet-fetching...,easy
4,Suffix Sum Array,rohit768,"21 Feb, 2022",https://www.geeksforgeeks.org/suffix-sum-array/,easy


In [5]:
articles.shape

(34574, 5)

## **Dropping rows with null values**

In [6]:
articles=articles.dropna()

In [7]:
# Reset index.
articles=articles.reset_index().drop("index", axis=1)
articles.head()

Unnamed: 0,title,author_id,last_updated,link,category
0,5 Best Practices For Writing SQL Joins,priyankab14,"21 Feb, 2022",https://www.geeksforgeeks.org/5-best-practices...,easy
1,Foundation CSS Dropdown Menu,ishankhandelwals,"20 Feb, 2022",https://www.geeksforgeeks.org/foundation-css-d...,easy
2,Top 20 Excel Shortcuts That You Need To Know,priyankab14,"17 Feb, 2022",https://www.geeksforgeeks.org/top-20-excel-sho...,easy
3,Servlet – Fetching Result,nishatiwari1719,"17 Feb, 2022",https://www.geeksforgeeks.org/servlet-fetching...,easy
4,Suffix Sum Array,rohit768,"21 Feb, 2022",https://www.geeksforgeeks.org/suffix-sum-array/,easy


In [8]:
articles.shape

(34551, 5)

## **Connect to Amazon Keyspaces**

In [9]:
from cassandra.cluster import Cluster, ExecutionProfile, EXEC_PROFILE_DEFAULT
from ssl import SSLContext, PROTOCOL_TLSv1_2, CERT_REQUIRED
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import SimpleStatement
from cassandra import ConsistencyLevel

In [10]:
# Service username and password for AWS Keyspaces. I previous saved my Keyspace credentials as environment variables.
username=os.environ["keyspacesCredentialUsername"]
password=os.environ["keyspacesCredentialPassword"]

In [11]:
# Creates a session connection to the keyspace that is secured by TLS.
ssl_context=SSLContext(PROTOCOL_TLSv1_2)
ssl_context.load_verify_locations('../sf-class2-root.crt')
ssl_context.verify_mode=CERT_REQUIRED
exec_profile=ExecutionProfile(consistency_level=ConsistencyLevel.LOCAL_QUORUM)
auth_provider=PlainTextAuthProvider(username=username, password=password)

cluster=Cluster(['cassandra.us-east-2.amazonaws.com'], 
                ssl_context=ssl_context, 
                auth_provider=auth_provider, 
                execution_profiles={EXEC_PROFILE_DEFAULT: exec_profile}, 
                port=9142)
session=cluster.connect()

  ssl_context=SSLContext(PROTOCOL_TLSv1_2)


## **Scrap text from the URL to get article content**

**1.** Create a new column `text` to store the scrapped text using BeautifulSoup.

**2.** Define the function to scrap text given the URL as a parameter.

**3.** In batches of 1024, use multi-threading to call this function for each row and save each row of the dataframe in Amazon Keyspaces.

In [13]:
from bs4 import BeautifulSoup
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

In [14]:
# Add new column to save the scrapped text from the URLs.
articles["text"]=""
articles.head()

Unnamed: 0,title,author_id,last_updated,link,category,text
0,5 Best Practices For Writing SQL Joins,priyankab14,"21 Feb, 2022",https://www.geeksforgeeks.org/5-best-practices...,easy,
1,Foundation CSS Dropdown Menu,ishankhandelwals,"20 Feb, 2022",https://www.geeksforgeeks.org/foundation-css-d...,easy,
2,Top 20 Excel Shortcuts That You Need To Know,priyankab14,"17 Feb, 2022",https://www.geeksforgeeks.org/top-20-excel-sho...,easy,
3,Servlet – Fetching Result,nishatiwari1719,"17 Feb, 2022",https://www.geeksforgeeks.org/servlet-fetching...,easy,
4,Suffix Sum Array,rohit768,"21 Feb, 2022",https://www.geeksforgeeks.org/suffix-sum-array/,easy,


In [15]:
# Dictionary to save the errors occurred while scrapping text.
scrapTextErrors={}

In [16]:
# Set timeout.
TIMEOUT_SECS=60

In [17]:
# Define a function to scrap text.
def scrapText(i, link):
    try:
        page=requests.get(link).text
        parser=BeautifulSoup(page, "html.parser")

        # Get the inner HTML of <div class="text"></div> tag. This consists of the main content.
        # Instead of recursively finding this tag with the above class name, I'm going iteratively to avoid max recursion errors.
        parser=parser.find("html", recursive=False)
        parser=parser.find("body", recursive=False)
        parser=parser.find("div", id="main", recursive=False)
        parser=parser.find("div", id="home-page", recursive=False)
        parser=parser.find("div", class_="article-page_flex", recursive=False)
        parser=parser.find("div", class_="leftBar", recursive=False)
        parser=parser.find("div", class_="article--viewer", recursive=False)
        parser=parser.find("div", class_="article--viewer_content", recursive=False)
        parser=parser.find("div", class_="a-wrapper", recursive=False)
        parser=parser.find("article", recursive=False)
        
        text=[""]
        for tag in parser.find("div", class_="text", recursive=False).contents:
            # Ignore all the <div> tags inside <div class="text"></div> as they do not have any
            # main content.
            if tag.name!="div":
                text.append(" ".join(tag.stripped_strings))
        # Return the main content.
        return i, "\n".join(text).strip("\n")
    
    except Exception as err:
        scrapTextErrors[i]={"link": link, "error": err}
    return i, ""

In [18]:
%%time
# Run the above function for all the links in batches using multithreading.
futureResultErrors=[]
batchesCount, BATCH_SIZE=0, 1024
# Print batch size
print(f"Batch size: {BATCH_SIZE}")

for batch_start in range(0, articles.shape[0], BATCH_SIZE):
    future_to_url={}
    batchesCount+=1 # Batch number of the current batch.
    countEmptyText=0 # Count of empty `text` in the current batch.
    batch_end=batch_start+BATCH_SIZE if batch_start+BATCH_SIZE<articles.shape[0] else articles.shape[0]

    with ThreadPoolExecutor(max_workers=128) as executor: 
        for i in range(batch_start, batch_end):
            future_to_url[executor.submit(scrapText, i, articles.loc[i, "link"])]=i
            
        for future in as_completed(future_to_url):
            try:
                i, text=future.result(timeout=TIMEOUT_SECS)
                articles.loc[i, "text"]=str(text)

                # Insert in Amazon Keyspaces.
                session.execute(session.prepare(f'INSERT INTO "GFGArticles"."GFGArticles" '
                                                f'("ID", "Title", "Category", "Link", "LastUpdated", "AuthorID", "Content") '
                                                f'VALUES (?, ?, ?, ?, ?, ?, ?);'), 
                                parameters=[i, 
                                            articles.loc[i, "title"],
                                            articles.loc[i, "category"],
                                            articles.loc[i, "link"],
                                            articles.loc[i, "last_updated"],
                                            articles.loc[i, "author_id"],
                                            articles.loc[i, "text"]])

                # If `text` is empty, update count.
                if text=="":
                    countEmptyText+=1
            except Exception as err:
                futureResultErrors.append(err)
    
    # Print status.
    print(f"Batch #{batchesCount}: Extracted `text` for {(batch_end-batch_start)-countEmptyText} links")
    # Empty text for this batch.
    articles.loc[batch_start:batch_end, "text"]=""

Batch size: 1024
Batch #1: Extracted `text` for 1023 links
Batch #2: Extracted `text` for 1024 links
Batch #3: Extracted `text` for 1024 links
Batch #4: Extracted `text` for 1024 links
Batch #5: Extracted `text` for 1023 links
Batch #6: Extracted `text` for 1023 links
Batch #7: Extracted `text` for 1024 links
Batch #8: Extracted `text` for 1023 links
Batch #9: Extracted `text` for 1022 links
Batch #10: Extracted `text` for 1022 links
Batch #11: Extracted `text` for 1024 links
Batch #12: Extracted `text` for 1024 links
Batch #13: Extracted `text` for 1024 links
Batch #14: Extracted `text` for 1024 links
Batch #15: Extracted `text` for 1024 links
Batch #16: Extracted `text` for 1022 links
Batch #17: Extracted `text` for 1024 links
Batch #18: Extracted `text` for 1023 links
Batch #19: Extracted `text` for 1023 links
Batch #20: Extracted `text` for 1024 links
Batch #21: Extracted `text` for 1021 links
Batch #22: Extracted `text` for 1023 links
Batch #23: Extracted `text` for 1023 links
Bat

## **See errors**

In [20]:
# Add the futureResultErrors to the scrapTextErrors.
scrapTextErrors["futureResult"]=futureResultErrors
scrapTextErrors

{537: {'link': 'https://www.geeksforgeeks.org/geek-o-lympics-2021-let-the-fun-begin/',
  'error': AttributeError("'NoneType' object has no attribute 'find'")},
 8716: {'link': 'https://www.geeksforgeeks.org/process-synchronization/',
  'error': AttributeError("'NoneType' object has no attribute 'find'")},
 9621: {'link': 'https://www.geeksforgeeks.org/operating-systems-set-2/',
  'error': AttributeError("'NoneType' object has no attribute 'find'")},
 10087: {'link': 'https://www.geeksforgeeks.org/geek-o-lympics-sale-courses-at-pocket-friendly-prices/',
  'error': AttributeError("'NoneType' object has no attribute 'find'")},
 17570: {'link': 'https://www.geeksforgeeks.org/data-structures-hash-question-4/',
  'error': AttributeError("'NoneType' object has no attribute 'find'")},
 19289: {'link': 'https://www.geeksforgeeks.org/how-to-calculate-and-plot-the-derivative-of-a-function-using-python-matplotlib/',
  'error': AttributeError("'NoneType' object has no attribute 'find'")},
 26343: {

The above errors occurred because those links do not have any content.