In [1]:
import pandas as pd  # Python data analysis library - for dataframe data structure
import requests  # for http requests
from bs4 import BeautifulSoup  # to work with web pages


# Location of the exported stories and the number of rows to process.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
STORIES_PATH = r'C:\Users\ewerfel\Documents\notebooks\Domino\Collabsphere\stories.json'
MAX_STORIES = 35  # row limit, kept small to save time

# Load the JSON data and keep only the first MAX_STORIES rows.
# .copy() makes the subset an independent DataFrame, so columns added to it
# later are not written onto a slice of the full frame.
storiesDf = pd.read_json(STORIES_PATH).head(MAX_STORIES).copy()

In [2]:
# Preview the top five rows of the loaded stories
storiesDf.head(n=5)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
# List the column labels currently present in the stories DataFrame.

storiesDf.columns

Index(['link', 'headline', 'category', 'short_description', 'authors', 'date'], dtype='object')

In [4]:
# Display the 'link' column, which holds the URL of each full article.
storiesDf['link']

0     https://www.huffpost.com/entry/covid-boosters-...
1     https://www.huffpost.com/entry/american-airlin...
2     https://www.huffpost.com/entry/funniest-tweets...
3     https://www.huffpost.com/entry/funniest-parent...
4     https://www.huffpost.com/entry/amy-cooper-lose...
5     https://www.huffpost.com/entry/belk-worker-fou...
6     https://www.huffpost.com/entry/reporter-gets-a...
7     https://www.huffpost.com/entry/puerto-rico-wat...
8     https://www.huffpost.com/entry/mija-documentar...
9     https://www.huffpost.com/entry/biden-un-russia...
10    https://www.huffpost.com/entry/bc-soc-wcup-cap...
11    https://www.huffpost.com/entry/man-sets-fire-p...
12    https://www.huffpost.com/entry/fiona-threatens...
13    https://www.huffpost.com/entry/twitch-streamer...
14    https://www.huffpost.com/entry/virginia-thomas...
15    https://www.huffpost.com/entry/valery-polyakov...
16    https://www.huffpost.com/entry/hulu-reboot-sho...
17    https://www.huffpost.com/entry/dodgers-bas

In [5]:
def retrieveArticle(articleURL, timeout=30):
    """Fetch a web page and return the text inside its first <article> tag.

    Parameters
    ----------
    articleURL : str
        URL of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without it a single unresponsive server would block the caller
        indefinitely. Defaults to 30.

    Returns
    -------
    str
        The text content of the first ``<article>`` element, or the message
        ``'No article tag found on the page.'`` when the page has none.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged from ``requests.get`` (connection failure,
        timeout, invalid URL, ...).

    Notes
    -----
    The HTTP status code is not inspected; whatever body the server returns
    is parsed.
    """
    # Send a request to fetch the web page content.
    response = requests.get(articleURL, timeout=timeout)

    # Parse the HTML content using BeautifulSoup.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the <article> tag and extract its text.
    article_tag = soup.find('article')
    if article_tag is None:
        return 'No article tag found on the page.'
    return article_tag.get_text()

In [7]:
# Download the body of every linked article and store the resulting text
# in a new 'articleBody' column (one call to retrieveArticle per row).
articleLinks = storiesDf['link']
storiesDf['articleBody'] = articleLinks.map(retrieveArticle)

In [8]:
# Confirm that the 'articleBody' column now appears among the column labels.
storiesDf.columns

Index(['link', 'headline', 'category', 'short_description', 'authors', 'date',
       'articleBody'],
      dtype='object')

In [9]:
# Preview the retrieved text: first 200 characters of each article body
storiesDf['articleBody'].str.slice(0, 200)

0     LOADINGERROR LOADINGU.S. health officials say ...
1     A volatile passenger has been charged with a f...
2     LOADINGERROR LOADINGWoof — it’s been a long, l...
3     LOADINGERROR LOADINGKids may say the darndest ...
4     LOADINGERROR LOADINGA white woman who said tha...
5     A woman hired to clean the public bathroom of ...
6     Robert Tilearcio Jr.'s surprise TV proposal to...
7     LOADINGERROR LOADINGCAGUAS, Puerto Rico (AP) —...
8     Musician and music manager Doris Muñoz dances ...
9     LOADINGERROR LOADINGNEW YORK (AP) — President ...
10    FRANKFURT AM MAIN, GERMANY - SEPTEMBER 21: Thi...
11    LOADINGERROR LOADINGTOKYO (AP) — A man set him...
12    LOADINGERROR LOADINGSAN JUAN, Puerto Rico (AP)...
13    LOADINGERROR LOADINGA Twitch streamer’s claim ...
14    LOADINGERROR LOADINGVirginia “Ginni” Thomas, t...
15    Polyakov being assisted mere moments before li...
16    A still from Hulu's "RebootILLUSTRATION: JIANA...
17    19 April 2015: Former Los Angeles Dodgers 

In [10]:
# Summary statistics for the DataFrame, now including the 'articleBody' column.
storiesDf.describe()

Unnamed: 0,link,headline,category,short_description,authors,date,articleBody
count,35,35,35,35,35.0,35,35
unique,35,35,11,35,26.0,7,35
top,https://www.huffpost.com/entry/virginia-thomas...,The Funniest Tweets From Parents This Week (Se...,WORLD NEWS,An annual celebration took on a different feel...,,2022-09-20 00:00:00,"LOADINGERROR LOADINGKYIV, Ukraine (AP) — Russi..."
freq,1,1,11,1,4.0,6,1
first,,,,,,2022-09-17 00:00:00,
last,,,,,,2022-09-23 00:00:00,


In [11]:
import win32com.client   # COM automation client (win32com ships with the pywin32 package)

# Obtain a Notes session object by dispatching the 'Notes.NOTESSESSION' COM ProgID.
# NOTE(review): presumably requires a Notes client installed on this machine — confirm.
notesSession = win32com.client.Dispatch(r'Notes.NOTESSESSION')


In [12]:
# Get a handle to the target database.
# NOTE(review): the empty first argument presumably means "local machine" rather than a
# named server, and the second a path relative to the Notes data directory — confirm.
articlesDb = notesSession.GetDatabase("", r"collabsphere\articlesempty.nsf")

# Display the database title to verify that the handle refers to a real database
articlesDb.Title


'Articles Empty'

In [13]:
# Create one document in articlesDb for every row of storiesDf. Each DataFrame
# column is written to the document as an item named after the column.

formName = "Article"  # Value written to the "Form" item of every document.

# iterrows() yields (row label, row as Series); the row label is not needed.
for _, row in storiesDf.iterrows():
    # NOTE(review): CreateDocument is accessed without parentheses, exactly as in
    # the original notebook; this relies on the COM wrapper invoking the member
    # on attribute access — confirm before changing it to a call.
    doc = articlesDb.CreateDocument
    doc.ReplaceItemValue("Form", formName)  # Set the form name for the document.

    # Walk the row by (column name, value) pairs. Integer keys such as row[i]
    # on a string-labelled Series rely on a positional fallback that newer
    # pandas releases deprecate.
    for columnName, value in row.items():
        if isinstance(value, pd.Timestamp):
            # Store dates as a 10-character 'YYYY-MM-DD' string.
            value = value.isoformat()[:10]
        doc.ReplaceItemValue(columnName, value)

    # NOTE(review): the two arguments are presumably the force and
    # create-response flags of the Notes document Save method — confirm.
    doc.Save(1, 1)