<a href="https://colab.research.google.com/github/faithrts/Science_Explainers/blob/main/dataset/science_explainer_dataset_update.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
### importing libraries

# basic libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# libraries for web scraping
from bs4 import BeautifulSoup
import requests
import re
import codecs

# to download files
from google.colab import files

# Importing article URLs

In [2]:
### cloning the Science_Explainers git repo
# (this cell only clones; the csv file of URLs is read into a dataframe in a later cell)
!git clone https://github.com/faithrts/Science_Explainers

Cloning into 'Science_Explainers'...
remote: Enumerating objects: 1832, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 1832 (delta 21), reused 5 (delta 2), pack-reused 1791[K
Receiving objects: 100% (1832/1832), 93.97 MiB | 17.06 MiB/s, done.
Resolving deltas: 100% (809/809), done.
Updating files: 100% (17/17), done.


In [3]:
### unzips txt files

# alternative (disabled): unzip the archive that sits inside the cloned repo
#!unzip Science_Explainers/dataset/science_txt_files.zip

# NOTE(review): expects science_txt_files.zip in the current working directory,
# presumably uploaded by hand -- confirm
!unzip science_txt_files.zip

Archive:  science_txt_files.zip
   creating: science_txt_files/
   creating: science_txt_files/ATLANTIC/
   creating: science_txt_files/ATLANTIC/.ipynb_checkpoints/
  inflating: science_txt_files/ATLANTIC/ASimpleSolutionForKeepingMicroplastics.txt  
  inflating: science_txt_files/ATLANTIC/TheGreatUnderappreciatedDriverOfClimate.txt  
  inflating: science_txt_files/ATLANTIC/TheThresholdAtWhichSnowStarts.txt  
  inflating: science_txt_files/ATLANTIC/AnchorageFellInLoveWithA.txt  
  inflating: science_txt_files/ATLANTIC/OneOfEvolutionsBiggestMomentsWas.txt  
  inflating: science_txt_files/ATLANTIC/CaliforniasClimateHasComeUnmoored.txt  
  inflating: science_txt_files/ATLANTIC/OneMoreReasonToHateCockroaches.txt  
  inflating: science_txt_files/ATLANTIC/WhereNotToLookForAlien.txt  
  inflating: science_txt_files/ATLANTIC/TattoosDoOddThingsToThe.txt  
  inflating: science_txt_files/ATLANTIC/TheQuestToBuildABetter.txt  
  inflating: science_txt_files/ATLANTIC/TheBirdThatTookAHuman.txt  
  inf

In [6]:
### saving csv file of URLs into dataframe

# NOTE(review): read from the working directory, not from the cloned repo -- confirm
urls_df = pd.read_csv('article_urls.csv')

# replaces all NaN instances with an empty string (update_dataset skips '' urls)
urls_df = urls_df.fillna('')

# Webscraping helper functions

In [4]:
# default request headers (a desktop Chrome User-Agent); get_soup falls back to these when no header is passed
header_ex = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

In [5]:
def get_soup(url, header = ''):
  """Downloads `url` and returns the page parsed with BeautifulSoup's html.parser.

  url: address of the page to fetch
  header: request headers to send; the empty-string default means
          "use the module-level header_ex"
  """
  # the empty string is the sentinel for "no header supplied"
  request_headers = header_ex if header == '' else header

  page = requests.get(url, headers = request_headers)
  return BeautifulSoup(page.text, 'html.parser')

In [7]:
def create_txt_file(soup, filename):
  """Writes the body text of a scraped article to 'science_txt_files/' + filename.

  soup: the parsed article page (BeautifulSoup-style find / findAll interface)
  filename: '<SOURCE>/<name>.txt'; everything before the last '/' is the source
            and selects the rule used to locate the article's paragraphs

  Returns False without creating a file when no paragraphs are found,
  True once the file has been written.

  Raises TypeError if filename contains no '/', KeyError if the source has no
  extraction rule, and AttributeError if the container element a rule expects
  is missing from the page (NPR has a fallback rule for that case).
  """

  source = re.search(r'(.*)(?=/)', filename)[0]

  # one paragraph-extraction rule per source; callables instead of eval'd strings
  find_all = {'ATLANTIC': lambda: soup.findAll("p"),
              'CBC': lambda: soup.find("div", {"class": "story"}).findAll("p"),
              'CNN': lambda: soup.findAll("p"),
              'GLOBE_AND_MAIL': lambda: soup.findAll("p"),
              'MASSIVE_SCI': lambda: soup.find("div", {"class": "bodytext"}).findAll("p"),
              'NATIONAL_GEOGRAPHIC': lambda: soup.findAll("p"),
              'NATIONAL_OBSERVER': lambda: soup.findAll("p"),
              'NPR': lambda: soup.find("div", {"class": "image_data"}).find_all_next("p", {"class": None}),
              'REUTERS': lambda: soup.find("div", {"class": "article-body__content__17Yit"}).findAll("p")}

  # passages containing any of these (case-insensitive) are boilerplate and dropped
  dont_keep = ['read transcribed audio', '\neditor\'s note','sign up for', 'this article was published', 'this story was originally published', 'sponsor']

  try:
    # looks for the text of the story
    passages = find_all[source]()
  except AttributeError:
    # VERY weird edge case: some NPR pages have no "image_data" div, so the
    # paragraphs are located relative to the photo credit instead
    if source != 'NPR':
      raise
    passages = soup.find("span", {"class": "credit"}).find_all_next("p", {"class": None})

  # if, for some reason, there's no text at this url
  if len(passages) == 0:
    return False

  first_line = True

  # else, creates a new file; the with-block closes it even if a passage fails,
  # and utf8 matches the encoding add_text_column reads with
  with open('science_txt_files/' + filename, 'w', encoding='utf8') as cur_file:

    # iterates through each passage in the article by finding <p> tags
    for passage in passages:

      # extracts the text
      text = passage.get_text()

      # fixing spacing
      text = text.replace(u'\xa0', u' ')
      text = text.replace(u'  ', u' ')

      if any(substring in text.lower() for substring in dont_keep):
        continue

      # the first kept Reuters passage starts with a "CITY (Reuters) - " dateline
      if source == 'REUTERS' and first_line:
        text = re.sub(r'.*(?<=\(Reuters\) - )', '', text)
        first_line = False

      # writing the text to the current file, one passage per line
      cur_file.write(text + '\n')

  return True

In [173]:
def create_txt_file_reuters(soup, filename, verbose = False):
  """Writes the text of a locally saved Reuters article to 'science_txt_files/' + filename.

  soup: the parsed page (BeautifulSoup-style findAll interface)
  filename: path of the txt file relative to 'science_txt_files/'
  verbose: when True, prints the extracted text before it is written

  The text is collected from every non-empty <div> that has a data-testid
  attribute. The first one has everything up to and including the
  "(Reuters) - " dateline removed (kept unchanged if it has no dateline);
  later ones are skipped when they are advertisement or licensing blocks.
  Returns None.
  """
  kept = []
  first = True

  # True matches any <div> that carries a data-testid attribute, whatever its value
  for result in soup.findAll('div', {'data-testid': True}):

    cur_text = result.text

    if cur_text == '':
      continue

    if first:
      first = False

      # keeps only what follows the dateline on that line; a page without the
      # dateline keeps its first block as-is instead of raising
      dateline_match = re.search(r'(?<=\(Reuters\) - ).*', cur_text)
      if dateline_match is not None:
        cur_text = dateline_match[0]

      kept.append(cur_text)

    elif 'Advertisement · Scroll' not in cur_text and 'Purchase Licensing Rights' not in cur_text:
      kept.append(cur_text)

  all_text = '\n'.join(kept)

  if verbose:
    print(all_text)

  # opened only once the text is ready, so a parsing failure cannot leave an
  # empty or half-written file behind; the with-block always closes the handle
  with open('science_txt_files/' + filename, 'w', encoding='utf8') as cur_file:
    cur_file.write(all_text)

In [8]:
def source_finder(url):
  """Extracts the source name (the domain label before the first dot) from an https url.

  'https://www.theatlantic.com/...' -> 'theatlantic'
  'https://massivesci.com/...'      -> 'massivesci'

  Raises ValueError when the url matches neither link format.
  """
  link_formats = (r'(?<=https://www\.)(.*?)\.',   # default link format
                  r'(?<=https://)(.*?)\.')        # link format without "www."

  for link_format in link_formats:
    match = re.search(link_format, url)

    # a failed search must be checked before group() is read, otherwise the
    # second link format is never reached
    if match is not None and len(match.group(1)) != 0:
      return match.group(1)

  raise ValueError('could not find a source name in url: ' + repr(url))

In [9]:
def title_cleaner(title):
  """Builds the filename stem for an article title.

  Punctuation is stripped, the rest is title-cased, and the first six words
  are glued together without separators.
  """
  # anything that is neither a word character nor whitespace counts as punctuation
  without_punctuation = re.sub(r'[^\w\s]', '', title)

  words = without_punctuation.title().split()
  leading_words = words[:6]

  return ''.join(leading_words)

In [10]:
def title_finder(soup, source):
  """Finds the title of an article page.

  soup: the parsed page (BeautifulSoup-style findAll interface)
  source: name of the article's source; 'cnn' in any letter case enables the
          CNN-specific headline lookup

  Tries, in order: the first <h1> (or the first <title> when the page has no
  <h1>), the CNN headline element, and finally the <title> tag. Newlines are
  removed from the result. Returns None when every attempt comes up empty.
  """
  def first_text(tags):
    # text of the first tag without newlines, or '' when nothing was found
    return tags[0].get_text().replace('\n', '') if tags else ''

  # testing default title finder, with <title> as stand-in for a missing <h1>
  title = first_text(soup.findAll('h1') or soup.findAll('title'))

  # if the title is not empty, return it
  if len(title) != 0:
    return title

  # another title finder format for CNN articles; compared case-insensitively
  # because update_dataset passes the upper-case column name
  if str(source).lower() == 'cnn':
    headlines = soup.findAll('h1', {'class': 'headline__text inline-placeholder'})

    if headlines:
      title = headlines[0].get_text()
      # the headline is padded with indentation, which arrives as runs of spaces
      title = title.replace('  ', '')
      title = title.replace('\n', '')

      if len(title) != 0:
        return title

  # last resort for pages whose first <h1> exists but is empty
  title = first_text(soup.findAll('title'))
  return title if len(title) != 0 else None

In [11]:
def add_text_column(df, txt_dir = 'txt_files/'):
  """Adds a 'TEXT' column holding the contents of each row's txt file.

  df: dataframe with a 'FILENAME' column of paths relative to txt_dir;
      it is modified in place and also returned
  txt_dir: folder the txt files are read from. The default keeps the
           historical 'txt_files/' location; create_txt_file writes under
           'science_txt_files/', so pass txt_dir = 'science_txt_files/' to
           read files produced by it.

  Raises FileNotFoundError when a listed file does not exist.
  """
  import os

  def read_text(filename):
    # newline='' keeps line endings exactly as stored; the with-block closes
    # the handle, which the previous one-liner never did
    with open(os.path.join(txt_dir, filename), 'r', encoding='utf8', newline='') as txt_file:
      return txt_file.read()

  # adding a column for the text contents
  df['TEXT'] = [read_text(filename) for filename in df['FILENAME']]

  return df

In [12]:
def update_dataset(urls_df, dataset):
  """Scrapes every article url that is not yet in the dataset and appends it.

  urls_df: dataframe with one column per source; each cell is an article url
           or '' for an empty slot
  dataset: the existing science explainer dataset (needs a 'URL' column);
           it is not modified

  For each new url the page is downloaded, its text is written to a txt file
  by create_txt_file, and a row (FILENAME, TITLE, SOURCE, DATE PUBLISHED, URL)
  is added. Pages for which create_txt_file finds no text are reported and
  skipped, so every dataset row has a file behind it. A url that appears more
  than once in urls_df is scraped only once.

  Returns (new_dataset, new_urls): the extended copy of the dataset and the
  list of urls that were added to it.
  """

  # the urls currently in the dataset; a set gives constant-time lookups and
  # also grows with every accepted url
  known_urls = set(dataset['URL'])

  # the urls that are added to the new dataset, and their rows
  new_urls = []
  new_rows = []

  # iterating through each column of the df, which translates to each source
  # of science explainers
  for source in urls_df.columns:

    print(source)

    # iterating through the rows of the current column of the df, which
    # translates to the article urls from the current source
    for url in urls_df[source]:

      # skips empty urls and urls already in the dataset
      if url == '' or url in known_urls:
        continue

      # gets the website content
      r = requests.get(url)
      soup = BeautifulSoup(r.content, 'html.parser')

      # extracts the title of the article
      title = title_finder(soup, source)

      # edit the title for the filename (title case and only the first 6 words)
      title_cut = title_cleaner(title)

      # creates a new file, writes entire article to it, then saves in the txt files folder
      filename = source.replace(' ', '_') + '/' + title_cut + '.txt'

      # create_txt_file returns False when the page held no article text
      if not create_txt_file(soup, filename):
        print('\t' + filename + ' (skipped: no text found)')
        continue

      print('\t' + filename)

      # retrieves the date of publication
      date = all_date_finder(source, soup)

      known_urls.add(url)
      new_urls.append(url)

      # the info of this article, to become one row of the dataset
      new_rows.append({'FILENAME': filename,
                       'TITLE': title,
                       'SOURCE': source,
                       'DATE PUBLISHED': date,
                       'URL': url})

  if len(new_rows) == 0:
    return dataset.copy(), new_urls

  # one concat for all rows instead of rebuilding the dataframe per article
  new_dataset = pd.concat([dataset, pd.DataFrame(new_rows)], ignore_index = True)

  # returns the updated dataset along with the urls added to it
  return new_dataset, new_urls

In [190]:
def update_dataset_reuters(html_file, url, title, date, df):
  """Adds one manually saved Reuters article to the dataset.

  html_file: path of a local file holding the article's html
  url, title, date: metadata stored in the new dataset row
  df: the dataset to extend (needs a 'FILENAME' column); it is not modified

  The article text is (re)written to 'REUTERS/<cleaned title>.txt' through
  create_txt_file_reuters in every case. Returns df itself when that filename
  is already an entry, otherwise a new dataframe with the row appended.
  """

  title_cut = title_cleaner(title)

  # creates a new file, writes entire article to it, then saves in the txt files folder
  filename = 'REUTERS/' + title_cut + '.txt'

  print(filename + '\n')

  # read() returns the html exactly as saved; joining readlines() with '\n'
  # would double every line break
  with open(html_file, encoding='utf8') as temp_file:
    soup = BeautifulSoup(temp_file.read(), "html.parser")

  create_txt_file_reuters(soup, filename, True)

  # if this is already an entry
  if filename in df['FILENAME'].tolist():
    return df

  # adds a row to the science explainer dataset with the info of this article
  new_row = pd.DataFrame({'FILENAME': filename,
                          'TITLE': title,
                          'SOURCE': 'REUTERS',
                          'DATE PUBLISHED': date,
                          'URL': url},
                         index = [0])

  new_dataset = pd.concat([df, new_row], ignore_index = True)

  # returns the updated dataset
  return new_dataset

## Date finder functions

In [13]:
def basic_date_finder(soup):
  """Reads the publication date from the first <time> element of the page.

  Returns the part of the element's datetime attribute that precedes the
  first 'T'. Raises AttributeError when no such attribute is found.
  """
  time_markup = str(soup.select_one('time'))
  found = re.search('(?<= datetime=")(.*?)(=?T)', time_markup)
  return found.group(1)

In [14]:
def atlantic_date_finder(soup):
  """Date finder for the 'atlantic' source name; delegates to basic_date_finder.

  all_date_finder resolves finders through the '<source>_date_finder' naming pattern.
  """
  return basic_date_finder(soup)

In [15]:
def cbc_date_finder(soup):
  """Date finder for the 'cbc' source name; delegates to basic_date_finder.

  all_date_finder resolves finders through the '<source>_date_finder' naming pattern.
  """
  return basic_date_finder(soup)

In [16]:
def cnn_date_finder(soup):
  """Finds the publication date of a CNN article.

  The date is read from the yyyy/mm/dd segment that follows 'cnn.com/' in the
  page's canonical link; pages without one fall back to the quoted value of
  the "publish_date: '...'" field in the page source, returned as found.
  Slashes are replaced with dashes. Raises AttributeError when neither
  lookup finds a date.
  """
  canonical_link = soup.find('link', {'rel': 'canonical'})
  date = re.search(r'(?<=cnn\.com/)(\d{4}/\d{2}/\d{2})', str(canonical_link))

  # edge case: no date in the canonical url
  if date is None:
    date = re.search(r"(?<=publish_date: ')(.*?)=?'", str(soup))

  return date.group(1).replace('/', '-')

In [17]:
def globeandmail_date_finder(soup):
  """Date finder for the 'globeandmail' source name; delegates to basic_date_finder.

  all_date_finder resolves finders through the '<source>_date_finder' naming pattern.
  """
  return basic_date_finder(soup)

In [18]:
def massivesci_date_finder(soup):
  """Date finder for the 'massivesci' source name; delegates to basic_date_finder.

  all_date_finder resolves finders through the '<source>_date_finder' naming pattern.
  """
  return basic_date_finder(soup)

In [19]:
def nationalgeographic_date_finder(soup):
  """Finds the publication date in the source of a National Geographic page.

  Looks for the "pbDt" field first and for "datePublished" when that is
  absent; returns the value up to its first 'T'. Raises AttributeError when
  neither field is present.
  """
  page_source = str(soup)

  # a match object is always truthy, so `or` only reaches the edge-case
  # pattern when the first search found nothing
  found = (re.search('(?<="pbDt":")(.*?)(=?T)', page_source)
           or re.search('(?<=datePublished":")(.*?)(=?T)', page_source))

  return found.group(1)

In [20]:
def nationalobserver_date_finder(soup):
  """Finds the publication date in the source of a National Observer page.

  Returns the value of the "datePublished" field up to its first 'T'.
  Raises AttributeError when the field is absent.
  """
  found = re.search('(?<="datePublished": ")(.*?)(=?T)', str(soup))
  return found.group(1)

In [21]:
def npr_date_finder(soup):
  """Finds the publication date of an NPR article.

  The date is read from the first yyyy/mm/dd sequence in the page's canonical
  link; pages without one fall back to the "datePublished" field in the page
  source (value up to its first 'T'). Slashes are replaced with dashes.
  Raises AttributeError when neither lookup finds a date.
  """
  canonical_link = soup.find('link', {'rel': 'canonical'})
  date = re.search(r'(\d{4}/\d{2}/\d{2})', str(canonical_link))

  # edge case: no date in the canonical url
  if date is None:
    date = re.search(r'(?<=datePublished":")(.*?)=?T', str(soup))

  return date.group(1).replace('/', '-')

In [22]:
def reuters_date_finder(soup):
  """Finds the publication date of a Reuters article.

  Returns the first yyyy-mm-dd sequence in the page's canonical link.
  Raises AttributeError when the link holds no such date.
  """
  canonical_link = soup.find('link', {'rel': 'canonical'})
  return re.search(r'(\d{4}-\d{2}-\d{2})', str(canonical_link)).group(1)

In [23]:
def all_date_finder(source_name, soup):
  """Calls the date finder that belongs to a source and returns its result.

  source_name: name of the source, e.g. 'NATIONAL GEOGRAPHIC'; it is
               lower-cased and stripped of spaces to form the function name
               '<source>_date_finder'. Underscore-style names such as
               'NATIONAL_GEOGRAPHIC' are accepted as well.
  soup: the parsed article page, handed to the finder unchanged

  Raises NameError when no matching date finder is defined.
  """
  compact_name = source_name.lower().replace(' ', '')

  # the exact historical name is tried first, then the same name without
  # underscores; the lookup is a plain name lookup, nothing is evaluated
  for candidate in (compact_name, compact_name.replace('_', '')):
    finder = globals().get(candidate + '_date_finder')

    if callable(finder):
      # calls the specific source's date finder function
      return finder(soup)

  raise NameError("no date finder named '" + compact_name + "_date_finder' is defined")

In [24]:
def date_finder_tester(source_name, urls_df):
  """Prints the date found for every article url of one source.

  source_name: column of urls_df to test; also selects the date finder via
               the '<source>_date_finder' naming pattern
  urls_df: dataframe (or mapping) of urls per source; '' entries are skipped
  """
  # the urls of the requested source
  source_urls = urls_df[source_name]

  # name of the date finder function that belongs to this source
  finder_name = source_name.lower().replace(' ', '') + '_date_finder'

  for url in source_urls:

    # only non-empty urls are fetched
    if url != '':
      page = requests.get(url)
      parsed_page = BeautifulSoup(page.content, 'html.parser')

      # looks the finder up by name and prints the date it finds
      print(eval(finder_name)(parsed_page))

# Testing

In [25]:
### the dataset before adding new data
# alternative (disabled): read the copy inside the cloned repo
#science_explainers_df = pd.read_csv('Science_Explainers/dataset/science_explainers_dataset.csv')

# reads the dataset csv from the current working directory
science_explainers_df = pd.read_csv('science_explainers_dataset.csv')

In [None]:
# displays the dataset as loaded, before any new articles are added
science_explainers_df

In [32]:
### making a copy of urls_df without the REUTERS column (due to paywall issues)

# dropped by name rather than by position, so the result stays correct even
# when REUTERS is not the last column of the csv
urls_no_reuters_df = urls_df.drop(columns = ['REUTERS'])

In [34]:
### updating the dataset
# scrapes every url that is not yet in the dataset (REUTERS excluded) and keeps
# both the extended dataset and the list of urls that were added
science_explainers_new_df, new_urls = update_dataset(urls_no_reuters_df, science_explainers_df)

ATLANTIC
	ATLANTIC/APersistentLightningMysteryHasFinally.txt
	ATLANTIC/AnchorageFellInLoveWithA.txt
	ATLANTIC/AnchoviesAreAForceToBe.txt
	ATLANTIC/TheUnthinkableNewRealityAboutBedbugs.txt
	ATLANTIC/TheBirdThatTookAHuman.txt
	ATLANTIC/CaliforniasClimateHasComeUnmoored.txt
	ATLANTIC/OlderAmericansAreAboutToLose.txt
	ATLANTIC/HurricanesAreTooFastForCategory.txt
	ATLANTIC/FluShotsNeedToStopFighting.txt
	ATLANTIC/WhereNotToLookForAlien.txt
	ATLANTIC/NorthernAlaskaIsRunningOutOf.txt
	ATLANTIC/NatureDoesntCareWhereASpecies.txt
	ATLANTIC/DeerAreBetatestingANightmareDisease.txt
	ATLANTIC/AChickenFromHellCouldRewrite.txt
	ATLANTIC/HowOneTinyInsectUpendedAn.txt
	ATLANTIC/IfExerciseCouldCureThisI.txt
	ATLANTIC/PrepareForAGraySwanClimate.txt
	ATLANTIC/ThePowerhouseOfTheCellIs.txt
	ATLANTIC/HowMuchLessToWorryAbout.txt
	ATLANTIC/MilitaryEmissionsAreTooBigTo.txt
	ATLANTIC/PetsReallyCanBeLikeHuman.txt
	ATLANTIC/ASurprisingSuccessStoryForHumpback.txt
	ATLANTIC/AmericaIsHavingASeniorMoment.txt
	ATLANTIC/

In [None]:
# disabled experiment, kept as a string literal so it does not run:
# update_dataset applied to the REUTERS column only
'''
TESTING
temp = pd.DataFrame(urls_df['REUTERS'])
science_explainers_new_df, new_urls = update_dataset(temp, science_explainers_df)
'''

In [35]:
### the urls of the articles that were added to the dataset
new_urls

['https://www.theatlantic.com/science/archive/2024/03/lightning-map-america-florida/677622/',
 'https://www.theatlantic.com/science/archive/2024/02/white-raven-alaska-photos/677156/',
 'https://www.theatlantic.com/science/archive/2024/02/anchovy-mating-ocean-turbulence-storm/677413/',
 'https://www.theatlantic.com/science/archive/2024/02/bedbugs-pesticide-resistant-tropical/677415/',
 'https://www.theatlantic.com/science/archive/2024/02/crane-walnut-smithsonian-zoo-conservation/677416/',
 'https://www.theatlantic.com/science/archive/2024/02/los-angeles-rain-climate-change/677377/',
 'https://www.theatlantic.com/health/archive/2024/02/ozempic-weight-loss-older-americans-boomers/677371/',
 'https://www.theatlantic.com/science/archive/2024/02/category-6-hurricanes-saffir-simpson-scale/677354/',
 'https://www.theatlantic.com/health/archive/2024/02/flu-vaccine-pandemic-change-yamagata-trivalent/677350/',
 'https://www.theatlantic.com/science/archive/2024/02/search-for-alien-life-planet-form

# Adding Reuters articles manually

In [None]:
url = 'https://www.reuters.com/technology/space/saturns-death-star-moon-has-hidden-secret-subsurface-ocean-2024-02-07/'

# Reuters urls end in '<title-slug>-<yyyy-mm-dd>/': group 1 is everything before
# the first date (path plus slug), group 2 is the date itself
date_match = re.search(r'(.*?)(\d{4}-\d{2}-\d{2})', url)

# last path component -> words separated by spaces -> title case
title = ' '.join(date_match.group(1).split('/')[-1].split('-')).strip(' ').title()
date = date_match.group(2)

In [None]:
# extends science_explainers_new_df, the dataframe the download cell writes to
# csv; storing the result in science_explainers_df would leave the manually
# added Reuters article out of the saved dataset
science_explainers_new_df = update_dataset_reuters('temp.txt', url, title, date, science_explainers_new_df)

# Downloading files

In [37]:
### downloading science explainers dataset

# writes the updated dataset (science_explainers_new_df) over the csv in the
# working directory, then sends that file to the browser via google.colab
science_explainers_new_df.to_csv('science_explainers_dataset.csv', index = False)
files.download('science_explainers_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [38]:
### downloading the csv file of article urls
urls_df.to_csv('article_urls.csv', index = False)
files.download('article_urls.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [41]:
### downloading txt files

# re-zips the whole science_txt_files folder (recursively) and downloads the archive
!zip -r science_txt_files.zip science_txt_files
files.download("science_txt_files.zip")

  adding: science_txt_files/ (stored 0%)
  adding: science_txt_files/ATLANTIC/ (stored 0%)
  adding: science_txt_files/ATLANTIC/.ipynb_checkpoints/ (stored 0%)
  adding: science_txt_files/ATLANTIC/ASimpleSolutionForKeepingMicroplastics.txt (deflated 55%)
  adding: science_txt_files/ATLANTIC/TheGreatUnderappreciatedDriverOfClimate.txt (deflated 53%)
  adding: science_txt_files/ATLANTIC/TheThresholdAtWhichSnowStarts.txt (deflated 54%)
  adding: science_txt_files/ATLANTIC/AnchorageFellInLoveWithA.txt (deflated 52%)
  adding: science_txt_files/ATLANTIC/OneOfEvolutionsBiggestMomentsWas.txt (deflated 56%)
  adding: science_txt_files/ATLANTIC/CaliforniasClimateHasComeUnmoored.txt (deflated 50%)
  adding: science_txt_files/ATLANTIC/OneMoreReasonToHateCockroaches.txt (deflated 54%)
  adding: science_txt_files/ATLANTIC/WhereNotToLookForAlien.txt (deflated 57%)
  adding: science_txt_files/ATLANTIC/TattoosDoOddThingsToThe.txt (deflated 55%)
  adding: science_txt_files/ATLANTIC/TheQuestToBuildABett

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>