<a href="https://colab.research.google.com/github/faithrts/Science_Explainers/blob/main/dataset/science_explainer_dataset_update.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
### importing libraries

# basic libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# libraries for web scraping
from bs4 import BeautifulSoup
import requests
import re
import codecs

# to download files
from google.colab import files

# Importing article URLs

In [2]:
### cloning the git repo that holds the article URL csv, the dataset csv and
### the zipped txt files (the csv files are read into dataframes in later cells)

!git clone https://github.com/faithrts/Science_Explainers

Cloning into 'Science_Explainers'...
remote: Enumerating objects: 1903, done.[K
remote: Counting objects: 100% (112/112), done.[K
remote: Compressing objects: 100% (82/82), done.[K
remote: Total 1903 (delta 66), reused 52 (delta 25), pack-reused 1791[K
Receiving objects: 100% (1903/1903), 97.79 MiB | 7.13 MiB/s, done.
Resolving deltas: 100% (854/854), done.
Updating files: 100% (20/20), done.


In [3]:
### unzips the repo's archive of article txt files; the archive extracts into
### ./science_txt_files/<SOURCE>/, the folder the helper functions write to

!unzip Science_Explainers/dataset/science_txt_files.zip

Archive:  Science_Explainers/dataset/science_txt_files.zip
   creating: science_txt_files/
   creating: science_txt_files/CBC/
  inflating: science_txt_files/CBC/NewStudyProposesToUncoverWhere.txt  
  inflating: science_txt_files/CBC/StudentsSayMisinformationAboundsOnlineExperts.txt  
  inflating: science_txt_files/CBC/AlaskapoxDeathBringsAttentionToSmall.txt  
  inflating: science_txt_files/CBC/AlbertasOilsandsPumpOutMorePollutants.txt  
  inflating: science_txt_files/CBC/ScientistsCanDetectAnimalSpeciesBy.txt  
  inflating: science_txt_files/CBC/FungiMayNotZombifyYouBut.txt  
  inflating: science_txt_files/CBC/NewDiscoveriesShowEarlyHumansLived.txt  
  inflating: science_txt_files/CBC/SnowDayNowEclipseDayWhy.txt  
  inflating: science_txt_files/CBC/GulfOfStLawrenceRedfishPopulation.txt  
  inflating: science_txt_files/CBC/EuropesJuiceMissionWillGetUs.txt  
  inflating: science_txt_files/CBC/EasternOntarioFarmerDiscoversNewVariety.txt  
  inflating: science_txt_files/CBC/Environmental

In [4]:
### loading the csv file of article URLs into a dataframe

# (to load a local copy instead: pd.read_csv('article_urls.csv'))
# NaN cells are replaced with '' (empty strings, not 0) so that later code
# can skip them with a plain `url == ''` comparison
urls_df = pd.read_csv('Science_Explainers/dataset/article_urls.csv').fillna('')

# Web scraping helper functions

In [5]:
# default HTTP request headers (a desktop Chrome User-Agent string); get_soup
# falls back to this dict when it is called without an explicit header
header_ex = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

In [6]:
def get_soup(url, header = '', timeout = 30):
  """Download a page and parse it into a BeautifulSoup tree.

  Args:
    url: address of the page to fetch.
    header: dict of HTTP request headers; the default '' means "use the
      notebook-level header_ex dict".
    timeout: seconds to wait for the server before requests gives up.
      requests applies no timeout unless one is passed, so a stalled server
      would otherwise block the notebook indefinitely.

  Returns:
    A BeautifulSoup object built from the response text with 'html.parser'.
  """
  if header == '':
    header = header_ex

  response = requests.get(url, headers = header, timeout = timeout)

  return BeautifulSoup(response.text, 'html.parser')

In [7]:
def create_txt_file(soup, filename):
  """Extract an article's paragraphs from `soup` and write them to a txt file.

  Args:
    soup: parsed article page (BeautifulSoup-like object).
    filename: path relative to 'science_txt_files/', in the form
      '<SOURCE>/<Name>.txt'. Everything before the last '/' selects the
      source-specific paragraph locator.

  Returns:
    False when no paragraphs were located (no file is created), True once
    the file has been written.

  Raises:
    KeyError: if <SOURCE> has no paragraph locator.
    AttributeError: if the container <div> expected for a non-NPR source is
      missing from the page.
  """

  source = re.search(r'(.*)(?=\/)', filename)[0]

  # one paragraph locator per source; plain callables replace the original
  # eval() of code strings, so no text is ever executed as code
  find_all = {'ATLANTIC': lambda page: page.findAll("p"),
              'CBC': lambda page: page.find("div", {"class": "story"}).findAll("p"),
              'CNN': lambda page: page.findAll("p"),
              'GLOBE_AND_MAIL': lambda page: page.findAll("p"),
              'MASSIVE_SCI': lambda page: page.find("div", {"class": "bodytext"}).findAll("p"),
              'NATIONAL_GEOGRAPHIC': lambda page: page.findAll("p"),
              'NATIONAL_OBSERVER': lambda page: page.findAll("p"),
              'NPR': lambda page: page.find("div", {"class": "image_data"}).find_all_next("p", {"class": None}),
              'REUTERS': lambda page: page.find("div", {"class": "article-body__content__17Yit"}).findAll("p")}

  # passages containing any of these (lower-cased) snippets are dropped
  dont_keep = ['read transcribed audio', '\neditor\'s note','sign up for', 'this article was published', 'this story was originally published', 'sponsor']

  # VERY weird edge case: some NPR pages lack the "image_data" div, in which
  # case find() returns None and the locator raises AttributeError; the
  # paragraphs are then located relative to the "credit" span instead
  if source == 'NPR':
    try:
      passages = find_all[source](soup)
    except AttributeError:
      passages = soup.find("span", {"class": "credit"}).find_all_next("p", {"class": None})

  else:
    # looks for the text of the story
    passages = find_all[source](soup)

  # if, for some reason, there's no text at this url
  if len(passages) == 0:
    return False

  lines = []
  first_line = True

  # iterates through each passage in the article by finding <p> tags
  for passage in passages:

    # extracts the text
    text = passage.get_text()

    # fixing spacing
    text = text.replace(u'\xa0', u' ')
    text = text.replace(u'  ', u' ')

    if any(substring in text.lower() for substring in dont_keep):
      continue

    # strips the "<dateline> (Reuters) - " prefix from the first kept passage
    if source == 'REUTERS' and first_line:
      text = re.sub(r'.*(?<=\(Reuters\) - )', '', text)
      first_line = False

    # each passage ends with a newline
    lines.append(text + '\n')

  # the file is only opened once all text is ready and is closed even if the
  # write fails; utf-8 matches the encoding add_text_column reads with
  with open('science_txt_files/' + filename, 'w', encoding = 'utf-8') as cur_file:
    cur_file.writelines(lines)

  return True

In [8]:
def create_txt_file_reuters(soup, filename, verbose = False):
  """Write the text of a saved Reuters page to 'science_txt_files/<filename>'.

  The text of every non-empty <div> carrying a data-testid attribute is
  collected, one per line. The first such block has everything up to and
  including "(Reuters) - " removed when that marker is present; later blocks
  containing advertisement or licensing boilerplate are skipped.

  Args:
    soup: parsed Reuters page (BeautifulSoup-like object).
    filename: output path relative to 'science_txt_files/'.
    verbose: when True, the collected text is also printed.
  """
  all_text = ''

  first = True

  # True matches every <div> that has a data-testid attribute, whatever its
  # value; the original spelled this as `not None`, which evaluates to True
  for result in soup.findAll('div', {'data-testid': True}):

    cur_text = result.text

    if cur_text == '':
      continue

    if first:
      remove_reuters = re.search(r'(?<=\(Reuters\) - ).*', cur_text)

      if remove_reuters is not None:
        cur_text = remove_reuters[0]

      first = False

      all_text += cur_text

    elif 'Advertisement · Scroll' not in cur_text and 'Purchase Licensing Rights' not in cur_text:
      all_text += '\n' + cur_text

  if verbose:
    print(all_text)

  # opened only after the text is complete, closed by the with-block, and
  # written as utf-8 (the encoding add_text_column reads with)
  with open('science_txt_files/' + filename, 'w', encoding = 'utf-8') as cur_file:
    cur_file.write(all_text)

In [9]:
def source_finder(url):
  """Return the site name embedded in an https URL.

  'https://www.cbc.ca/...' gives 'cbc'. URLs without the 'www.' prefix are
  handled by a second pattern.

  Args:
    url: the article URL as a string.

  Returns:
    The text between 'https://' (plus an optional 'www.') and the next '.',
    or '' when neither pattern matches.
  """
  # testing default source finder: https://www.<source>.
  match = re.search(r'(?<=https://www\.)(.*?)(?=\.)', url)

  # re.search returns None (not an empty match) for URLs without "www.", so
  # the fallback has to test for None before any group is read
  if match is None or len(match.group(1)) == 0:
    match = re.search(r'(?<=https://)(.*?)(?=\.)', url)

  if match is None:
    return ''

  return match.group(1)

In [10]:
def title_cleaner(title):
  """Build a compact CamelCase name from an article title.

  Every character that is neither a word character nor whitespace is removed,
  the result is title-cased, and the first six words are joined without
  separators.
  """
  without_punctuation = re.sub(r'[^\w\s]', '', title)
  words = without_punctuation.title().split()
  leading_words = words[:6]
  return ''.join(leading_words)

In [11]:
def title_finder(soup, source):
  """Return the article headline found in `soup`.

  Lookup order:
    1. the first <h1> (or the first <title> when the page has no <h1>);
    2. for CNN only, the <h1> with class 'headline__text inline-placeholder';
    3. the first <title>.
  The first non-empty result wins. Newlines are removed from every result.

  Args:
    soup: parsed article page (BeautifulSoup-like object).
    source: name of the article's source; compared case-insensitively.

  Returns:
    The headline text, or '' when nothing was found. The original raised
    IndexError for pages without <h1>/<title> and returned None for an
    empty headline.
  """
  def first_text(tags):
    # text of the first tag with newlines removed, or '' when there is no tag
    if len(tags) == 0:
      return ''
    return tags[0].get_text().replace('\n', '')

  # testing default title finder
  headings = soup.findAll('h1')
  if len(headings) == 0:
    headings = soup.findAll('title')

  title = first_text(headings)
  if len(title) != 0:
    return title

  # another title finder format for CNN articles, reached only when the
  # default lookup produced an empty string. lower() is needed because
  # update_dataset passes the upper-case column label ('CNN'), which the
  # original `source == 'cnn'` test never matched.
  if source.lower() == 'cnn':
    cnn_headings = soup.findAll('h1', {'class': 'headline__text inline-placeholder'})
    if len(cnn_headings) != 0:
      title = cnn_headings[0].get_text()
      title = title.replace('  ', '')
      title = title.replace('\n', '')
      if len(title) != 0:
        return title

  # last resort: the page <title> (covers an <h1> that exists but is empty)
  return first_text(soup.findAll('title'))

In [12]:
def add_text_column(df, txt_dir = 'txt_files/'):
  """Add a 'TEXT' column holding the contents of each row's txt file.

  Args:
    df: DataFrame with a 'FILENAME' column of paths relative to `txt_dir`.
      It is modified in place.
    txt_dir: folder the files are read from.
      NOTE(review): the default is the original hard-coded 'txt_files/',
      but every other cell in this notebook works in 'science_txt_files/';
      pass txt_dir = 'science_txt_files/' here unless a separate
      'txt_files/' folder really exists.

  Returns:
    The same DataFrame object, now with the 'TEXT' column.
  """
  # local import: os is only imported further down, in the Maintenance section
  import os

  texts = []

  for filename in df['FILENAME']:
    # the with-block closes each file; the original left every handle open
    with codecs.open(os.path.join(txt_dir, filename), "r", encoding='utf8') as txt_file:
      texts.append(txt_file.read())

  # adding a column for the text contents
  df['TEXT'] = texts

  return df

In [13]:
def update_dataset(urls_df, dataset):
  """Scrape every URL in `urls_df` that `dataset` lacks and append it.

  Args:
    urls_df: DataFrame with one column per source; each cell is an article
      URL or '' for "no URL".
    dataset: DataFrame with (at least) the columns FILENAME, TITLE, SOURCE,
      DATE PUBLISHED and URL. It is not modified.

  Returns:
    (new_dataset, new_urls): a copy of `dataset` extended by one row per
    newly scraped article, and the list of URLs those rows came from.

  Changes from the original:
    * the column label is used unchanged for the lookup, so labels that
      contain spaces no longer raise KeyError;
    * articles for which create_txt_file wrote nothing are skipped instead
      of getting a dataset row that points at a missing file;
    * a URL listed twice in `urls_df` is scraped once;
    * rows are collected and concatenated once rather than once per article;
    * the request carries a timeout so a stalled server cannot hang the run.
  """

  # the urls currently in the dataset (a set gives constant-time lookups)
  known_urls = set(dataset['URL'].tolist())

  # one dict per newly scraped article
  new_rows = []

  # the urls that are added to the new dataset
  new_urls = []

  # iterating through each column of the df, which translates to each source
  # of science explainers
  for column in urls_df.columns:

    # the source name is used for filenames and date-finder dispatch
    source = column.replace(' ', '_')

    print(source)

    # iterating through the rows of the current column of the df, which
    # translates to the article urls from the current source
    for url in urls_df[column]:

      # skips empty urls and urls already in the dataset
      if url == '' or url in known_urls:
        continue

      known_urls.add(url)

      # gets the website content
      r = requests.get(url, timeout = 30)
      soup = BeautifulSoup(r.content, 'html.parser')

      # extracts the title of the article
      title = title_finder(soup, source)

      # edit the title for the filename (title case and only the first 6 words)
      title_cut = title_cleaner(title)

      # creates a new file, writes entire article to it, then saves in the
      # science_txt_files folder
      filename = source + '/' + title_cut + '.txt'

      if not create_txt_file(soup, filename):
        print('\t' + filename + ' (no article text found, skipped)')
        continue

      print('\t' + filename)

      # retrieves the date of publication
      date = all_date_finder(source, soup)

      # records a row for the science explainer dataset with this article's info
      new_urls.append(url)
      new_rows.append({'FILENAME': filename,
                       'TITLE': title,
                       'SOURCE': source,
                       'DATE PUBLISHED': date,
                       'URL': url})

  # nothing new: hand back an untouched copy, index included
  if len(new_rows) == 0:
    return dataset.copy(), new_urls

  new_dataset = pd.concat([dataset, pd.DataFrame(new_rows)], ignore_index = True)

  # returns the updated dataset along with the urls added to it
  return new_dataset, new_urls

In [14]:
def update_dataset_reuters(html_file, url, title, date, df):
  """Add one manually saved Reuters article to the dataset.

  The saved page is parsed, its text is written to
  'science_txt_files/REUTERS/<CleanedTitle>.txt' (also when the entry already
  exists, so the file is refreshed), and a row is appended unless a row with
  the same FILENAME is already present.

  Args:
    html_file: path of a local file containing the article's html.
    url: the article's URL, stored in the URL column.
    title: the article's title, stored as-is and used to build the filename.
    date: publication date, stored in the DATE PUBLISHED column.
    df: the current dataset DataFrame; it is not modified.

  Returns:
    `df` itself when the entry already exists, otherwise a new DataFrame
    with the extra row appended.
  """

  title_cut = title_cleaner(title)

  # the txt file is saved in the REUTERS folder under the cleaned title
  filename = 'REUTERS/' + title_cut + '.txt'

  print('\n' + filename + '\n')

  # the markup is read verbatim; the original joined readlines() with '\n',
  # which doubled every line break before parsing
  with open(html_file, encoding = 'utf-8') as temp_file:
    soup = BeautifulSoup(temp_file.read(), "html.parser")

  create_txt_file_reuters(soup, filename, True)

  # if this is already an entry
  if filename in df['FILENAME'].tolist():
    return df

  # adds a row to the science explainer dataset with the info of this article
  new_row = pd.DataFrame({'FILENAME': filename,
                          'TITLE': title,
                          'SOURCE': 'REUTERS',
                          'DATE PUBLISHED': date,
                          'URL': url},
                         index = [0])

  # returns the updated dataset
  return pd.concat([df, new_row], ignore_index = True)

## Date finder functions

In [15]:
def basic_date_finder(soup):
  """Return the date part of the first <time> tag's datetime attribute.

  For datetime="2024-03-30T10:00:00Z" the result is '2024-03-30'. The capture
  stops at the first 'T' or at the closing quote, so a date-only attribute
  (datetime="2024-03-30") is returned as-is; the original kept matching past
  the quote until some later 'T' in the tag's markup.

  Args:
    soup: parsed page supporting select_one() (BeautifulSoup-like object).

  Returns:
    The date string taken from the attribute.

  Raises:
    ValueError: if the page has no <time> tag with a datetime attribute.
  """
  date_bunch = soup.select_one('time')

  # the tag is searched in its string form, e.g. <time datetime="...">
  match = re.search(r'(?<= datetime=")([^"T]*)', str(date_bunch))

  if match is None:
    raise ValueError('no <time> tag with a datetime attribute was found')

  return match.group(1)

In [16]:
def atlantic_date_finder(soup):
  """Date finder for the ATLANTIC source: delegates to basic_date_finder.

  Exists under this name so that all_date_finder can resolve it from the
  source name ('<source>_date_finder').
  """
  return basic_date_finder(soup)

In [17]:
def cbc_date_finder(soup):
  """Date finder for the CBC source: delegates to basic_date_finder.

  Exists under this name so that all_date_finder can resolve it from the
  source name ('<source>_date_finder').
  """
  return basic_date_finder(soup)

In [18]:
def cnn_date_finder(soup):
  """Return a CNN article's publication date with '-' separators.

  The date is taken from the yyyy/mm/dd path segment that follows 'cnn.com/'
  in the page's canonical <link>. When that is absent, the value of the
  quoted publish_date: '...' entry found anywhere in the page is used.
  """
  canonical_link = str(soup.find('link', {'rel': 'canonical'}))
  match = re.search(r'(?<=cnn\.com\/)(\d\d\d\d\/\d\d\/\d\d)(=?)', canonical_link)

  # edge case: no dated canonical link, fall back to the page source
  if match is None:
    match = re.search(r"(?<=publish_date: \')(.*?)(=?\')", str(soup))

  return match.group(1).replace('/', '-')

In [19]:
def globe_and_mail_date_finder(soup):
  """Date finder for the GLOBE_AND_MAIL source: delegates to basic_date_finder.

  Exists under this name so that all_date_finder can resolve it from the
  source name ('<source>_date_finder').
  """
  return basic_date_finder(soup)

In [20]:
def massive_sci_date_finder(soup):
  """Date finder for the MASSIVE_SCI source: delegates to basic_date_finder.

  Exists under this name so that all_date_finder can resolve it from the
  source name ('<source>_date_finder').
  """
  return basic_date_finder(soup)

In [21]:
def national_geographic_date_finder(soup):
  """Return a National Geographic article's publication date.

  The page source is searched for a "pbDt":"..." entry first and for a
  datePublished":"..." entry second; the text up to the first 'T' of the
  first entry found is returned.
  """
  page_source = str(soup)

  # the second pattern is the edge case, tried only when the first one fails
  match = None
  for pattern in ('(?<="pbDt":")(.*?)(=?T)', '(?<=datePublished":")(.*?)(=?T)'):
    match = re.search(pattern, page_source)
    if match is not None:
      break

  return match.group(1)

In [22]:
def national_observer_date_finder(soup):
  """Return a National Observer article's publication date.

  The page source is searched for a "datePublished": "..." entry and the
  text up to its first 'T' is returned.
  """
  match = re.search('(?<="datePublished": ")(.*?)(=?T)', str(soup))
  return match.group(1)

In [23]:
def npr_date_finder(soup):
  """Return an NPR article's publication date with '-' separators.

  The date is the first yyyy/mm/dd sequence in the page's canonical <link>.
  When there is none, the text up to the first 'T' of a datePublished":"..."
  entry in the page source is used.
  """
  canonical_link = str(soup.find('link', {'rel': 'canonical'}))
  match = re.search(r'(\d\d\d\d\/\d\d\/\d\d)', canonical_link)

  # edge case: no dated canonical link, fall back to the page source
  if match is None:
    match = re.search('(?<=datePublished":")(.*?)(=?T)', str(soup))

  return match.group(1).replace('/', '-')

In [24]:
def reuters_date_finder(soup):
  """Return the first yyyy-mm-dd sequence in the page's canonical <link>."""
  canonical_link = str(soup.find('link', {'rel': 'canonical'}))
  match = re.search(r'(\d\d\d\d-\d\d-\d\d)', canonical_link)
  return match.group(1)

In [25]:
def all_date_finder(source_name, soup):
  """Call the date finder that belongs to `source_name`.

  The finder is the module-level function named
  '<source_name lower-cased, spaces as underscores>_date_finder',
  e.g. 'CBC' -> cbc_date_finder.

  Args:
    source_name: name of the source, e.g. 'CBC' or 'GLOBE_AND_MAIL'.
    soup: parsed article page handed to the finder.

  Returns:
    Whatever the source's finder returns (the publication date).

  Raises:
    NameError: if no such finder function is defined.
  """
  # the name of the specific source's date finder function
  finder_func = source_name.lower().replace(' ', '_') + '_date_finder'

  # looked up by name instead of eval(): source names come from csv column
  # labels, and eval() would execute any expression placed in one
  finder = globals().get(finder_func)

  if not callable(finder):
    raise NameError(f"no date finder named '{finder_func}' is defined")

  # calls the specific source's date finder function
  return finder(soup)

# Testing

In [26]:
### the dataset before adding new data
### (read from the csv inside the cloned Science_Explainers repo)

science_explainers_df = pd.read_csv('Science_Explainers/dataset/science_explainers_dataset.csv')

In [27]:
### making a copy of urls_df without the REUTERS column (due to paywall issues)

# dropped by label rather than by position, so the right column is removed
# even if REUTERS is not the last column of the csv
urls_no_reuters_df = urls_df.drop(columns = ['REUTERS'])

In [28]:
### updating the dataset
# scrapes every URL that is not in the dataset yet; science_explainers_df is
# rebound to the extended frame and new_urls lists the URLs that were added
science_explainers_df, new_urls = update_dataset(urls_no_reuters_df, science_explainers_df)

ATLANTIC
CBC
	CBC/AllEyesAreOnTheFuture.txt
	CBC/ScientistsTryToUnravelTheCase.txt
	CBC/CanRetrainingTheBrainHelpSilence.txt
	CBC/WhatAreTheChancesOfClear.txt
	CBC/LotsOfNewAnimalsAreHeading.txt
	CBC/WhosToBlameForContaminatedShellfish.txt
CNN
	CNN/ItsADolphinResearchersUncoverUnusual.txt
	CNN/MysteryOfCommonMushroomGrowingFrom.txt
GLOBE_AND_MAIL
MASSIVE_SCI
NATIONAL_GEOGRAPHIC
NATIONAL_OBSERVER
	NATIONAL_OBSERVER/NightfallOnceGaveWildlandFireCrews.txt
	NATIONAL_OBSERVER/DetergentPodsAreJustTheStart.txt
	NATIONAL_OBSERVER/EmissionsFromCarsAndPowerPlants.txt
NPR
	NPR/TheColoradoRiverRarelyReachesThe.txt


In [98]:
### the added files
new_urls

['https://www.cbc.ca/radio/quirks/future-freshwater-1.7152292',
 'https://www.cbc.ca/radio/asithappens/brain-preservation-study-1.7151672',
 'https://www.cbc.ca/radio/thecurrent/tinnitus-treatment-research-1.7100837',
 'https://www.cbc.ca/news/science/total-solar-eclipse-weather-1.7152346',
 'https://www.cbc.ca/news/science/urban-wildlife-climate-1.7156253',
 'https://www.cbc.ca/news/canada/british-columbia/contaminated-shellfish-fecal-testing-1.7155464',
 'https://www.cnn.com/2024/03/30/world/amazon-dolphin-science-newsletter-wt-scn/index.html',
 'https://www.cnn.com/2024/03/02/world/frog-mushroom-science-newsletter-wt-scn/index.html',
 'https://www.nationalobserver.com/2024/03/13/news/nightfall-once-gave-wildland-fire-crews-break-thats-over-now',
 'https://www.nationalobserver.com/2024/03/15/news/detergent-pods-are-just-start-clothing-microplastic-pollution-problem',
 'https://www.nationalobserver.com/2024/03/14/news/emissions-can-hinder-insects-search-plants-pollinate',
 'https://ww

# Adding Reuters articles manually

In [None]:
### unzips extra_content.zip: txt files holding the raw html of Reuters
### articles (read below as extra_content/reuters_<i>.txt)
# NOTE(review): nothing in this notebook creates extra_content.zip --
# presumably it has to be uploaded to the working directory first; confirm

!unzip extra_content.zip

In [None]:
### iterating through each additional reuters URL

last_30 = urls_df['REUTERS'].tolist()[-30:]

# enumerate numbers the saved pages from 1, matching reuters_1.txt, reuters_2.txt, ...
# and, unlike a fixed range(1, 31), does not raise IndexError when the column
# holds fewer than 30 URLs
for i, cur_url in enumerate(last_30, start = 1):

  # the title is rebuilt from the URL slug: the last path segment before the
  # trailing yyyy-mm-dd date, with '-' turned into spaces and title-cased
  title = ' '.join(re.search(r'(.*?)(?=\d\d\d\d-\d\d-\d\d)', cur_url)[0].split('/')[-1].split('-')).strip(' ').title()
  date = re.search(r'\d\d\d\d-\d\d-\d\d', cur_url)[0]

  # the saved html of article i
  txt_title = f'extra_content/reuters_{i}.txt'

  science_explainers_df = update_dataset_reuters(txt_title, cur_url, title, date, science_explainers_df)

In [None]:
science_explainers_df

# Maintenance

In [94]:
import os

def check_files(df, root = 'science_txt_files', delete = True):
  """Reconcile the txt files on disk with the dataset's FILENAME column.

  Files found under `root`/<sub_folder>/ that the dataset does not list are
  printed and, when `delete` is True, removed from disk. Dataset entries
  whose file is missing on disk are printed afterwards.

  Args:
    df: dataset DataFrame with a 'FILENAME' column of '<SOURCE>/<Name>.txt'.
    root: folder that contains one sub-folder per source.
    delete: when False, unlisted files are only reported, not removed.

  Returns:
    None; the findings are printed.
  """

  dataset_filenames = df['FILENAME'].tolist()
  known_filenames = set(dataset_filenames)
  not_in_dataset = []
  not_in_dir = []

  # iterates through files in the directory
  for sub_folder in os.listdir(root):
    folder_path = os.path.join(root, sub_folder)

    # a stray file directly under root would make os.listdir raise
    if not os.path.isdir(folder_path):
      continue

    # iterates through the files in each sub_folder
    for filename in os.listdir(folder_path):
      if filename == '.ipynb_checkpoints':
        continue

      if f'{sub_folder}/{filename}' not in known_filenames:
        not_in_dataset.append(f'{sub_folder}/{filename}')

  # deletes files from the directory if they aren't in the dataframe
  if delete:
    print('Deleting from directory')
  else:
    print('Not in the dataset (kept on disk):')
  for filename in not_in_dataset:
    print('\t' + filename)
    if delete:
      os.remove(f'{root}/{filename}')

  # iterates through files in the dataset DataFrame
  for filename in dataset_filenames:
    # if the file is not in the directory
    if not os.path.exists(f'{root}/{filename}'):
      not_in_dir.append(filename)

  print('Need to add the following files to the directory:')
  for filename in not_in_dir:
    print('\t' + filename)

In [84]:
def count_txt_files_in_dir():
  """Print the number of entries in each source folder of science_txt_files.

  One number is printed per sub-folder, in the order os.listdir returns them.
  """
  root = 'science_txt_files'

  # iterates through the source folders in the main folder
  for source_folder in os.listdir(root):
    entries = os.listdir(f'{root}/{source_folder}')
    print(len(entries))

In [69]:
# article URLs marked for removal
# NOTE(review): no visible cell reads this list -- presumably the matching
# dataset rows / csv entries are removed by hand or in another notebook; confirm
urls_to_delete = [
    'https://www.cbc.ca/news/canada/edmonton/parks-canada-jasper-first-nations-harvest-1.6999777',
    'https://www.cbc.ca/news/canada/hamilton/islamophobia-canada-health-care-muslim-1.6792148',
    'https://www.cbc.ca/news/canada/kitchener-waterloo/region-of-waterloo-support-staff-1.6854310',
    'https://www.cbc.ca/radio/zebra-saskatoon-zoo-rescue-1.7006670',
    'https://www.cbc.ca/news/canada/saskatchewan/creeping-bellflower-how-to-remove-1.6851398',
    'https://www.cbc.ca/news/politics/zaporizhzhia-nuclear-russia-ukraine-iaea-1.6841423',
    'https://www.cnn.com/2023/06/01/business/nhtsa-emergency-braking/index.html',
    'https://www.cnn.com/2023/10/26/health/british-health-agency-warns-of-fake-ozempic-pens-linked-to-hospitalizations/index.html',
    'https://www.nationalobserver.com/2023/02/24/analysis/gasoline-versus-electric-cars-life-cycle-emissions-compared-canada',
    'https://www.nationalobserver.com/2023/05/03/news/e-scooters-silent-menace-or-green-godsend',
    'https://www.nationalobserver.com/2023/05/04/news/anishinaabe-artist-reinvents-lounge-chair-place-gathering',
    'https://www.npr.org/2023/04/19/1170806176/abortion-pill-mifepristone-supreme-court-fda-generic-genbiopro'
]

# Downloading files

In [99]:
### re-sorts df to group entries together based on source
### (by FILENAME within each source) and renumbers the index from 0

science_explainers_df = science_explainers_df.sort_values(['SOURCE','FILENAME']).reset_index(drop = True)

In [100]:
### downloading science explainers dataset

# written without the index column, then handed to the Colab download helper
science_explainers_df.to_csv('science_explainers_dataset.csv', index = False)
files.download('science_explainers_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# urls_df.to_csv('article_urls.csv', index = False)
# files.download('article_urls.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [101]:
### downloading txt files

# zips the whole science_txt_files folder (recursively) and downloads the archive
!zip -r science_txt_files.zip science_txt_files
files.download("science_txt_files.zip")

  adding: science_txt_files/ (stored 0%)
  adding: science_txt_files/CBC/ (stored 0%)
  adding: science_txt_files/CBC/BatsGrowlLikeDeathMetalSingers.txt (deflated 55%)
  adding: science_txt_files/CBC/MarkingsOnTheLegAndButt.txt (deflated 54%)
  adding: science_txt_files/CBC/StudyRevealsSeriousCancerResearchGaps.txt (deflated 54%)
  adding: science_txt_files/CBC/WhosToBlameForContaminatedShellfish.txt (deflated 52%)
  adding: science_txt_files/CBC/RiseInExtremeWildfiresLinkedDirectly.txt (deflated 55%)
  adding: science_txt_files/CBC/HowDoPolarBearsEatWhen.txt (deflated 53%)
  adding: science_txt_files/CBC/TimeInSpaceIsBadFor.txt (deflated 52%)
  adding: science_txt_files/CBC/NewStudySupportsAlternativesToRadiation.txt (deflated 51%)
  adding: science_txt_files/CBC/GeneeditedPigKidneyKeepsMonkeyAlive.txt (deflated 53%)
  adding: science_txt_files/CBC/FossilsPaintThePictureOfGorillasized.txt (deflated 52%)
  adding: science_txt_files/CBC/ThisBcValleyHasBecomeA.txt (deflated 52%)
  adding

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>