<a href="https://colab.research.google.com/github/faithrts/Science_Explainers/blob/main/dataset/science_explainer_dataset_update.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
### importing libraries

# standard library
import codecs
import os
import re

# basic libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# libraries for web scraping
import requests
from bs4 import BeautifulSoup

# to download files
from google.colab import files

# Importing article URLs

In [2]:
### cloning the Science_Explainers git repo (holds the dataset csv and the zipped txt files)

!git clone https://github.com/faithrts/Science_Explainers

Cloning into 'Science_Explainers'...
remote: Enumerating objects: 1828, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 1828 (delta 18), reused 6 (delta 2), pack-reused 1791[K
Receiving objects: 100% (1828/1828), 93.97 MiB | 15.20 MiB/s, done.
Resolving deltas: 100% (806/806), done.
Updating files: 100% (17/17), done.


In [3]:
### unzips the txt files from the cloned repo into science_txt_files/

!unzip Science_Explainers/dataset/science_txt_files.zip

Archive:  Science_Explainers/dataset/science_txt_files.zip
   creating: science_txt_files/
   creating: science_txt_files/NPR/
  inflating: science_txt_files/NPR/EliLillyReleasesMoreDataFor.txt  
  inflating: science_txt_files/NPR/IsGrayHairReversibleANew.txt  
  inflating: science_txt_files/NPR/ClimateChangeMakesWildfiresInCalifornia.txt  
  inflating: science_txt_files/NPR/GenericAbortionPillManufacturerSuesFda.txt  
  inflating: science_txt_files/NPR/BikeRidingInMiddleSchoolMay.txt  
  inflating: science_txt_files/NPR/CaliforniasDestructivelyWetWinterHasA.txt  
  inflating: science_txt_files/NPR/HammerheadSharksHoldTheirBreathIn.txt  
  inflating: science_txt_files/NPR/SomeOfCanadasWildfiresLikelyMade.txt  
  inflating: science_txt_files/NPR/RenewableEnergyIsHereButHow.txt  
  inflating: science_txt_files/NPR/FloridaLawmakersWantToUseRadioactive.txt  
  inflating: science_txt_files/NPR/QaThisScientistDevelopedASoap.txt  
  inflating: science_txt_files/NPR/AGlacierBabyIsBornMating.tx

In [4]:
### saving csv file of URLs into dataframe (one column per source, one url per row)
# NOTE(review): the path is relative to the working directory, not to the cloned
# Science_Explainers/ folder -- confirm where article_urls.csv is expected to come from

urls_df = pd.read_csv('article_urls.csv')

# replaces all NaN instances with the empty string, which update_dataset
# and date_finder_tester treat as "no url in this cell"
urls_df = urls_df.fillna('')

# Webscraping helper functions

In [5]:
# default HTTP request headers (a desktop Chrome User-Agent string);
# get_soup falls back to these when it is called without a header
header_ex = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

In [121]:
def get_soup(url, header = '', timeout = 30):
  """Downloads a page and parses it with BeautifulSoup's 'html.parser'.

  Args:
    url: address of the page to fetch.
    header: dict of HTTP request headers. The default '' (or None) means
      "use the module-level header_ex".
    timeout: seconds requests waits for the server before raising
      requests.exceptions.Timeout, so one stalled site cannot hang the
      notebook forever.

  Returns:
    The parsed page as a BeautifulSoup object.
  """
  if header is None or header == '':
    header = header_ex

  response = requests.get(url, headers = header, timeout = timeout)

  # the raw bytes are handed to BeautifulSoup (as update_dataset and
  # date_finder_tester already do) so it can detect the page encoding itself;
  # response.text silently falls back to ISO-8859-1 when the server sends no charset
  return BeautifulSoup(response.content, 'html.parser')

In [108]:
def _find_passages(soup, source):
  """Returns the <p> tags holding the article text for source, or None if the expected container is missing."""

  # sources whose story is simply every <p> on the page
  whole_page = ('ATLANTIC', 'CNN', 'GLOBE_AND_MAIL', 'NATIONAL_GEOGRAPHIC', 'NATIONAL_OBSERVER')

  # sources whose story sits inside one container tag: source -> (tag, class)
  containers = {'CBC': ('div', 'story'),
                'MASSIVE_SCI': ('div', 'bodytext'),
                'REUTERS': ('div', 'article-body__content__17Yit')}

  if source in whole_page:
    return soup.find_all("p")

  # VERY weird edge case: NPR paragraphs follow the lead image block, or,
  # when the page has no such block, the image credit
  if source == 'NPR':
    anchor = soup.find("div", {"class": "image_data"})
    if anchor is None:
      anchor = soup.find("span", {"class": "credit"})
    if anchor is None:
      return None
    return anchor.find_all_next("p", {"class": None})

  # an unsupported source raises KeyError here
  tag, css_class = containers[source]
  container = soup.find(tag, {"class": css_class})
  if container is None:
    return None
  return container.find_all("p")


def create_txt_file(soup, filename):
  """Writes the article text found in soup to 'science_txt_files/<filename>'.

  Args:
    soup: parsed article page (BeautifulSoup object).
    filename: '<SOURCE>/<name>.txt'; everything before the last '/' selects
      the source-specific way of locating the story paragraphs.

  Returns:
    True once the file has been written, False (and no file is created)
    when no story paragraphs could be found on the page.

  Raises:
    KeyError: if the source part of filename is not a supported source.
  """
  # everything before the last '/' is the source
  source = filename.rpartition('/')[0]

  # paragraphs containing any of these (lowercase) snippets are boilerplate
  dont_keep = ['read transcribed audio', '\neditor\'s note','sign up for', 'this article was published', 'this story was originally published', 'sponsor']

  # looks for the text of the story
  passages = _find_passages(soup, source)

  # if, for some reason, there's no text at this url
  if not passages:
    return False

  # else, creates a new file (and its source folder, if it isn't there yet)
  path = os.path.join('science_txt_files', filename)
  os.makedirs(os.path.dirname(path), exist_ok = True)

  first_line = True

  # the with-block closes the file even if a passage raises; the encoding is
  # explicit so the files read back the same way on every platform
  with open(path, 'w', encoding = 'utf8') as cur_file:

    # iterates through each passage in the article by finding <p> tags
    for passage in passages:

      # extracts the text
      text = passage.get_text()

      # fixing spacing
      text = text.replace(u'\xa0', u' ')
      text = text.replace(u'  ', u' ')

      if any(substring in text.lower() for substring in dont_keep):
        continue

      # strips the 'CITY, date (Reuters) - ' dateline from the first kept passage
      if source == 'REUTERS' and first_line:
        text = re.sub(r'.*(?<=\(Reuters\) - )', '', text)
        first_line = False

      # writing the text, followed by a newline, to the current file
      cur_file.write(text + '\n')

  return True

In [8]:
def source_finder(url):
  """Extracts the source name (first label of the domain) from an article url.

  'https://www.cnn.com/...' gives 'cnn' and 'https://massivesci.com/...'
  gives 'massivesci'. Both http and https urls are accepted.

  Args:
    url: the article url.

  Returns:
    The source name, or '' when none can be found in url.
  """
  # the default 'www.' link format is tested first, then the bare-domain one;
  # each match is checked before .group() is called, so a url without 'www.'
  # reaches the second format instead of raising on a None match
  patterns = (r'https?://www\.(.*?)=?\.', r'https?://(.*?)=?\.')

  for pattern in patterns:
    match = re.search(pattern, url)
    if match and match.group(1):
      return match.group(1)

  return ''

In [9]:
def title_cleaner(title):
  """Builds the filename stem for an article title.

  Punctuation is dropped, the title is put in title case, and the first six
  words are glued together without spaces.
  """
  # anything that is neither a word character nor whitespace counts as punctuation
  without_punctuation = re.compile(r'[^\w\s]').sub('', title)

  words = without_punctuation.title().split()
  leading_words = words[:6]

  return ''.join(leading_words)

In [10]:
def title_finder(soup, source):
  """Finds the title of an article page.

  Args:
    soup: parsed article page (BeautifulSoup object).
    source: name of the source the page belongs to; compared
      case-insensitively, so both 'cnn' and the 'CNN' column name used by
      update_dataset select the CNN-specific lookup.

  Returns:
    The title with newlines removed, or '' when no title could be found.
  """
  # default title finder: the first <h1>; if there is none, the <title> tag
  candidates = soup.find_all('h1') or soup.find_all('title')

  title = ''
  if candidates:
    title = candidates[0].get_text().replace('\n', '')

  # if the title is not empty, return it
  if title:
    return title

  # another title finder format for CNN articles
  if source.lower() == 'cnn':
    headlines = soup.find_all('h1', {'class': 'headline__text inline-placeholder'})
    if headlines:
      title = headlines[0].get_text()
      title = title.replace('  ', '')
      title = title.replace('\n', '')

  return title

In [11]:
def add_text_column(df, txt_dir = 'txt_files/'):
  """Adds a 'TEXT' column holding the contents of each row's txt file.

  Args:
    df: dataframe with a 'FILENAME' column of paths relative to txt_dir.
      It is modified in place.
    txt_dir: folder the filenames are relative to.
      NOTE(review): the default is 'txt_files/', while create_txt_file
      writes under 'science_txt_files/' -- confirm which one is intended.

  Returns:
    The same dataframe object, with the 'TEXT' column filled in.
  """
  # adding a column for the text contents
  df['TEXT'] = ''

  for index, row in df.iterrows():
    path = os.path.join(txt_dir, row['FILENAME'])

    # the with-block closes every file once it has been read;
    # newline='' returns the line endings exactly as stored
    with open(path, 'r', encoding = 'utf8', newline = '') as txt_file:
      df.at[index, 'TEXT'] = txt_file.read()

  return df

In [12]:
def update_dataset(urls_df, dataset):
  """Scrapes every url of urls_df that is not yet in dataset and appends it.

  For each new url the page is downloaded, its text is written to a txt file
  by create_txt_file, and a row (FILENAME, TITLE, SOURCE, DATE PUBLISHED,
  URL) is added. A url whose page yields no article text is skipped, so the
  dataset never references a file that was not written.

  Args:
    urls_df: dataframe with one column per source, each holding that
      source's article urls; empty strings and NaN cells are ignored.
    dataset: the current dataset; needs a 'URL' column. It is not modified.

  Returns:
    (new_dataset, new_urls): a new dataframe with the added rows appended,
    and the list of urls that were added.
  """
  # the urls currently in the dataset; a set gives fast lookups, and it grows
  # as urls are handled so a url listed twice in urls_df is only scraped once
  seen_urls = set(dataset['URL'].tolist())

  # the urls that are added to the new dataset, and their rows
  new_urls = []
  new_rows = []

  # iterating through each column of the df, which translates to each source
  # of science explainers
  for source in urls_df.columns:

    print(source)

    # iterating through the current column of the df, which translates to the
    # article urls from the current source
    for url in urls_df[source]:

      # skips empty cells (NaN or '') and urls already in the dataset
      if not isinstance(url, str) or url == '' or url in seen_urls:
        continue
      seen_urls.add(url)

      # gets the website content
      r = requests.get(url)
      soup = BeautifulSoup(r.content, 'html.parser')

      # extracts the title of the article
      title = title_finder(soup, source)

      # edit the title for the filename (title case and only the first 6 words)
      title_cut = title_cleaner(title)

      # creates a new file, writes entire article to it, then saves in the txt files folder
      filename = source.replace(' ', '_') + '/' + title_cut + '.txt'
      if not create_txt_file(soup, filename):
        # no article text at this url, so no file exists to point a row at
        print('\t' + filename + ' (skipped: no article text found)')
        continue

      print('\t' + filename)

      # retrieves the date of publication
      date = all_date_finder(source, soup)

      # records the info of this article for the science explainer dataset
      new_urls.append(url)
      new_rows.append({'FILENAME': filename,
                       'TITLE': title,
                       'SOURCE': source,
                       'DATE PUBLISHED': date,
                       'URL': url})

  # nothing new: hand back an untouched copy
  if not new_rows:
    return dataset.copy(), new_urls

  # one concat at the end instead of one per article (which re-copies the
  # whole dataset every time)
  new_dataset = pd.concat([dataset, pd.DataFrame(new_rows)], ignore_index = True)

  # returns the updated dataset along with the urls added to it
  return new_dataset, new_urls

## Date finder functions

In [13]:
def basic_date_finder(soup):
  """Returns the date part of the datetime attribute of the page's first <time> tag.

  The value is whatever precedes the first 'T' of that attribute
  (e.g. '2023-02-01' for '2023-02-01T10:00:00Z').
  """
  time_markup = str(soup.select_one('time'))
  match = re.search('(?<= datetime=")(.*?)(=?T)', time_markup)
  return match.group(1)

In [14]:
def atlantic_date_finder(soup):
  """Date finder that all_date_finder resolves for a source named 'atlantic' (any case).

  Delegates to basic_date_finder and returns its result unchanged.
  """
  return basic_date_finder(soup)

In [15]:
def cbc_date_finder(soup):
  """Date finder that all_date_finder resolves for a source named 'cbc' (any case).

  Delegates to basic_date_finder and returns its result unchanged.
  """
  return basic_date_finder(soup)

In [16]:
def cnn_date_finder(soup):
  """Returns the publication date of a CNN article page.

  The date is read from the canonical link (.../cnn.com/YYYY/MM/DD/...) and
  returned as 'YYYY-MM-DD'. If the link has no such date, the value following
  "publish_date: '" in the page source is returned instead, with any '/'
  replaced by '-'.

  Raises:
    AttributeError: if neither pattern is found on the page.
  """
  date_bunch = soup.find('link', {'rel': 'canonical'})

  # raw strings keep the regex backslashes from being read as (invalid)
  # string escape sequences
  date = re.search(r'(?<=cnn\.com/)(\d{4}/\d{2}/\d{2})', str(date_bunch))

  # edge case
  if date is None:
    date = re.search(r"(?<=publish_date: ')(.*?)(=?')", str(soup))

  return date.group(1).replace('/', '-')

In [17]:
def globeandmail_date_finder(soup):
  """Date finder that all_date_finder resolves for a source whose lowercased, space-stripped name is 'globeandmail'.

  Delegates to basic_date_finder and returns its result unchanged.
  """
  return basic_date_finder(soup)

In [18]:
def massivesci_date_finder(soup):
  """Date finder that all_date_finder resolves for a source whose lowercased, space-stripped name is 'massivesci'.

  Delegates to basic_date_finder and returns its result unchanged.
  """
  return basic_date_finder(soup)

In [34]:
def nationalgeographic_date_finder(soup):
  """Returns the publication date found in the page source.

  The value after '"pbDt":"' (up to its first 'T') is used; when the page
  has no such field, the value after 'datePublished":"' is used instead.
  """
  page_source = str(soup)

  match = re.search('(?<="pbDt":")(.*?)(=?T)', page_source)

  # edge case: pages without a pbDt field
  if match is None:
    match = re.search('(?<=datePublished":")(.*?)(=?T)', page_source)

  return match.group(1)

In [20]:
def nationalobserver_date_finder(soup):
  """Returns the value after '"datePublished": "' in the page source, up to its first 'T'."""
  match = re.search('(?<="datePublished": ")(.*?)(=?T)', str(soup))
  return match.group(1)

In [45]:
def npr_date_finder(soup):
  """Returns the publication date of an NPR article page as 'YYYY-MM-DD'.

  The date is read from the YYYY/MM/DD part of the canonical link. If the
  link has none, the value after 'datePublished":"' in the page source (up
  to its first 'T') is used instead.

  Raises:
    AttributeError: if neither pattern is found on the page.
  """
  date_bunch = soup.find('link', {'rel': 'canonical'})

  # raw string keeps the regex backslashes from being read as (invalid)
  # string escape sequences
  date = re.search(r'(\d{4}/\d{2}/\d{2})', str(date_bunch))

  # edge case
  if date is None:
    date = re.search('(?<=datePublished":")(.*?)(=?T)', str(soup))

  return date.group(1).replace('/', '-')

In [22]:
def reuters_date_finder(soup):
  """Returns the first YYYY-MM-DD date found in the page's canonical link.

  Raises:
    AttributeError: if the canonical link contains no such date.
  """
  date_bunch = soup.find('link', {'rel': 'canonical'})

  # raw string keeps the regex backslashes from being read as (invalid)
  # string escape sequences
  date = re.search(r'(\d{4}-\d{2}-\d{2})', str(date_bunch))
  return date.group(1)

In [23]:
def all_date_finder(source_name, soup):
  """Calls the date finder function that belongs to source_name.

  The function is looked up by name: the source name is lowercased, its
  spaces are removed and '_date_finder' is appended
  ('NATIONAL OBSERVER' -> nationalobserver_date_finder).

  Args:
    source_name: name of the source.
    soup: parsed article page, passed on to the finder.

  Returns:
    Whatever the source's date finder returns.

  Raises:
    NameError: if no date finder function is defined for source_name.
  """
  # the name of the specific source's date finder function name
  finder_func = source_name.lower().replace(' ', '') + '_date_finder'

  # looked up by name rather than eval'd: source names come from csv column
  # headers, and eval would execute whatever expression such a header spells
  finder = globals().get(finder_func)
  if not callable(finder):
    raise NameError("no date finder function '" + finder_func + "' is defined for source '" + source_name + "'")

  # calls the specific source's date finder function
  return finder(soup)

In [24]:
def date_finder_tester(source_name, urls_df):
  """Prints the date found for every url of one source, to check its date finder.

  Args:
    source_name: the source to test; also the name of its column in urls_df.
    urls_df: dataframe with one column of article urls per source; empty
      strings and NaN cells are skipped.
  """
  # iterates through the urls from source source_name
  for url in urls_df[source_name]:

    # skip empty urls (NaN or '')
    if not isinstance(url, str) or url == '':
      continue

    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    # prints the date found; all_date_finder picks the source's own finder,
    # so the lookup logic lives in one place
    print(all_date_finder(source_name, soup))

# Testing

In [25]:
### the dataset before adding new data, read from the cloned repo
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests the
# csv was once saved together with its index -- TODO confirm and drop if unwanted
science_explainers_df = pd.read_csv('Science_Explainers/dataset/science_explainers_dataset.csv')

In [26]:
# displays the dataset as loaded, before any new articles are added
science_explainers_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL
0,ATLANTIC/15DegreesWasNeverTheEnd.txt,1.5 Degrees Was Never the End of the World,ATLANTIC,2023-02-01,https://www.theatlantic.com/science/archive/20...
1,ATLANTIC/ABasicPremiseOfAnimalConservation.txt,A Basic Premise of Animal Conservation Looks S...,ATLANTIC,2023-02-22,https://www.theatlantic.com/science/archive/20...
2,ATLANTIC/AClearIndicationThatClimateChange.txt,A Clear Indication That Climate Change Is Burn...,ATLANTIC,2023-06-12,https://www.theatlantic.com/science/archive/20...
3,ATLANTIC/ACognitiveRevolutionInAnimalResearch.txt,A Cognitive Revolution in Animal Research,ATLANTIC,2023-03-19,https://www.theatlantic.com/science/archive/20...
4,ATLANTIC/ACrucialBarrierAgainstHurricanesIs.txt,A Crucial Barrier Against Hurricanes Is at Risk,ATLANTIC,2023-10-01,https://www.theatlantic.com/science/archive/20...
...,...,...,...,...,...
895,REUTERS/WhoLaunchesMrnaVaccineHubIn.txt,WHO launches mRNA vaccine hub in Cape Town,REUTERS,2023-04-20,https://www.reuters.com/business/healthcare-ph...
896,REUTERS/WithAGulpAndBurpA.txt,"With a gulp and burp, a bloated star swallows ...",REUTERS,2023-05-03,https://www.reuters.com/technology/space/with-...
897,REUTERS/WorldsBiggestPermafrostCraterInRussias...,World's biggest permafrost crater in Russia’s ...,REUTERS,2023-07-21,https://www.reuters.com/business/environment/w...
898,REUTERS/YouThinkYouNeedMoreSleep.txt,You think you need more sleep? Tell that to an...,REUTERS,2023-04-21,https://www.reuters.com/lifestyle/science/you-...


In [None]:
### updating the dataset
# scrapes every url of urls_df that is not in the dataset yet (one web request per
# new url); returns the extended dataset and the list of urls that were added
science_explainers_new_df, new_urls = update_dataset(urls_df, science_explainers_df)

In [110]:
# NOTE(review): leftover manual test for a single source, kept as a bare string
# literal, so none of the code inside it is executed
'''
TESTING
temp = pd.DataFrame(urls_df['REUTERS'])
science_explainers_new_df, new_urls = update_dataset(temp, science_explainers_df)
'''

In [None]:
### the urls that update_dataset added to the dataset
new_urls

['https://www.theatlantic.com/science/archive/2023/06/nasa-ufos-conspiracy-theories/674259/',
 'https://www.theatlantic.com/science/archive/2023/06/spaying-sterilizing-cat-nonsurgical-injection/674300/',
 'https://www.theatlantic.com/science/archive/2023/06/angel-shark-sightings-conservation-canary-island-beaches/674284/',
 'https://www.theatlantic.com/science/archive/2023/06/species-unknown-dark-extinction-rate/674282/',
 'https://www.theatlantic.com/science/archive/2023/05/where-are-the-ev-charging-stations/674241/',
 'https://www.theatlantic.com/science/#:~:text=These%20Animals%20Shouldn%E2%80%99t%20Be%20Alive%2C%20Much%20Less%20Sprinting',
 'https://www.theatlantic.com/science/archive/2023/05/airport-endangered-wildlife-conservation-management-safety/674238/',
 'https://www.theatlantic.com/science/archive/2023/05/color-agnosia-condition-causes-blue-strawberries/674224/',
 'https://www.theatlantic.com/science/archive/2023/05/kelp-species-seed-bank-preservation/674217/',
 'https://ww

# Downloading files

In [None]:
### downloading science explainers dataset

# written without the index so that re-reading the csv does not add another index column
science_explainers_new_df.to_csv('science_explainers_dataset.csv', index = False)
files.download('science_explainers_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
### downloading the csv of article urls (written without the index)
urls_df.to_csv('article_urls.csv', index = False)
files.download('article_urls.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
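### zipping and downloading txt files
# NOTE(review): this zips Science_Explainers/txt_files, whereas create_txt_file writes
# new articles under science_txt_files/ -- confirm the right folder is being downloaded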

!zip -r txt_files.zip Science_Explainers/txt_files
files.download("txt_files.zip")

  adding: Science_Explainers/txt_files/ (stored 0%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/ (stored 0%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/HowNewZealandsPeskyPigsBecame.txt (deflated 57%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/HowLightPollutionIsUpendingThe.txt (deflated 54%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/AnishinaabeArtistReinventsTheLoungeChair.txt (deflated 52%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/MeetTheDogsSniffingStinkyMussels.txt (deflated 53%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/EscootersSilentMenaceOrGreenGodsend.txt (deflated 57%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/OfficialsHidSignificantHealthRisksOf.txt (deflated 56%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/ExtinctPolynesianSnailsMakeTriumphantReturn.txt (deflated 52%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/InMichiganProspectsGloomyForSolar.txt

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>