<a href="https://colab.research.google.com/github/faithrts/Science_Explainers/blob/main/dataset/science_explainer_dataset_update.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
### importing libraries

# basic libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# libraries for web scraping
from bs4 import BeautifulSoup
import requests
import re
import codecs

# to download files
from google.colab import files

# Importing article URLs

In [None]:
### cloning git repo (holds the csv of article URLs, the database csv and the txt_files folder)

#!git clone https://github.com/faithrts/Science_Explainers

In [None]:
### saving csv file of URLs into dataframe

# urls_df is needed by the "Testing" and "Downloading files" cells below, so it
# has to be defined on a fresh kernel; the csv is read from the Science_Explainers
# folder, which must already be present in the working directory.
# NaN cells are replaced with empty strings, which the url loops skip
urls_df = pd.read_csv('Science_Explainers/database/article_urls.csv').fillna('')

# Webscraping helper functions

In [None]:
# request headers carrying a browser-like User-Agent; get_soup sends them with each page request
header_ex = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

In [None]:
def get_soup(url, timeout = 30):
  """Downloads a page and parses it with BeautifulSoup's 'html.parser'.

  Parameters:
    url: address of the page to download
    timeout: seconds to wait for the server before giving up; without a
      timeout a stalled server would block the notebook indefinitely

  Returns:
    the BeautifulSoup object built from the text of the response

  Raises:
    whatever requests.get raises (e.g. a timeout or connection error)
  """
  # the headers in header_ex are sent with the request
  response = requests.get(url, headers = header_ex, timeout = timeout)
  return BeautifulSoup(response.text, 'html.parser')

In [None]:
def create_txt_file(soup, filename):
  """Writes the paragraphs of an article to Science_Explainers/txt_files/<filename>.

  Parameters:
    soup: the parsed article page (BeautifulSoup object)
    filename: path of the form '<SOURCE>/<Title>.txt'; everything before the
      last '/' selects the source-specific way of locating the paragraphs

  Returns:
    True if the file was written, False if no paragraphs were found
    (including the case where the source-specific container is missing)

  Raises:
    KeyError if the source part of filename has no paragraph finder
  """
  # local import so the function does not depend on a change to the import cell
  import os

  # everything before the last '/' is the source name
  source = filename.rpartition('/')[0]

  def all_paragraphs(page):
    return page.find_all('p')

  def paragraphs_inside(tag, attrs):
    # builds a finder restricted to one container; an absent container yields
    # no paragraphs instead of an AttributeError on None
    def finder(page):
      container = page.find(tag, attrs)
      return [] if container is None else container.find_all('p')
    return finder

  def npr_paragraphs(page):
    # class-less <p> tags that come after the image_data div
    anchor = page.find('div', {'class': 'image_data'})
    return [] if anchor is None else anchor.find_all_next('p', {'class': None})

  # plain callables instead of code strings passed to eval
  paragraph_finders = {'ATLANTIC': all_paragraphs,
                       'CBC': paragraphs_inside('div', {'class': 'story'}),
                       'CNN': all_paragraphs,
                       'GLOBE_AND_MAIL': all_paragraphs,
                       'MASSIVE_SCI': paragraphs_inside('div', {'class': 'bodytext'}),
                       'NATIONAL_GEOGRAPHIC': all_paragraphs,
                       # misspelled key kept so callers using the old spelling still work
                       'NATIONAL_GEOGRAHIC': all_paragraphs,
                       'NATIONAL_OBSERVER': all_paragraphs,
                       'NPR': npr_paragraphs,
                       'REUTERS': paragraphs_inside('div', {'class': 'article-body__content__17Yit'})}

  dont_keep = ['read transcribed audio', '\neditor\'s note','sign up for', 'this article was published', 'this story was originally published', 'sponsor']

  # looks for the text of the story
  passages = paragraph_finders[source](soup)

  # if, for some reason, there's no text at this url
  if len(passages) == 0:
    return False

  # else, creates a new file (and its source folder, if it doesn't exist yet)
  path = 'Science_Explainers/txt_files/' + filename
  os.makedirs(os.path.dirname(path), exist_ok = True)

  first_line = True

  # the with-block closes the file even if a passage raises; utf8 matches the
  # encoding add_text_column uses to read the files back
  with open(path, 'w', encoding = 'utf8') as cur_file:

    # iterates through each passage in the article
    for passage in passages:

      # extracts the text
      text = passage.get_text()

      # fixing spacing
      text = text.replace(u'\xa0', u' ')
      text = text.replace(u'  ', u' ')

      # drops boilerplate passages
      if any(substring in text.lower() for substring in dont_keep):
        continue

      # strips everything up to and including '(Reuters) - ' from the first kept passage
      if source == 'REUTERS' and first_line:
        text = re.sub(r'.*(?<=\(Reuters\) - )', '', text)
        first_line = False

      # one passage per line
      cur_file.write(text + '\n')

  return True

In [None]:
def source_finder(url):
  """Extracts the source name (first label of the host name) from a url.

  The 'www.' form is tried first, then the form without it, e.g.
  'https://www.theatlantic.com/...' -> 'theatlantic' and
  'https://massivesci.com/...' -> 'massivesci'.

  Parameters:
    url: the article url

  Returns:
    the text between the scheme (plus optional 'www.') and the next '.',
    or '' if the url matches neither format
  """
  # the second pattern is only consulted when the first finds nothing; calling
  # .group() straight on the first search would raise on a None match before
  # the fallback could ever run
  for pattern in (r'https?://www\.(.*?)\.', r'https?://(.*?)\.'):
    match = re.search(pattern, url)
    if match is not None and len(match.group(1)) != 0:
      return match.group(1)

  return ''

In [None]:
def title_cleaner(title):
  """Turns a title into a compact name: punctuation stripped, title-cased,
  and only the first 6 words kept, joined without spaces."""
  # everything that is neither a word character nor whitespace is dropped
  without_punctuation = re.sub(r'[^\w\s]', '', title)
  leading_words = without_punctuation.title().split()[:6]
  return ''.join(leading_words)

In [None]:
def title_finder(soup, source):
  """Finds the title of an article page.

  Tries, in order: the first <h1>, the CNN-specific headline <h1> (only when
  source is 'cnn', in any letter case), and the first <title>. The first
  non-empty result wins.

  Parameters:
    soup: the parsed article page (BeautifulSoup object)
    source: name of the source the page comes from

  Returns:
    the title with newlines removed, or None if nothing non-empty was found
  """
  def first_text(elements):
    # text of the first element without newlines; '' when there is no element,
    # so a page lacking the tag no longer raises an IndexError
    if len(elements) == 0:
      return ''
    return elements[0].get_text().replace('\n', '')

  # testing default title finder
  title = first_text(soup.find_all('h1'))
  if len(title) != 0:
    return title

  # another title finder format for CNN articles; compared case-insensitively
  # because update_database passes the upper-case column name
  if str(source).lower() == 'cnn':
    headlines = soup.find_all('h1', {'class': 'headline__text inline-placeholder'})
    if len(headlines) != 0:
      title = headlines[0].get_text()
      title = title.replace('  ', '')
      title = title.replace('\n', '')
      if len(title) != 0:
        return title

  # last resort: the <title> tag (also reached when the <h1> exists but is empty)
  title = first_text(soup.find_all('title'))
  if len(title) != 0:
    return title

  return None

In [None]:
def add_text_column(df, txt_dir = 'txt_files/'):
  """Adds a 'TEXT' column holding the contents of each row's text file.

  Parameters:
    df: dataframe with a 'FILENAME' column of paths relative to txt_dir;
      it is modified in place
    txt_dir: folder the filenames are relative to (defaults to 'txt_files/')

  Returns:
    the same dataframe object, now with the 'TEXT' column

  Raises:
    whatever opening or decoding a file raises (e.g. FileNotFoundError)
  """
  # local import so the function does not depend on a change to the import cell
  import os

  texts = []

  for filename in df['FILENAME']:
    # the with-block closes each file right after it has been read
    with codecs.open(os.path.join(txt_dir, filename), 'r', encoding = 'utf8') as txt_file:
      texts.append(txt_file.read())

  # assigned by position, so duplicate index labels cannot mix up rows
  df['TEXT'] = texts

  return df

In [None]:
def update_database(urls_df, database):
  """Scrapes every url of urls_df that the database doesn't contain yet.

  For each new url the page is downloaded, its text is written to a txt file
  (create_txt_file), and a row with filename, title, source, publication date
  and url is added to a copy of the database.

  Parameters:
    urls_df: dataframe with one column of article urls per source; the column
      name is used as the source name
    database: dataframe with (at least) a 'URL' column; it is not modified

  Returns:
    (new_database, new_urls): the extended copy of the database and the list
    of urls that were added to it
  """
  # urls already handled: starts as the urls in the database and grows while
  # scraping, so a url listed twice in urls_df is only scraped once
  seen_urls = set(database['URL'].tolist())

  # the urls that are added to the new database
  new_urls = []

  # one dict per new article; turned into a dataframe once at the end instead
  # of concatenating inside the loop
  new_rows = []

  # iterating through each column of the df, which translates to each source
  # of science explainers
  for source in urls_df.columns:

    print(source)

    # iterating through the article urls from the current source
    for url in urls_df[source]:

      # skips missing (NaN) and empty urls and urls already handled
      if pd.isna(url) or url == '' or url in seen_urls:
        continue

      seen_urls.add(url)
      new_urls.append(url)

      # gets the website content
      r = requests.get(url)
      soup = BeautifulSoup(r.content, 'html.parser')

      # extracts the title of the article
      title = title_finder(soup, source)

      # edit the title for the filename (title case and only the first 6 words)
      title_cut = title_cleaner(title)

      # creates a new file, writes entire article to it, then saves in the txt_files folder
      # NOTE(review): the return value is ignored, so a row is added even when
      # no text was found and no file was written -- confirm this is intended
      filename = source.replace(' ', '_') + '/' + title_cut + '.txt'
      create_txt_file(soup, filename)

      print('\t' + filename)

      # retrieves the date of publication
      date = all_date_finder(source, soup)

      # the info of this article, to become a row of the database
      new_rows.append({'FILENAME': filename,
                       'TITLE': title,
                       'SOURCE': source,
                       'DATE PUBLISHED': date,
                       'URL': url})

  # nothing new: the database is returned as an unchanged copy
  if len(new_rows) == 0:
    return database.copy(), new_urls

  additions = pd.DataFrame(new_rows, columns = ['FILENAME', 'TITLE', 'SOURCE', 'DATE PUBLISHED', 'URL'])
  new_database = pd.concat([database, additions], ignore_index = True)

  # returns the updated database along with the urls added to it
  return new_database, new_urls

## Date finder functions

In [None]:
def basic_date_finder(soup):
  """Reads the date from the datetime attribute of the first <time> tag.

  Parameters:
    soup: the parsed article page (BeautifulSoup object)

  Returns:
    the part of the datetime attribute before the 'T' (or the whole value if
    it has no time part), e.g. '2023-05-01'

  Raises:
    AttributeError if the first <time> tag has no datetime attribute or the
    page has no <time> tag at all
  """
  date_bunch = soup.select_one('time')
  # stops at the 'T' or at the closing quote, whichever comes first, so a
  # date-only attribute doesn't drag text after the quote into the result
  match = re.search(r'(?<= datetime=")([^"T]*)', str(date_bunch))
  return match.group(1)

In [None]:
def atlantic_date_finder(soup):
  """Date finder looked up by name for the ATLANTIC source; delegates to basic_date_finder."""
  return basic_date_finder(soup)

In [None]:
def cbc_date_finder(soup):
  """Date finder looked up by name for the CBC source; delegates to basic_date_finder."""
  return basic_date_finder(soup)

In [None]:
def cnn_date_finder(soup):
  """Reads the date from the canonical link of a CNN page.

  Parameters:
    soup: the parsed article page (BeautifulSoup object)

  Returns:
    the yyyy/mm/dd that follows 'cnn.com/' in the canonical <link> tag,
    rewritten as 'yyyy-mm-dd'

  Raises:
    AttributeError if the canonical link carries no such date
  """
  canonical_link = soup.find('link', {'rel': 'canonical'})
  # raw string: '\d' and '\.' are not valid escapes in a plain string literal
  match = re.search(r'(?<=cnn\.com/)(\d{4}/\d{2}/\d{2})', str(canonical_link))
  return match.group(1).replace('/', '-')

In [None]:
def globeandmail_date_finder(soup):
  """Date finder looked up by name for the GLOBE AND MAIL source; delegates to basic_date_finder."""
  return basic_date_finder(soup)

In [None]:
def massivesci_date_finder(soup):
  """Date finder looked up by name for the MASSIVE SCI source; delegates to basic_date_finder."""
  return basic_date_finder(soup)

In [None]:
def nationalgeographic_date_finder(soup):
  """Searches the page source for the text after '"pbDt":"' and returns it
  up to (not including) the next 'T'.

  Raises AttributeError if the page source contains no such text.
  """
  found = re.search('(?<="pbDt":")(.*?)(=?T)', str(soup))
  return found.group(1)

In [None]:
def nationalobserver_date_finder(soup):
  """Searches the page source for the text after '"datePublished": "' and
  returns it up to (not including) the next 'T'.

  Raises AttributeError if the page source contains no such text.
  """
  found = re.search('(?<="datePublished": ")(.*?)(=?T)', str(soup))
  return found.group(1)

In [None]:
def npr_date_finder(soup):
  """Reads the date from the canonical link of an NPR page.

  Parameters:
    soup: the parsed article page (BeautifulSoup object)

  Returns:
    the first yyyy/mm/dd found in the canonical <link> tag, rewritten as
    'yyyy-mm-dd'

  Raises:
    AttributeError if the canonical link carries no such date
  """
  canonical_link = soup.find('link', {'rel': 'canonical'})
  # raw string: '\d' is not a valid escape in a plain string literal
  match = re.search(r'(\d{4}/\d{2}/\d{2})', str(canonical_link))
  return match.group(1).replace('/', '-')

In [None]:
def reuters_date_finder(soup):
  """Reads the date from the canonical link of a Reuters page.

  Parameters:
    soup: the parsed article page (BeautifulSoup object)

  Returns:
    the first 'yyyy-mm-dd' found in the canonical <link> tag

  Raises:
    AttributeError if the canonical link carries no such date
  """
  canonical_link = soup.find('link', {'rel': 'canonical'})
  # raw string: '\d' is not a valid escape in a plain string literal
  match = re.search(r'(\d{4}-\d{2}-\d{2})', str(canonical_link))
  return match.group(1)

In [None]:
def all_date_finder(source_name, soup):
  """Calls the date finder that belongs to a source.

  The finder is the module-level function named after the source: lower case,
  spaces removed, followed by '_date_finder' (e.g. 'GLOBE AND MAIL' ->
  globeandmail_date_finder).

  Parameters:
    source_name: name of the source
    soup: the parsed article page, passed on to the finder

  Returns:
    whatever the source's date finder returns

  Raises:
    NameError if no date finder is defined for the source
  """
  # the name of the specific source's date finder function
  finder_func = source_name.lower().replace(' ', '') + '_date_finder'

  # looked up by name rather than eval'd: source names come from csv column
  # headers, and eval would execute whatever expression they contain
  finder = globals().get(finder_func)
  if not callable(finder):
    raise NameError("name '" + finder_func + "' is not defined")

  # calls the specific source's date finder function
  return finder(soup)

In [None]:
def date_finder_tester(source_name, urls_df):
  """Prints the date found for every url of one source.

  Parameters:
    source_name: name of the source, also the column of urls_df to read
    urls_df: dataframe (or other mapping) from source name to its urls

  Returns:
    None; the dates are printed, one per url
  """
  # iterates through the urls from source source_name
  for url in urls_df[source_name]:

    # skip missing (NaN) and empty urls
    if pd.isna(url) or url == '':
      continue

    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    # prints the date found; all_date_finder picks the source's date finder,
    # so the lookup logic lives in one place
    print(all_date_finder(source_name, soup))

# Testing

In [None]:
### making a copy of the df without the NYT articles, since they're behind a paywall
# (drop returns a new dataframe, so urls_df keeps its NYT column)
urls_no_nyt_df = urls_df.drop(columns = ['NYT'])

In [None]:
### the database before adding new data
# read from the csv in the database folder of Science_Explainers
science_explainers_df = pd.read_csv('Science_Explainers/database/science_explainers_database.csv')

In [None]:
### displaying the database as it was loaded
science_explainers_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL
0,ATLANTIC/HowToSuccessfullySmashYourFace.txt,How to Successfully Smash Your Face Against a ...,ATLANTIC,2022-07-14,https://www.theatlantic.com/science/archive/20...
1,ATLANTIC/WillCovidsSpringLullLast.txt,Will COVID’s Spring Lull Last?,ATLANTIC,2023-05-01,https://www.theatlantic.com/science/archive/20...
2,ATLANTIC/TeenBrainsArePerfectlyCapable.txt,Teen Brains Are Perfectly Capable,ATLANTIC,2023-04-30,https://www.theatlantic.com/science/archive/20...
3,ATLANTIC/TheFishHadGillsFullOf.txt,The Fish Had Gills Full of Ash and Gas Bubblin...,ATLANTIC,2023-04-29,https://www.theatlantic.com/science/archive/20...
4,ATLANTIC/LushPrairiesCouldReallyBeGreen.txt,Lush Prairies Could Really Be ‘Green Deserts’,ATLANTIC,2023-04-23,https://www.theatlantic.com/science/archive/20...
...,...,...,...,...,...
895,REUTERS/RightAgainEinsteinStudyShowsHow.txt,"Right again, Einstein! Study shows how antimat...",REUTERS,2023-09-27,https://www.reuters.com/science/right-again-ei...
896,REUTERS/NasaAsteroidSampleParachutesSafelyOnto...,NASA asteroid sample parachutes safely onto Ut...,REUTERS,2023-09-24,https://www.reuters.com/science/nasas-first-as...
897,REUTERS/MexicoResearchersShowProgressOnDrive.txt,Mexico researchers show progress on drive to r...,REUTERS,2023-09-22,https://www.reuters.com/markets/commodities/me...
898,REUTERS/ZambiaFindShowsHumansHaveBuilt.txt,Zambia find shows humans have built with wood ...,REUTERS,2023-09-20,https://www.reuters.com/science/zambia-find-sh...


In [None]:
### updating the database
# scrapes the urls that aren't in the database yet; returns the extended
# database together with the list of urls that were added
science_explainers_new_df, new_urls = update_database(urls_no_nyt_df, science_explainers_df)

ATLANTIC
	ATLANTIC/NasaLearnsTheUglyTruthAbout.txt
	ATLANTIC/SterilizingCatsNoSurgeryRequired.txt
	ATLANTIC/SuperrareAngelSharksAreThrivingIn.txt
	ATLANTIC/DarkExtinctionsArePoppingUpEverywhere.txt
	ATLANTIC/TheOneThingHoldingBackElectric.txt
	ATLANTIC/Science.txt
	ATLANTIC/SomeAnimalsHaveNoChoiceBut.txt
	ATLANTIC/TheBluestrawberryProblem.txt
	ATLANTIC/TheUphillBattleToSaveThe.txt
	ATLANTIC/Even90DegreeHeatCouldBeDevastating.txt
	ATLANTIC/TheArcticAndAtlanticOceansAre.txt
	ATLANTIC/AmericanFoodWillNeverLookNatural.txt
	ATLANTIC/HowRealIsSmokeBrain.txt
	ATLANTIC/TheGrowingBattleOverInfantMilk.txt
	ATLANTIC/AClearIndicationThatClimateChange.txt
	ATLANTIC/NoOneKnowsExactlyWhatSocial.txt
	ATLANTIC/ReboundRelationshipsAreTotallyFine.txt
	ATLANTIC/AmericasMostPopularDrugHasA.txt
	ATLANTIC/ForTheLichens.txt
	ATLANTIC/CompostablePlasticIsGarbage.txt
	ATLANTIC/OpenYourMindToUnicornMeat.txt
	ATLANTIC/IsThereSuchAThingAs.txt
	ATLANTIC/EvenNorthAmericasElkHaveRegional.txt
	ATLANTIC/ThisHurricaneSe

In [None]:
### the urls that were added to the database
new_urls

['https://www.theatlantic.com/science/archive/2023/06/nasa-ufos-conspiracy-theories/674259/',
 'https://www.theatlantic.com/science/archive/2023/06/spaying-sterilizing-cat-nonsurgical-injection/674300/',
 'https://www.theatlantic.com/science/archive/2023/06/angel-shark-sightings-conservation-canary-island-beaches/674284/',
 'https://www.theatlantic.com/science/archive/2023/06/species-unknown-dark-extinction-rate/674282/',
 'https://www.theatlantic.com/science/archive/2023/05/where-are-the-ev-charging-stations/674241/',
 'https://www.theatlantic.com/science/#:~:text=These%20Animals%20Shouldn%E2%80%99t%20Be%20Alive%2C%20Much%20Less%20Sprinting',
 'https://www.theatlantic.com/science/archive/2023/05/airport-endangered-wildlife-conservation-management-safety/674238/',
 'https://www.theatlantic.com/science/archive/2023/05/color-agnosia-condition-causes-blue-strawberries/674224/',
 'https://www.theatlantic.com/science/archive/2023/05/kelp-species-seed-bank-preservation/674217/',
 'https://ww

# Downloading files

In [None]:
### downloading science explainers database

# index = False leaves the row index out of the csv
science_explainers_new_df.to_csv('science_explainers_database.csv', index = False)
files.download('science_explainers_database.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
### downloading the csv of article urls (all columns of urls_df, NYT included)
urls_df.to_csv('article_urls.csv', index = False)
files.download('article_urls.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
### downloading txt files

# zips the txt_files folder recursively (-r), then downloads the archive
!zip -r txt_files.zip Science_Explainers/txt_files
files.download("txt_files.zip")

  adding: Science_Explainers/txt_files/ (stored 0%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/ (stored 0%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/HowNewZealandsPeskyPigsBecame.txt (deflated 57%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/HowLightPollutionIsUpendingThe.txt (deflated 54%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/AnishinaabeArtistReinventsTheLoungeChair.txt (deflated 52%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/MeetTheDogsSniffingStinkyMussels.txt (deflated 53%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/EscootersSilentMenaceOrGreenGodsend.txt (deflated 57%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/OfficialsHidSignificantHealthRisksOf.txt (deflated 56%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/ExtinctPolynesianSnailsMakeTriumphantReturn.txt (deflated 52%)
  adding: Science_Explainers/txt_files/NATIONAL_OBSERVER/InMichiganProspectsGloomyForSolar.txt

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>