<a href="https://colab.research.google.com/github/faithrts/Science_Explainers/blob/main/science_explainer_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [17]:
### importing libraries

# basic libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# to download files
from google.colab import files

# regex/codecs helpers (standard library) and sklearn libraries for ML
import re
import codecs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Importing article URLs

In [2]:
### cloning git repo

# the clone provides science_explainers_database.csv and the txt_files/ directory that later cells read
!git clone https://github.com/faithrts/Science_Explainers

Cloning into 'Science_Explainers'...
remote: Enumerating objects: 510, done.[K
remote: Counting objects: 100% (404/404), done.[K
remote: Compressing objects: 100% (356/356), done.[K
remote: Total 510 (delta 66), reused 367 (delta 47), pack-reused 106[K
Receiving objects: 100% (510/510), 1.36 MiB | 10.29 MiB/s, done.
Resolving deltas: 100% (69/69), done.


In [3]:
### loading the article database (one row per article, including its URL) into a dataframe

explainer_df = pd.read_csv(filepath_or_buffer='Science_Explainers/science_explainers_database.csv')

In [4]:
# preview the loaded metadata table (pandas elides the middle rows in the rendered output)
explainer_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL
0,ATLANTIC/HowToSuccessfullySmashYourFace.txt,How to Successfully Smash Your Face Against a ...,ATLANTIC,2022-07-14,https://www.theatlantic.com/science/archive/20...
1,ATLANTIC/WillCovidsSpringLullLast.txt,Will COVID’s Spring Lull Last?,ATLANTIC,2023-05-01,https://www.theatlantic.com/science/archive/20...
2,ATLANTIC/TeenBrainsArePerfectlyCapable.txt,Teen Brains Are Perfectly Capable,ATLANTIC,2023-04-30,https://www.theatlantic.com/science/archive/20...
3,ATLANTIC/TheFishHadGillsFullOf.txt,The Fish Had Gills Full of Ash and Gas Bubblin...,ATLANTIC,2023-04-29,https://www.theatlantic.com/science/archive/20...
4,ATLANTIC/LushPrairiesCouldReallyBeGreen.txt,Lush Prairies Could Really Be ‘Green Deserts’,ATLANTIC,2023-04-23,https://www.theatlantic.com/science/archive/20...
...,...,...,...,...,...
355,REUTERS/PlasticEnteringOceansCouldNearlyTriple...,Plastic entering oceans could nearly triple by...,REUTERS,2023-03-08,https://www.reuters.com/business/environment/p...
356,REUTERS/FukushimaWastewaterReleaseWouldHaveLim...,Fukushima wastewater release would have limite...,REUTERS,2023-02-16,https://www.reuters.com/world/asia-pacific/fuk...
357,REUTERS/MoreThanHalfOfTheWorlds.txt,More than half of the world's large lakes are ...,REUTERS,2023-05-18,https://www.reuters.com/business/environment/m...
358,REUTERS/AmputeesCouldFeelWarmthOfHuman.txt,Amputees could feel warmth of human touch with...,REUTERS,2023-05-18,https://www.reuters.com/business/healthcare-ph...


## Helper functions

In [13]:
def load_article_content(explainer_df, txt_dir='Science_Explainers/txt_files/'):
  """Read every article's text file and store it in a 'TEXT' column.

  Parameters
  ----------
  explainer_df : pandas.DataFrame
      Must contain a 'FILENAME' column holding paths relative to `txt_dir`.
  txt_dir : str, optional
      Directory that contains the article text files. Defaults to the
      location inside the cloned repository.

  Returns
  -------
  pandas.DataFrame
      The same dataframe object that was passed in, with the 'TEXT' column
      added (or overwritten) in place.

  Raises
  ------
  FileNotFoundError
      If a file listed in 'FILENAME' does not exist under `txt_dir`.
  """
  import os  # local import so this cell does not depend on the import cell being edited

  article_texts = []
  for cur_filename in explainer_df['FILENAME']:
    # `with` guarantees each file handle is closed; newline='' leaves line
    # endings untouched, matching what codecs.open(...).read() returned
    with open(os.path.join(txt_dir, cur_filename), 'r', encoding='utf8', newline='') as article_file:
      article_texts.append(article_file.read())

  # positional assignment: works whatever the dataframe's index looks like
  explainer_df['TEXT'] = article_texts

  return explainer_df


In [16]:
### custom pre-processor: lowercases, turns "—" into a space, and eliminates numbers, "_", and "\"

def my_preprocessor(text):
    """Normalise raw article text before CountVectorizer tokenises it.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Lower-cased text in which every em-dash is replaced by a single space
        and every digit, underscore and backslash is removed.
    """
    text = text.lower()
    # an em-dash joins two separate words ("nutrients—and"); deleting it would
    # fuse them into one bogus token, so swap it for a space instead
    text = text.replace('—', ' ')
    # digits, underscores and backslashes are dropped outright
    text = re.sub('([0-9_\\\\])', '', text)
    return text

In [18]:
### assumes the text content is in a column called 'TEXT'
def add_dtm(explainer_df, min_df=5):
  """Append a document-term matrix (DTM) to the article dataframe.

  Parameters
  ----------
  explainer_df : pandas.DataFrame
      Must contain a 'TEXT' column with one document per row.
  min_df : int or float, optional
      Passed to CountVectorizer: terms found in fewer documents than this
      are left out of the vocabulary. Defaults to 5.

  Returns
  -------
  pandas.DataFrame
      A new dataframe: the columns of `explainer_df` followed by one count
      column per vocabulary term. `explainer_df` itself is not modified.
  """
  # using CountVectorizer to make a DTM based on the words in the corpus
  vectorizer = CountVectorizer(input='content', preprocessor=my_preprocessor, stop_words='english', min_df=min_df, encoding='utf8')
  dtm = vectorizer.fit_transform(explainer_df['TEXT'])
  vocab = vectorizer.get_feature_names_out()
  matrix = dtm.toarray()

  # give the DTM the caller's index: pd.concat(axis=1) aligns rows by index
  # label, so a default 0..n-1 index here would misalign (and NaN-pad) any
  # dataframe that has been filtered or re-indexed
  DTM = pd.DataFrame(matrix, columns=vocab, index=explainer_df.index)

  # attaching the DTM to the original dataframe
  dtm_both = pd.concat([explainer_df, DTM], axis=1)

  return dtm_both

## Testing

In [14]:
# reads each file named in FILENAME and stores its contents in a new TEXT column
explainer_df = load_article_content(explainer_df)

In [19]:
# NOTE: not idempotent — add_dtm concatenates the term columns onto whatever frame it receives,
# so re-running this cell without rebuilding explainer_df appends a second copy of every term column
explainer_df = add_dtm(explainer_df)

In [21]:
# preview the combined table: metadata columns, TEXT, then one count column per vocabulary term
explainer_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL,TEXT,abandoned,abilities,ability,able,...,zone,zones,zoo,zoological,zoologist,zoology,zoom,zoos,önder,örtel
0,ATLANTIC/HowToSuccessfullySmashYourFace.txt,How to Successfully Smash Your Face Against a ...,ATLANTIC,2022-07-14,https://www.theatlantic.com/science/archive/20...,A new study refutes the widespread idea that w...,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
1,ATLANTIC/WillCovidsSpringLullLast.txt,Will COVID’s Spring Lull Last?,ATLANTIC,2023-05-01,https://www.theatlantic.com/science/archive/20...,Things look calm right now. They may even stay...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ATLANTIC/TeenBrainsArePerfectlyCapable.txt,Teen Brains Are Perfectly Capable,ATLANTIC,2023-04-30,https://www.theatlantic.com/science/archive/20...,Teenagers have plenty of cognitive control. Th...,0,1,5,2,...,0,0,0,0,0,0,0,0,0,0
3,ATLANTIC/TheFishHadGillsFullOf.txt,The Fish Had Gills Full of Ash and Gas Bubblin...,ATLANTIC,2023-04-29,https://www.theatlantic.com/science/archive/20...,How volcanoes kill fish is something out of a ...,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,ATLANTIC/LushPrairiesCouldReallyBeGreen.txt,Lush Prairies Could Really Be ‘Green Deserts’,ATLANTIC,2023-04-23,https://www.theatlantic.com/science/archive/20...,Climate change is stripping plants of their nu...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,REUTERS/PlasticEnteringOceansCouldNearlyTriple...,Plastic entering oceans could nearly triple by...,REUTERS,2023-03-08,https://www.reuters.com/business/environment/p...,[1/2] A plastic bottle is seen floating in an ...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
356,REUTERS/FukushimaWastewaterReleaseWouldHaveLim...,Fukushima wastewater release would have limite...,REUTERS,2023-02-16,https://www.reuters.com/world/asia-pacific/fuk...,"SEOUL, Feb 16 (Reuters) - The release of waste...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
357,REUTERS/MoreThanHalfOfTheWorlds.txt,More than half of the world's large lakes are ...,REUTERS,2023-05-18,https://www.reuters.com/business/environment/m...,"LONDON, May 18 (Reuters) - More than half of t...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
358,REUTERS/AmputeesCouldFeelWarmthOfHuman.txt,Amputees could feel warmth of human touch with...,REUTERS,2023-05-18,https://www.reuters.com/business/healthcare-ph...,"GENEVA, May 18 (Reuters) - Fabrizio Fidati, wh...",0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
