<a href="https://colab.research.google.com/github/faithrts/Science_Explainers/blob/main/science_explainer_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
### importing libraries

# basic libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# to download files
from google.colab import files

import re
import codecs

# sklearn libraries for ML
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Importing article URL database

In [2]:
### cloning git repo

!git clone https://github.com/faithrts/Science_Explainers

Cloning into 'Science_Explainers'...
remote: Enumerating objects: 513, done.[K
remote: Counting objects: 100% (407/407), done.[K
remote: Compressing objects: 100% (359/359), done.[K
remote: Total 513 (delta 68), reused 367 (delta 47), pack-reused 106[K
Receiving objects: 100% (513/513), 1.36 MiB | 9.29 MiB/s, done.
Resolving deltas: 100% (71/71), done.


In [3]:
### saving csv file of URLs into dataframe

explainer_df = pd.read_csv('Science_Explainers/science_explainers_database.csv')

In [4]:
explainer_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL
0,ATLANTIC/HowToSuccessfullySmashYourFace.txt,How to Successfully Smash Your Face Against a ...,ATLANTIC,2022-07-14,https://www.theatlantic.com/science/archive/20...
1,ATLANTIC/WillCovidsSpringLullLast.txt,Will COVID’s Spring Lull Last?,ATLANTIC,2023-05-01,https://www.theatlantic.com/science/archive/20...
2,ATLANTIC/TeenBrainsArePerfectlyCapable.txt,Teen Brains Are Perfectly Capable,ATLANTIC,2023-04-30,https://www.theatlantic.com/science/archive/20...
3,ATLANTIC/TheFishHadGillsFullOf.txt,The Fish Had Gills Full of Ash and Gas Bubblin...,ATLANTIC,2023-04-29,https://www.theatlantic.com/science/archive/20...
4,ATLANTIC/LushPrairiesCouldReallyBeGreen.txt,Lush Prairies Could Really Be ‘Green Deserts’,ATLANTIC,2023-04-23,https://www.theatlantic.com/science/archive/20...
...,...,...,...,...,...
355,REUTERS/PlasticEnteringOceansCouldNearlyTriple...,Plastic entering oceans could nearly triple by...,REUTERS,2023-03-08,https://www.reuters.com/business/environment/p...
356,REUTERS/FukushimaWastewaterReleaseWouldHaveLim...,Fukushima wastewater release would have limite...,REUTERS,2023-02-16,https://www.reuters.com/world/asia-pacific/fuk...
357,REUTERS/MoreThanHalfOfTheWorlds.txt,More than half of the world's large lakes are ...,REUTERS,2023-05-18,https://www.reuters.com/business/environment/m...
358,REUTERS/AmputeesCouldFeelWarmthOfHuman.txt,Amputees could feel warmth of human touch with...,REUTERS,2023-05-18,https://www.reuters.com/business/healthcare-ph...


## Helper functions

In [5]:
def load_article_content(explainer_df):

  # adds new column to the dataframe
  explainer_df['TEXT'] = ''

  for index, row in explainer_df.iterrows():
    path = 'Science_Explainers/txt_files/'
    cur_filename = row['FILENAME']
    cur_article = codecs.open(path + cur_filename, 'r', encoding = 'utf8').read()

    # saving the text in the dataframe
    explainer_df.at[index, 'TEXT'] = cur_article

  return explainer_df


In [6]:
### custom pre-processor to eliminte numbers and instances of "_", "\", and "—"

def my_preprocessor(text):
    text = text.lower()
    text = re.sub('([0-9—_\\\\])', '', text)
    return text

In [7]:
### assumes the text content is in a column called 'TEXT'
def add_dtm(explainer_df):

  # using CountVectorizer to make a DTM based on the words in the corpus
  vectorizer = CountVectorizer(input = 'content', preprocessor = my_preprocessor, stop_words = 'english', min_df = 5, encoding = 'utf8')
  dtm = vectorizer.fit_transform(explainer_df['TEXT'])
  words = vectorizer.get_feature_names_out()
  matrix = dtm.toarray()

  # combining the DTM with the metadata
  DTM = pd.DataFrame(matrix, columns = words)

  # attaching the DTM to the original dataframe
  dtm_both = pd.concat([explainer_df, DTM], axis=1)

  return dtm_both

In [8]:
### assumes the text content is in a column called 'TEXT'
def add_tf_idf(explainer_df):

  # using TfidfVectorizer to add the tf-idf values of each word to the dataframe
  vectorizer = TfidfVectorizer(input = 'content', preprocessor = my_preprocessor, stop_words = 'english', min_df = 5, encoding = 'utf8')
  tf_idf = vectorizer.fit_transform(explainer_df['TEXT'])
  words = vectorizer.get_feature_names_out()

  # converting sparse matrix to an array of arrays
  matrix = tf_idf.toarray()

  # combining the tf-idf matrix with the metadata (associated words)
  TF_IDF = pd.DataFrame(matrix, columns = words)

  # attaches the tf-idf to the original dataframe
  tf_idf_both = pd.concat([explainer_df, TF_IDF], axis = 1)

  return tf_idf_both

In [9]:
### assumes the matrix starts after the 5th column
def find_top_10_words(matrix_df):
  # isoltes for the matrix
  matrix = matrix_df.iloc[:, 6:]

  # sums the values, result is a Pandas series
  sum_values = matrix.sum()

  # divides the sums by the number of words
  sum_values = sum_values/len(sum_values)

  return sum_values.sort_values(ascending = False)[0:11]

## Testing

In [10]:
# adding the text of each article as a column in the dataframe
explainer_df = load_article_content(explainer_df)

In [11]:
# adding the dtm to the df
dtm_df = add_dtm(explainer_df)

In [29]:
# adding the tf-idf to the df
tf_idf_df = add_tf_idf(explainer_df)

In [46]:
find_top_10_words(dtm_df)

said           0.251414
study          0.204072
like           0.175634
university     0.170948
new            0.170948
says           0.164809
people         0.158992
researchers    0.157538
species        0.144450
scientists     0.132493
climate        0.117790
dtype: float64

In [47]:
find_top_10_words(tf_idf_df)

said          0.002975
says          0.002206
study         0.001924
species       0.001900
university    0.001521
people        0.001493
reuters       0.001486
like          0.001479
water         0.001401
new           0.001382
climate       0.001350
dtype: float64