<a href="https://colab.research.google.com/github/faithrts/Science_Explainers/blob/main/science_explainer_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
### importing libraries

# basic libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# to download files
from google.colab import files

import re
import codecs

# sklearn libraries for ML
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# NLTK
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import SnowballStemmer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Importing databases

In [2]:
### cloning git repo

!git clone https://github.com/faithrts/Science_Explainers

Cloning into 'Science_Explainers'...
remote: Enumerating objects: 528, done.[K
remote: Counting objects: 100% (422/422), done.[K
remote: Compressing objects: 100% (374/374), done.[K
remote: Total 528 (delta 78), reused 367 (delta 47), pack-reused 106[K
Receiving objects: 100% (528/528), 1.38 MiB | 5.57 MiB/s, done.
Resolving deltas: 100% (81/81), done.


In [3]:
### saving url database into dataframe

explainer_df = pd.read_csv('Science_Explainers/science_explainers_database.csv')

In [4]:
explainer_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL
0,ATLANTIC/HowToSuccessfullySmashYourFace.txt,How to Successfully Smash Your Face Against a ...,ATLANTIC,2022-07-14,https://www.theatlantic.com/science/archive/20...
1,ATLANTIC/WillCovidsSpringLullLast.txt,Will COVID’s Spring Lull Last?,ATLANTIC,2023-05-01,https://www.theatlantic.com/science/archive/20...
2,ATLANTIC/TeenBrainsArePerfectlyCapable.txt,Teen Brains Are Perfectly Capable,ATLANTIC,2023-04-30,https://www.theatlantic.com/science/archive/20...
3,ATLANTIC/TheFishHadGillsFullOf.txt,The Fish Had Gills Full of Ash and Gas Bubblin...,ATLANTIC,2023-04-29,https://www.theatlantic.com/science/archive/20...
4,ATLANTIC/LushPrairiesCouldReallyBeGreen.txt,Lush Prairies Could Really Be ‘Green Deserts’,ATLANTIC,2023-04-23,https://www.theatlantic.com/science/archive/20...
...,...,...,...,...,...
355,REUTERS/PlasticEnteringOceansCouldNearlyTriple...,Plastic entering oceans could nearly triple by...,REUTERS,2023-03-08,https://www.reuters.com/business/environment/p...
356,REUTERS/FukushimaWastewaterReleaseWouldHaveLim...,Fukushima wastewater release would have limite...,REUTERS,2023-02-16,https://www.reuters.com/world/asia-pacific/fuk...
357,REUTERS/MoreThanHalfOfTheWorlds.txt,More than half of the world's large lakes are ...,REUTERS,2023-05-18,https://www.reuters.com/business/environment/m...
358,REUTERS/AmputeesCouldFeelWarmthOfHuman.txt,Amputees could feel warmth of human touch with...,REUTERS,2023-05-18,https://www.reuters.com/business/healthcare-ph...


In [6]:
### unzipping fiction novels data (database and df)

# NOTE(review): txt_lab_data.zip is read from the current working directory, but
# no visible cell puts it there (the execution counts jump from 4 to 6) — confirm
# where the archive comes from so the notebook survives Restart & Run All
!unzip txt_lab_data.zip
fiction_df = pd.read_csv('txt_lab_data/txt_lab_novels.csv')

Archive:  txt_lab_data.zip
   creating: txt_lab_data/
  inflating: __MACOSX/._txt_lab_data  
  inflating: txt_lab_data/.DS_Store  
  inflating: __MACOSX/txt_lab_data/._.DS_Store  
   creating: txt_lab_data/txt_lab_novels/
  inflating: __MACOSX/txt_lab_data/._txt_lab_novels  
  inflating: txt_lab_data/txt_lab_novels.csv  
  inflating: __MACOSX/txt_lab_data/._txt_lab_novels.csv  
  inflating: txt_lab_data/txt_lab_novels/EN_1800_Edgeworth,Maria_CastleRackrent_Novel.txt  
  inflating: __MACOSX/txt_lab_data/txt_lab_novels/._EN_1800_Edgeworth,Maria_CastleRackrent_Novel.txt  
  inflating: txt_lab_data/txt_lab_novels/EN_1854_Gaskell,Elizabeth_NorthandSouth_Novel.txt  
  inflating: __MACOSX/txt_lab_data/txt_lab_novels/._EN_1854_Gaskell,Elizabeth_NorthandSouth_Novel.txt  
  inflating: txt_lab_data/txt_lab_novels/EN_1860_Collins,Wilkie_TheWomaninWhite_Novel.txt  
  inflating: __MACOSX/txt_lab_data/txt_lab_novels/._EN_1860_Collins,Wilkie_TheWomaninWhite_Novel.txt  
  inflating: txt_lab_data/txt_la

In [7]:
explainer_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL
0,ATLANTIC/HowToSuccessfullySmashYourFace.txt,How to Successfully Smash Your Face Against a ...,ATLANTIC,2022-07-14,https://www.theatlantic.com/science/archive/20...
1,ATLANTIC/WillCovidsSpringLullLast.txt,Will COVID’s Spring Lull Last?,ATLANTIC,2023-05-01,https://www.theatlantic.com/science/archive/20...
2,ATLANTIC/TeenBrainsArePerfectlyCapable.txt,Teen Brains Are Perfectly Capable,ATLANTIC,2023-04-30,https://www.theatlantic.com/science/archive/20...
3,ATLANTIC/TheFishHadGillsFullOf.txt,The Fish Had Gills Full of Ash and Gas Bubblin...,ATLANTIC,2023-04-29,https://www.theatlantic.com/science/archive/20...
4,ATLANTIC/LushPrairiesCouldReallyBeGreen.txt,Lush Prairies Could Really Be ‘Green Deserts’,ATLANTIC,2023-04-23,https://www.theatlantic.com/science/archive/20...
...,...,...,...,...,...
355,REUTERS/PlasticEnteringOceansCouldNearlyTriple...,Plastic entering oceans could nearly triple by...,REUTERS,2023-03-08,https://www.reuters.com/business/environment/p...
356,REUTERS/FukushimaWastewaterReleaseWouldHaveLim...,Fukushima wastewater release would have limite...,REUTERS,2023-02-16,https://www.reuters.com/world/asia-pacific/fuk...
357,REUTERS/MoreThanHalfOfTheWorlds.txt,More than half of the world's large lakes are ...,REUTERS,2023-05-18,https://www.reuters.com/business/environment/m...
358,REUTERS/AmputeesCouldFeelWarmthOfHuman.txt,Amputees could feel warmth of human touch with...,REUTERS,2023-05-18,https://www.reuters.com/business/healthcare-ph...


In [8]:
fiction_df

Unnamed: 0,filename,id,language,date,author,title,gender,person,length
0,"EN_1771_Mackenzie,Henry_TheManofFeeling_Novel.txt",151,English,1771,"Mackenzie,Henry",TheManofFeeling,male,first,36458
1,"EN_1771_Smollett,Tobias_TheExpedictionofHenryC...",152,English,1771,"Smollett,Tobias",TheExpedictionofHenryClinker,male,first,148261
2,"EN_1778_Burney,Fanny_Evelina_Novel.txt",153,English,1778,"Burney,Fanny",Evelina,female,first,154168
3,"EN_1782_Burney,Fanny_Cecilia_Novel.txt",154,English,1782,"Burney,Fanny",Cecilia,female,third,328981
4,"EN_1786_Beckford,William_Vathek_Novel.txt",155,English,1786,"Beckford,William",Vathek,male,third,36077
...,...,...,...,...,...,...,...,...,...
145,"EN_1922_Joyce,James_Ulysses_Novel.txt",296,English,1922,"Joyce,James",Ulysses,male,first,264170
146,"EN_1925_Woolf,Virginia_Mrs.Dalloway_Novel.txt",297,English,1925,"Woolf,Virginia",Mrs.Dalloway,female,third,63367
147,"EN_1927_Woolf,Virginia_TotheLighthouse_Novel.txt",298,English,1927,"Woolf,Virginia",TotheLighthouse,female,third,69276
148,"EN_1928_Woolf,Virginia_Orlando_Novel.txt",299,English,1928,"Woolf,Virginia",Orlando,female,third,78380


## Helper functions

In [9]:
### turns all column names to upper case
def uppercase_columns(df):
  """Upper-case every column label of ``df`` in place.

  The frame itself is modified and nothing is returned. The file also defines
  ``col_names_to_uppercase``, which does the same and returns the frame.
  """
  df.columns = [label.upper() for label in df.columns]

In [10]:
### counts the characters of each text and adds the count as a 'LENGTH' column
def count_text_length(df):
  """Add a 'LENGTH' column holding ``len()`` of each row's 'TEXT' value.

  For string text this is the number of characters (not the number of words).
  The frame is modified in place and nothing is returned.

  The column is assigned in one step rather than through ``iterrows()``: the
  rows yielded by ``iterrows()`` can be copies, so values written to them are
  not guaranteed to reach the dataframe.
  """
  df['LENGTH'] = df['TEXT'].map(len)

In [11]:
def load_text_content(df, path):
  """Read the file named in each row's 'FILENAME' into a new 'TEXT' column.

  Parameters:
    df: dataframe with a 'FILENAME' column; it is modified in place.
    path: prefix that is string-concatenated with each filename, so it must
      already end with a path separator.

  Returns:
    The same dataframe object, now carrying the 'TEXT' column.
  """
  texts = []

  for cur_filename in df['FILENAME']:
    # the context manager closes each file as soon as it has been read
    with codecs.open(path + cur_filename, 'r', encoding = 'utf8') as text_file:
      texts.append(text_file.read())

  # saving the texts in the dataframe
  df['TEXT'] = texts

  return df

In [12]:
### custom pre-processor to eliminate numbers and instances of "_", "\", and "—"
def my_preprocessor(text):
    """Lower-case ``text`` and delete every digit, underscore, backslash and em dash."""
    lowered = text.lower()
    return re.sub('([0-9—_\\\\])', '', lowered)

In [13]:
### makes all the column names UPPERCASE
def col_names_to_uppercase(df):
  """Upper-case every column label of ``df`` in place and return the same frame.

  Same effect as ``uppercase_columns`` in this file, except that the frame is
  also returned.
  """
  df.columns = list(map(lambda name: name.upper(), df.columns))
  return df

In [14]:
def add_dtm(df, focus_col):
  """Append a document-term matrix built from ``df[focus_col]`` to ``df``.

  The counts come from a CountVectorizer that uses ``my_preprocessor``, the
  built-in English stop-word list and ``min_df = 5`` (terms found in fewer than
  five documents are dropped).

  Parameters:
    df: dataframe holding one document per row.
    focus_col: name of the column containing the document strings.

  Returns:
    A new dataframe: the original columns followed by one count column per
    retained term.
  """

  # using CountVectorizer to make a DTM based on the words in the corpus
  vectorizer = CountVectorizer(input = 'content', preprocessor = my_preprocessor, stop_words = 'english', min_df = 5, encoding = 'utf8')
  dtm = vectorizer.fit_transform(df[focus_col])
  words = vectorizer.get_feature_names_out()

  # reusing df's index keeps each count row next to its own document when the
  # frames are concatenated; a fresh 0..n-1 index would be aligned by label and
  # produce NaN rows for any dataframe that is not indexed 0..n-1
  DTM = pd.DataFrame(dtm.toarray(), columns = words, index = df.index)

  # attaching the DTM to the original dataframe
  return pd.concat([df, DTM], axis = 1)

In [15]:
### by default the text content is read from a column called 'TEXT'
def add_tf_idf(df, focus_col = 'TEXT'):
  """Append the tf-idf value of every retained term to ``df``.

  The values come from a TfidfVectorizer that uses ``my_preprocessor``, the
  built-in English stop-word list and ``min_df = 5`` (terms found in fewer than
  five documents are dropped). The vectorizer is fitted on this dataframe only.

  Parameters:
    df: dataframe holding one document per row.
    focus_col: name of the column containing the document strings
      (defaults to 'TEXT').

  Returns:
    A new dataframe: the original columns followed by one tf-idf column per
    retained term.
  """

  # using TfidfVectorizer to compute the tf-idf values of each word
  vectorizer = TfidfVectorizer(input = 'content', preprocessor = my_preprocessor, stop_words = 'english', min_df = 5, encoding = 'utf8')
  tf_idf = vectorizer.fit_transform(df[focus_col])
  words = vectorizer.get_feature_names_out()

  # converting the sparse matrix to a dense frame; reusing df's index keeps each
  # tf-idf row next to its own document when the frames are concatenated (a
  # fresh 0..n-1 index would be aligned by label and leave NaN rows otherwise)
  TF_IDF = pd.DataFrame(tf_idf.toarray(), columns = words, index = df.index)

  # attaches the tf-idf to the original dataframe
  return pd.concat([df, TF_IDF], axis = 1)

In [16]:
### assumes the POS tags are in a column called 'POS TAGS'
def count_pos_tags(df):
  """Count how often each POS tag occurs across the 'POS TAGS' column.

  Parameters:
    df: dataframe whose 'POS TAGS' column holds one iterable of tags per row.

  Returns:
    A tuple ``(tag_counts, sorted_tags)``: a Counter mapping tag -> number of
    occurrences, and the list of distinct tags in reverse alphabetical order
    (the tags themselves are sorted, not their counts).
  """
  # feeding the counter one row at a time avoids building one huge concatenated
  # list and also works for an empty dataframe
  tag_counts = Counter()
  for row_tags in df['POS TAGS']:
    tag_counts.update(row_tags)

  # sorts the distinct POS tags (reverse alphabetical)
  sorted_tag_counts = sorted(tag_counts, reverse = True)

  return tag_counts, sorted_tag_counts

In [35]:

### assumes the text content is in a column called 'TEXT'
def add_pos_tags(df):
  """Return a copy of ``df`` with three POS-tag columns derived from 'TEXT'.

  Added columns:
    'POS TAG TOKENS': list of (word, tag) pairs from ``pos_tag(word_tokenize(text))``.
    'POS TAGS': list with only the tags, in the same order.
    'POS TAGS STRING': the tags joined by single spaces.

  The input dataframe is left untouched.

  The columns are built with ``Series.map`` rather than by writing to the rows
  yielded by ``iterrows()``: those rows can be copies, so values assigned to
  them are not guaranteed to reach the dataframe.
  """

  new_df = df.copy()

  new_df['POS TAG TOKENS'] = new_df['TEXT'].map(lambda text: pos_tag(word_tokenize(text)))
  new_df['POS TAGS'] = new_df['POS TAG TOKENS'].map(lambda pairs: [tag for _, tag in pairs])
  new_df['POS TAGS STRING'] = new_df['POS TAGS'].map(' '.join)

  return new_df

In [30]:
### assumes the word matrix starts right after the metadata columns
### (8 of them for the explainer frames, 11 for the fiction frames)
def sort_top_words(matrix_df, df_type):
  """Rank the word columns of a tf-idf/DTM dataframe by their summed value.

  Parameters:
    matrix_df: dataframe whose leading columns are metadata and whose remaining
      columns hold one numeric value per word.
    df_type: 'explainer' or 'fiction'; selects how many leading metadata
      columns are skipped.

  Returns:
    A Series indexed by word, sorted from highest to lowest, holding each
    column's sum divided by the number of word columns; ``None`` when
    ``df_type`` is not recognised.
  """
  # number of leading metadata columns in each kind of frame built by this notebook
  metadata_column_counts = {
    # Unnamed: 0, FILENAME, TITLE, SOURCE, DATE PUBLISHED, URL, TEXT, LENGTH
    'explainer': 8,
    # Unnamed: 0, FILENAME, ID, LANGUAGE, DATE, AUTHOR, TITLE, GENDER, PERSON, LENGTH, TEXT
    'fiction': 11,
  }

  if df_type not in metadata_column_counts:
    return None

  # isolates for the matrix
  matrix = matrix_df.iloc[:, metadata_column_counts[df_type]:]

  # sums the values, result is a Pandas series
  sum_values = matrix.sum()

  # divides the sums by the number of word columns (a constant, so the
  # ranking is the same as ranking by the raw sums)
  sum_values = sum_values / len(sum_values)

  return sum_values.sort_values(ascending = False)

In [19]:
class StemWords(BaseEstimator, TransformerMixin):
  """Transformer that replaces every token of each passage with its Snowball (English) stem."""

  def __init__(self):
    pass

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns the transformer itself."""
    return self

  def transform(self, list_of_passages):
    """Tokenize each passage, stem every token and re-join the stems with single spaces.

    Returns a list with one stemmed string per input passage, in the same order.
    """
    # a new stemmer is created on every call
    stemmer = SnowballStemmer('english')

    return [
      ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(passage)])
      for passage in list_of_passages
    ]

In [20]:
def refine_df_columns(list_of_titles, df):
  """Return ``df`` restricted to the columns named in ``list_of_titles``, in that order."""
  return df[list_of_titles]

In [21]:
def convert_df_to_data(df):
  """Return the cell values of ``df`` as a list of lists, one inner list per row."""
  cell_values = df.values
  return cell_values.tolist()

In [22]:
def create_and_train_model(x_train, y_train):
  """Fit a logistic regression with l2 regularization on the training data.

  Parameters:
    x_train: training feature rows.
    y_train: training labels, one per feature row.

  Returns:
    The fitted LogisticRegression model.
  """
  classifier = LogisticRegression(penalty = 'l2')
  return classifier.fit(x_train, y_train)

In [23]:
def test_model(x_test, y_test, model):
  """Evaluate ``model`` on the held-out data.

  Parameters:
    x_test: test feature rows.
    y_test: true labels for the test rows.
    model: fitted estimator exposing ``predict``.

  Returns:
    A tuple ``(report, accuracy)``: the classification report as a dict
    (classes displayed as 'e' and 'f') and the accuracy as a percentage
    rounded to two decimals.
  """
  report = classification_report(
    y_test,
    model.predict(x_test),
    target_names = ['e', 'f'],
    output_dict = True,
  )

  # the report stores accuracy as a fraction; convert to a percentage
  accuracy = round(report.get('accuracy') * 100, 2)

  return report, accuracy

## Testing

In [24]:
### adding the text of each article as a column in the dataframe

# science explainers: read every article file from the cloned repo into a
# 'TEXT' column, then add the 'LENGTH' column
explainer_df = load_text_content(explainer_df, 'Science_Explainers/txt_files/')
count_text_length(explainer_df)

# fiction: the CSV headers are lower case, so they are upper-cased first to give
# load_text_content the 'FILENAME' column it reads; this frame already carries a
# length column from the CSV, so count_text_length is not called here
uppercase_columns(fiction_df)
fiction_df = load_text_content(fiction_df, 'txt_lab_data/txt_lab_novels/')

In [25]:
### adding the tf-idf values to the dataframes

# each call fits its own TfidfVectorizer, so the vocabulary and the idf weights
# are computed separately for the two corpora
# NOTE(review): tf-idf values from separately fitted vectorizers are later fed to
# one classifier — confirm that comparing them across corpora is intended
explainer_tfidf_df = add_tf_idf(explainer_df)
fiction_tfidf_df = add_tf_idf(fiction_df)

In [26]:
explainer_tfidf_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL,TEXT,LENGTH,abandoned,abilities,ability,...,zone,zones,zoo,zoological,zoologist,zoology,zoom,zoos,önder,örtel
0,ATLANTIC/HowToSuccessfullySmashYourFace.txt,How to Successfully Smash Your Face Against a ...,ATLANTIC,2022-07-14,https://www.theatlantic.com/science/archive/20...,A new study refutes the widespread idea that w...,7105,0.0,0.000000,0.000000,...,0.0,0.0,0.037932,0.049662,0.0,0.0,0.0,0.0,0.0,0.0
1,ATLANTIC/WillCovidsSpringLullLast.txt,Will COVID’s Spring Lull Last?,ATLANTIC,2023-05-01,https://www.theatlantic.com/science/archive/20...,Things look calm right now. They may even stay...,8165,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,ATLANTIC/TeenBrainsArePerfectlyCapable.txt,Teen Brains Are Perfectly Capable,ATLANTIC,2023-04-30,https://www.theatlantic.com/science/archive/20...,Teenagers have plenty of cognitive control. Th...,17450,0.0,0.015754,0.052849,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,ATLANTIC/TheFishHadGillsFullOf.txt,The Fish Had Gills Full of Ash and Gas Bubblin...,ATLANTIC,2023-04-29,https://www.theatlantic.com/science/archive/20...,How volcanoes kill fish is something out of a ...,5091,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,ATLANTIC/LushPrairiesCouldReallyBeGreen.txt,Lush Prairies Could Really Be ‘Green Deserts’,ATLANTIC,2023-04-23,https://www.theatlantic.com/science/archive/20...,Climate change is stripping plants of their nu...,12049,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,REUTERS/PlasticEnteringOceansCouldNearlyTriple...,Plastic entering oceans could nearly triple by...,REUTERS,2023-03-08,https://www.reuters.com/business/environment/p...,[1/2] A plastic bottle is seen floating in an ...,3601,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
356,REUTERS/FukushimaWastewaterReleaseWouldHaveLim...,Fukushima wastewater release would have limite...,REUTERS,2023-02-16,https://www.reuters.com/world/asia-pacific/fuk...,"SEOUL, Feb 16 (Reuters) - The release of waste...",3690,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
357,REUTERS/MoreThanHalfOfTheWorlds.txt,More than half of the world's large lakes are ...,REUTERS,2023-05-18,https://www.reuters.com/business/environment/m...,"LONDON, May 18 (Reuters) - More than half of t...",4408,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
358,REUTERS/AmputeesCouldFeelWarmthOfHuman.txt,Amputees could feel warmth of human touch with...,REUTERS,2023-05-18,https://www.reuters.com/business/healthcare-ph...,"GENEVA, May 18 (Reuters) - Fabrizio Fidati, wh...",4509,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
### finding the top words of each corpus based on TF-IDF, and taking the words at their intersection

# [:1001] keeps the first 1001 words of each ranking
# NOTE(review): presumably the top 1000 were intended — confirm
explainer_top_words = sort_top_words(explainer_tfidf_df, 'explainer').index[:1001]
fiction_top_words = sort_top_words(fiction_tfidf_df, 'fiction').index[:1001]

# words that appear in both top lists (set intersection, so the order is arbitrary)
common_words = list(set(explainer_top_words).intersection(set(fiction_top_words)))

# dfs with only the common words as column titles (same column order in both,
# because the same list is used to select them)
explainer_common_words_df = refine_df_columns(common_words, explainer_tfidf_df)
fiction_common_words_df = refine_df_columns(common_words, fiction_tfidf_df)

In [37]:
### POS tags

# adds POS tags to dataframes
explainer_pos_df = add_pos_tags(explainer_df)
fiction_pos_df = add_pos_tags(fiction_df)

# tag counts
explainer_tag_counts, explainer_sorted_tag_counts = count_pos_tags(explainer_pos_df)
fiction_tag_counts, fiction_sorted_tag_counts = count_pos_tags(fiction_pos_df)

# making sure we only take tags that both corpora have, though both should have nearly all
# NOTE(review): common_tags is not used by any later line in this cell
common_tags = set(explainer_sorted_tag_counts).intersection(set(fiction_sorted_tag_counts))

# document-term matrices built from the POS-tag strings, attached to the POS dataframes;
# they are not restricted to common_tags
# NOTE(review): add_dtm lower-cases the tag strings and applies the English
# stop-word list and min_df = 5, so the resulting columns are lower case and
# some tags may be dropped — verify this is intended for POS tags
explainer_common_tags_df = add_dtm(explainer_pos_df, 'POS TAGS STRING')
fiction_common_tags_df = add_dtm(fiction_pos_df, 'POS TAGS STRING')

In [106]:
### logistic regression with l2 regularization trained on the tf-idf values of the common words

# fixed seed so that the train/test split, and therefore the reported accuracy,
# is the same on every run of the notebook
SPLIT_SEED = 42

# one feature row per document
explainer_data = convert_df_to_data(explainer_common_words_df)
fiction_data = convert_df_to_data(fiction_common_words_df)

# explainer rows first, then fiction rows; labels follow the same order
# ('e' = science explainer, 'f' = fiction)
total_data = explainer_data + fiction_data
total_labels = ['e'] * len(explainer_data) + ['f'] * len(fiction_data)

x_train, x_test, y_train, y_test = train_test_split(total_data, total_labels, random_state = SPLIT_SEED)

model = create_and_train_model(x_train, y_train)

report, accuracy = test_model(x_test, y_test, model)