<a href="https://colab.research.google.com/github/faithrts/Science_Explainers/blob/main/science_explainer_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
### importing libraries

# basic libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# to download files
from google.colab import files

import os
import re
import codecs

# sklearn libraries for ML
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# NLTK
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import SnowballStemmer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Importing databases

In [None]:
### cloning git repos

# clones the science explainers repo into ./Science_Explainers; the CSV database and
# the txt_files/ folder used further down are read from that folder
!git clone https://github.com/faithrts/Science_Explainers
# NOTE(review): the Short_Fiction clone is disabled, yet later cells read
# short_fiction_database.csv and txt_files.zip from the working directory —
# presumably those files have to be uploaded by hand; confirm
#!git clone https://github.com/faithrts/Short_Fiction

Cloning into 'Short_Fiction'...
fatal: could not read Username for 'https://github.com': No such device or address


In [None]:
### saving databases into dataframes

# metadata table of the science explainers, read from the cloned repo folder
explainer_df = pd.read_csv('Science_Explainers/science_explainers_database.csv')
# metadata table of the short fiction, read from the working directory
# NOTE(review): no visible cell creates this file — presumably uploaded manually; confirm
fiction_df = pd.read_csv('short_fiction_database.csv')

In [None]:
### unzipping short fiction files

# extracts the archive into ./txt_files/, the folder the fiction texts are loaded from
# NOTE(review): txt_files.zip is expected in the working directory; no visible cell fetches it — confirm
!unzip txt_files.zip

Archive:  txt_files.zip
   creating: txt_files/
  inflating: __MACOSX/._txt_files    
   creating: txt_files/ATLANTIC/
  inflating: __MACOSX/txt_files/._ATLANTIC  
   creating: txt_files/TIN/
  inflating: __MACOSX/txt_files/._TIN  
  inflating: txt_files/.DS_Store     
  inflating: __MACOSX/txt_files/._.DS_Store  
   creating: txt_files/LIT/
  inflating: __MACOSX/txt_files/._LIT  
   creating: txt_files/GRANTA/
  inflating: __MACOSX/txt_files/._GRANTA  
   creating: txt_files/NEW_YORKER/
  inflating: __MACOSX/txt_files/._NEW_YORKER  
   creating: txt_files/NARRATIVE/
  inflating: __MACOSX/txt_files/._NARRATIVE  
   creating: txt_files/TIN_HOUSE/
  inflating: __MACOSX/txt_files/._TIN_HOUSE  
   creating: txt_files/AGNI/
  inflating: __MACOSX/txt_files/._AGNI  
   creating: txt_files/NEW/
  inflating: __MACOSX/txt_files/._NEW  
   creating: txt_files/HARPERS/
  inflating: __MACOSX/txt_files/._HARPERS  
   creating: txt_files/.ipynb_checkpoints/
  inflating: __MACOSX/txt_files/._.ipynb_ch

## Helper functions

In [None]:
def uppercase_columns(df):
  """Upper-cases every column label of ``df``.

  The frame is modified in place and nothing is returned.
  """
  upper_labels = []
  for label in df.columns:
    upper_labels.append(label.upper())

  df.columns = upper_labels

In [None]:
def count_text_length(df):
  """Adds a 'LENGTH' column holding the length of each row's 'TEXT'.

  The length is ``len(text)``, i.e. the number of characters in the text
  (not the number of words).

  df: dataframe with a 'TEXT' column of strings

  The frame is modified in place and nothing is returned.
  """
  # computed on the column and assigned to the frame itself: values written
  # into the rows yielded by iterrows() are not guaranteed to reach the frame
  df['LENGTH'] = df['TEXT'].map(len)

In [None]:
def load_text_content(df, path):
  """Reads the file named in each row's 'FILENAME' into a new 'TEXT' column.

  df: dataframe with a 'FILENAME' column
  path: folder prefix, prepended as-is to every filename (so it needs its
        trailing separator)

  The 'TEXT' column is added to ``df`` itself and the same frame is returned.
  """
  import unicodedata

  # adds new column to the dataframe
  df['TEXT'] = ''

  for index, cur_filename in df['FILENAME'].items():
    cur_path = path + cur_filename

    # accented filenames can sit on disk in a different Unicode normalization
    # form (composed vs decomposed) than the one stored in the dataframe; when
    # the exact name is missing, the differently-normalized file is renamed to it
    # NOTE(review): generalizes the former hard-coded 'í' / 'é' cases — confirm
    if not os.path.isfile(cur_path):
      for form in ('NFD', 'NFC'):
        candidate = path + unicodedata.normalize(form, cur_filename)
        if candidate != cur_path and os.path.isfile(candidate):
          os.rename(candidate, cur_path)
          break

    # the with-block closes the file handle once the text has been read
    with codecs.open(cur_path, 'r', encoding = 'utf8') as cur_file:
      cur_article = cur_file.read()

    # saving the text in the dataframe
    df.at[index, 'TEXT'] = cur_article

  return df

In [None]:
def my_preprocessor(text):
    """Custom pre-processor: lower-cases the text and eliminates digits
    and every instance of "_", "\\" and "—"."""
    lowered = text.lower()
    return re.sub('([0-9—_\\\\])', '', lowered)

In [None]:
def col_names_to_uppercase(df):
  """Makes all the column names of ``df`` UPPERCASE.

  The labels are replaced on the frame that was passed in, and that same
  frame is returned.
  """
  df.columns = list(map(lambda name: name.upper(), df.columns))
  return df

In [None]:
def add_dtm(df, focus_col):
  """Appends a document-term matrix (DTM) of ``df[focus_col]`` to ``df``.

  df: dataframe holding one document per row
  focus_col: name of the column containing the text to vectorize

  Returns a new dataframe: the original columns followed by one count
  column per word kept by the vectorizer. ``df`` itself is left unchanged.
  """

  # using CountVectorizer to make a DTM based on the words in the corpus
  vectorizer = CountVectorizer(input = 'content', preprocessor = my_preprocessor, stop_words = 'english', min_df = 5, encoding = 'utf8')
  dtm = vectorizer.fit_transform(df[focus_col])
  words = vectorizer.get_feature_names_out()

  # the matrix rows are in the same order as df's rows; reusing df.index keeps
  # each count row attached to its own document when concat aligns on the index
  DTM = pd.DataFrame(dtm.toarray(), columns = words, index = df.index)

  # attaching the DTM to the original dataframe
  return pd.concat([df, DTM], axis = 1)

In [None]:
def add_tf_idf(df, focus_col, keep_punc = False):
  """Appends the tf-idf values of ``df[focus_col]`` to ``df``.

  df: dataframe holding one document per row
  focus_col: name of the column containing the text to vectorize
  keep_punc: when True, my_preprocessor is not applied and the vectorizer's
             own default preprocessing is used instead

  Returns a new dataframe: the original columns followed by one tf-idf
  column per term kept by the vectorizer. ``df`` itself is left unchanged.

  NOTE(review): with keep_punc = True the text still goes through the
  vectorizer's default tokenization and the English stop-word list, so
  punctuation-only tokens are presumably still dropped — confirm this
  matches the intent of the flag.
  """

  # settings shared by both modes; the custom preprocessor is only added
  # when punctuation does not need to be kept
  vectorizer_options = dict(input = 'content', stop_words = 'english', min_df = 5, encoding = 'utf8')
  if not keep_punc:
    vectorizer_options['preprocessor'] = my_preprocessor
  vectorizer = TfidfVectorizer(**vectorizer_options)

  tf_idf = vectorizer.fit_transform(df[focus_col])
  words = vectorizer.get_feature_names_out()

  # the matrix rows are in the same order as df's rows; reusing df.index keeps
  # each tf-idf row attached to its own document when concat aligns on the index
  TF_IDF = pd.DataFrame(tf_idf.toarray(), columns = words, index = df.index)

  # attaches the tf-idf to the original dataframe
  return pd.concat([df, TF_IDF], axis = 1)

In [None]:
def count_pos_tags(df):
  """Counts how often each POS tag occurs across the whole dataframe.

  Assumes the POS tags are in a column called 'POS TAGS', each cell holding
  the list of tags of one text.

  Returns a tuple (tag_counts, sorted_tag_counts):
    tag_counts: Counter mapping each tag to its number of occurrences
    sorted_tag_counts: the distinct tags sorted in reverse alphabetical
                       order (by tag name, not by frequency)
  """
  # counting straight from the rows avoids concatenating every list into one
  # big list first, and an empty column simply gives an empty Counter
  tag_counts = Counter(tag for row_tags in df['POS TAGS'] for tag in row_tags)

  # sorts the POS tags
  sorted_tag_counts = sorted(tag_counts, reverse = True)

  return tag_counts, sorted_tag_counts

In [None]:
def add_pos_tags(df):
  """Returns a copy of ``df`` with three part-of-speech columns added.

  Assumes the text content is in a column called 'TEXT'.

  Added columns, one value per text:
    'POS TAG TOKENS': list of (word, tag) pairs from pos_tag
    'POS TAGS': list of the tags only
    'POS TAGS STRING': the tags joined by single spaces
  """

  new_df = df.copy()

  tagged_texts = [pos_tag(word_tokenize(cur_text)) for cur_text in new_df['TEXT']]
  tags_per_text = [[tag for word, tag in tagged] for tagged in tagged_texts]

  # the columns are assigned on the frame itself (values written into the rows
  # yielded by iterrows() are not guaranteed to reach the frame); object Series
  # built on new_df.index put each list into a single cell of its own row
  new_df['POS TAG TOKENS'] = pd.Series(tagged_texts, index = new_df.index, dtype = object)
  new_df['POS TAGS'] = pd.Series(tags_per_text, index = new_df.index, dtype = object)
  new_df['POS TAGS STRING'] = pd.Series([' '.join(tags) for tags in tags_per_text], index = new_df.index, dtype = object)

  return new_df

In [None]:
def sort_top_words(matrix_df, df_type, matrix_start = None):
  """Ranks the word columns of a tf-idf / DTM dataframe by their summed value.

  matrix_df: dataframe whose leading columns are metadata and whose
             remaining columns are the word matrix
  df_type: 'explainer' or 'fiction'; any other value returns None
  matrix_start: position of the first word column; defaults to 7, the
                number of metadata columns (FILENAME ... LENGTH) of both
                frames built in this notebook

  Returns a Series indexed by word, sorted from highest to lowest, holding
  each column's sum divided by the number of word columns.
  """
  if df_type not in ('explainer', 'fiction'):
    return None

  # both corpora carry the same 7 metadata columns; starting the fiction
  # matrix at column 10 would silently skip its first three words
  if matrix_start is None:
    matrix_start = 7

  # isolates for the matrix
  matrix = matrix_df.iloc[:, matrix_start:]

  # sums the values, result is a Pandas series
  sum_values = matrix.sum()

  # divides the sums by the number of words
  sum_values = sum_values / len(sum_values)

  return sum_values.sort_values(ascending = False)

In [None]:
class StemWords(BaseEstimator, TransformerMixin):
  """Transformer that replaces every word of every passage by its English Snowball stem."""

  def __init__(self):
    pass

  def fit(self, X, y=None):
    # nothing is learned from the data
    return self

  def transform(self, list_of_passages):
    """Returns a new list with each passage tokenized, stemmed and re-joined with spaces."""
    # initializes the stemmer
    stemmer = SnowballStemmer('english')

    def stem_passage(passage):
      # breaks the passage up into its component words and stems each one
      return ' '.join([stemmer.stem(token) for token in nltk.word_tokenize(passage)])

    return [stem_passage(passage) for passage in list_of_passages]

In [None]:
def refine_df_columns(list_of_titles, df):
  """Returns a new df made of only the columns of ``df`` named in ``list_of_titles``.

  Note the argument order: the list of column titles comes first, the
  dataframe second.
  """
  return df[list_of_titles]

In [None]:
def convert_df_to_data(df):
  """Converts the dataframe's values into a list of rows (one inner list per row)."""
  row_values = df.values
  return row_values.tolist()

In [None]:
def create_and_train_model(x_train, y_train):
  """Creates a logistic regression with l2 regularization and fits it.

  x_train: training feature rows
  y_train: training labels, one per row of ``x_train``

  Returns the fitted model.
  """
  classifier = LogisticRegression(penalty = 'l2')
  return classifier.fit(x_train, y_train)

In [None]:
def test_model(x_test, y_test, model):
  """Evaluates ``model`` on the test split.

  Returns a tuple (report, accuracy): the classification report as a dict,
  with the two classes named 'e' and 'f', and the overall accuracy as a
  percentage rounded to 2 decimals.
  """
  report = classification_report(
    y_test,
    model.predict(x_test),  # predicted labels
    target_names = ['e', 'f'],
    output_dict = True,
  )

  return report, round(report.get('accuracy') * 100, 2)

# Tests

In [None]:
### adding the text of each article as a column in the dataframe

# science explainers
# the texts are read from the cloned repo's txt_files/ folder; load_text_content
# adds 'TEXT' to the frame it is given and returns that same frame
explainer_df = load_text_content(explainer_df, 'Science_Explainers/txt_files/')
# adds the 'LENGTH' column in place (nothing is returned)
count_text_length(explainer_df)

# fiction
# the texts are read from the txt_files/ folder in the working directory
fiction_df = load_text_content(fiction_df, 'txt_files/')
count_text_length(fiction_df)

## Vocabulary test

In [None]:
### adding the tf-idf values to the dataframes

# each corpus is vectorized separately, so the two frames get their own vocabularies;
# keep_punc is left at its default, so my_preprocessor is applied to the text
explainer_tfidf_df = add_tf_idf(explainer_df, 'TEXT')
fiction_tfidf_df = add_tf_idf(fiction_df, 'TEXT')

In [None]:
# preview of the explainer frame: metadata columns followed by one tf-idf column per word
explainer_tfidf_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL,TEXT,LENGTH,aaron,abandon,abandoned,...,zoologist,zoology,zoom,zooplankton,zoos,zulian,zurich,zwicky,önder,örtel
0,ATLANTIC/HowToSuccessfullySmashYourFace.txt,How to Successfully Smash Your Face Against a ...,ATLANTIC,2022-07-14,https://www.theatlantic.com/science/archive/20...,A new study refutes the widespread idea that w...,7105,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ATLANTIC/WillCovidsSpringLullLast.txt,Will COVID’s Spring Lull Last?,ATLANTIC,2023-05-01,https://www.theatlantic.com/science/archive/20...,Things look calm right now. They may even stay...,8165,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ATLANTIC/TeenBrainsArePerfectlyCapable.txt,Teen Brains Are Perfectly Capable,ATLANTIC,2023-04-30,https://www.theatlantic.com/science/archive/20...,Teenagers have plenty of cognitive control. Th...,17450,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ATLANTIC/TheFishHadGillsFullOf.txt,The Fish Had Gills Full of Ash and Gas Bubblin...,ATLANTIC,2023-04-29,https://www.theatlantic.com/science/archive/20...,How volcanoes kill fish is something out of a ...,5091,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ATLANTIC/LushPrairiesCouldReallyBeGreen.txt,Lush Prairies Could Really Be ‘Green Deserts’,ATLANTIC,2023-04-23,https://www.theatlantic.com/science/archive/20...,Climate change is stripping plants of their nu...,12049,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,REUTERS/RightAgainEinsteinStudyShowsHow.txt,"Right again, Einstein! Study shows how antimat...",REUTERS,2023-09-27,https://www.reuters.com/science/right-again-ei...,An artist's conceptual rendering of antihydrog...,6261,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
896,REUTERS/NasaAsteroidSampleParachutesSafelyOnto...,NASA asteroid sample parachutes safely onto Ut...,REUTERS,2023-09-24,https://www.reuters.com/science/nasas-first-as...,Sept 24 (Reuters) - A NASA space capsule carry...,6277,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
897,REUTERS/MexicoResearchersShowProgressOnDrive.txt,Mexico researchers show progress on drive to r...,REUTERS,2023-09-22,https://www.reuters.com/markets/commodities/me...,[1/6]Corn cobs are pictured in a corn field at...,4909,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
898,REUTERS/ZambiaFindShowsHumansHaveBuilt.txt,Zambia find shows humans have built with wood ...,REUTERS,2023-09-20,https://www.reuters.com/science/zambia-find-sh...,[1/4]Researchers uncover wooden artefacts on t...,5854,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# preview of the fiction frame: metadata columns followed by one tf-idf column per word
fiction_tfidf_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL,TEXT,LENGTH,aa,aaron,aback,...,zipping,zombie,zone,zones,zoning,zoo,zoom,zoomed,zooming,zucchini
0,ATLANTIC/SheWhoRemembers.txt,She Who Remembers,ATLANTIC,2023-10-16,https://www.theatlantic.com/magazine/archive/2...,A short story\nThe Georgia men wake everyone i...,19220,0.0,0.0,0.0000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ATLANTIC/TheComebacker.txt,The Comebacker,ATLANTIC,2023-08-12,https://www.theatlantic.com/magazine/archive/2...,"A short story\nThe day was cold, cold even for...",35629,0.0,0.0,0.0000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ATLANTIC/ThePosting.txt,The Posting,ATLANTIC,2023-06-27,https://www.theatlantic.com/books/archive/2023...,A short story\nEverything overheard in those d...,53978,0.0,0.0,0.0179,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ATLANTIC/LatenightradioTalkshowHostTellsAll.txt,Late-Night-Radio Talk-Show Host Tells All,ATLANTIC,2023-05-29,https://www.theatlantic.com/books/archive/2023...,A short story\nDo I have rivals? Competitors? ...,12021,0.0,0.0,0.0000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ATLANTIC/TheRenovation.txt,The Renovation,ATLANTIC,2023-04-27,https://www.theatlantic.com/books/archive/2023...,A short story\nI didn’t know by what accident ...,39933,0.0,0.0,0.0000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,THE_SUN/WhenTheyCameToUs.txt,WhenTheyCameToUs,THE_SUN,2016-08-01,https://www.thesunmagazine.org/issues/488/when...,They Arrive On A Warm Summer Night With No Bre...,33019,0.0,0.0,0.0000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
896,THE_SUN/TheUnifiedConspiracyTheory.txt,TheUnifiedConspiracyTheory,THE_SUN,2016-07-01,https://www.thesunmagazine.org/issues/487/the-...,"When Jack walked into the Nite Owl Diner, I al...",30691,0.0,0.0,0.0000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
897,THE_SUN/TheyWere.txt,TheyWere,THE_SUN,2016-07-01,https://www.thesunmagazine.org/issues/487/they...,He was. She was. They met and together they: d...,3048,0.0,0.0,0.0000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
898,THE_SUN/DaysOfHumanSacrifice.txt,DaysOfHumanSacrifice,THE_SUN,2016-06-01,https://www.thesunmagazine.org/issues/486/days...,"When I was in fifth grade, my parents attached...",23536,0.0,0.0,0.0000,...,0.0,0.074227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
### finding the top words of each corpus based on TF-IDF, and taking the words at their intersection

# number of top-ranked words kept per corpus
N_TOP_WORDS = 1001

explainer_top_words = sort_top_words(explainer_tfidf_df, 'explainer').index[:N_TOP_WORDS]
fiction_top_words = sort_top_words(fiction_tfidf_df, 'fiction').index[:N_TOP_WORDS]

# words present in both top lists, kept in explainer rank order; a plain set
# intersection comes out in an arbitrary order that can change between runs,
# which would reorder the feature columns built below
fiction_top_word_set = set(fiction_top_words)
common_words = [word for word in explainer_top_words if word in fiction_top_word_set]

# dfs with only the common words as column titles
explainer_common_words_df = refine_df_columns(common_words, explainer_tfidf_df)
fiction_common_words_df = refine_df_columns(common_words, fiction_tfidf_df)

In [None]:
### logistic regression with l2 regularization trained on word frequency

explainer_data = convert_df_to_data(explainer_common_words_df)
fiction_data = convert_df_to_data(fiction_common_words_df)

# explainer rows first, then fiction rows, with labels in the same order
total_data = explainer_data + fiction_data
total_labels = ['e'] * len(explainer_data) + ['f'] * len(fiction_data)

# a fixed random_state makes the shuffle, and therefore the reported accuracy,
# identical on every run
x_train, x_test, y_train, y_test = train_test_split(total_data, total_labels, random_state = 42)

model = create_and_train_model(x_train, y_train)

report, accuracy = test_model(x_test, y_test, model)

In [None]:
# classification report of the vocabulary model, as the dict returned by test_model
report

{'e': {'precision': 0.835820895522388,
  'recall': 0.9739130434782609,
  'f1-score': 0.8995983935742972,
  'support': 230},
 'f': {'precision': 0.967032967032967,
  'recall': 0.8,
  'f1-score': 0.8756218905472637,
  'support': 220},
 'accuracy': 0.8888888888888888,
 'macro avg': {'precision': 0.9014269312776775,
  'recall': 0.8869565217391304,
  'f1-score': 0.8876101420607805,
  'support': 450},
 'weighted avg': {'precision': 0.8999690193720045,
  'recall': 0.8888888888888888,
  'f1-score': 0.8878765476499696,
  'support': 450}}

In [None]:
# accuracy (in percent) of the vocabulary model on the test split
print(f'The model was {accuracy}% accurate at distinguishing literary fiction from science explainers')

The model was 88.89% accurate at distinguishing literary fiction from science explainers


## Part-of-Speech (POS) test

In [None]:
### POS tags

# adds POS tags to dataframes
# add_pos_tags works on a copy, so explainer_df / fiction_df keep their original columns
explainer_pos_df = add_pos_tags(explainer_df)
fiction_pos_df = add_pos_tags(fiction_df)

In [None]:
### tf-idf value of each POS tag per article (weights, not raw counts)
# NOTE(review): the resulting tag columns lack IN, TO, PRP$, WP$ and every
# punctuation tag, although count_pos_tags does find them in 'POS TAGS' —
# presumably they are removed by the vectorizer's default tokenization and
# the English stop-word list; confirm whether that is intended
explainer_pos_df = add_tf_idf(explainer_pos_df, 'POS TAGS STRING', keep_punc = True)
fiction_pos_df = add_tf_idf(fiction_pos_df, 'POS TAGS STRING', keep_punc = True)

# sets all column titles (POS tags) to uppercase
explainer_pos_df = col_names_to_uppercase(explainer_pos_df)
fiction_pos_df = col_names_to_uppercase(fiction_pos_df)

In [None]:
# column labels of the explainer POS frame: metadata and POS columns first, then one column per tag
explainer_pos_df.columns

Index(['FILENAME', 'TITLE', 'SOURCE', 'DATE PUBLISHED', 'URL', 'TEXT',
       'LENGTH', 'POS TAG TOKENS', 'POS TAGS', 'POS TAGS STRING', 'CC', 'CD',
       'DT', 'EX', 'FW', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS',
       'PDT', 'POS', 'PRP', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'UH', 'VB', 'VBD',
       'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WRB'],
      dtype='object')

In [None]:
# preview of the explainer POS frame
explainer_pos_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL,TEXT,LENGTH,POS TAG TOKENS,POS TAGS,POS TAGS STRING,...,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WRB
0,ATLANTIC/HowToSuccessfullySmashYourFace.txt,How to Successfully Smash Your Face Against a ...,ATLANTIC,2022-07-14,https://www.theatlantic.com/science/archive/20...,A new study refutes the widespread idea that w...,7105,"[(A, DT), (new, JJ), (study, NN), (refutes, VB...","[DT, JJ, NN, VBZ, DT, JJ, NN, IN, NNS, VBP, JJ...",DT JJ NN VBZ DT JJ NN IN NNS VBP JJ NNS . IN P...,...,0.000000,0.178939,0.115032,0.063907,0.092665,0.189154,0.118228,0.051524,0.028190,0.043716
1,ATLANTIC/WillCovidsSpringLullLast.txt,Will COVID’s Spring Lull Last?,ATLANTIC,2023-05-01,https://www.theatlantic.com/science/archive/20...,Things look calm right now. They may even stay...,8165,"[(Things, NNS), (look, VBP), (calm, JJ), (righ...","[NNS, VBP, JJ, RB, RB, ., PRP, MD, RB, VB, IN,...",NNS VBP JJ RB RB . PRP MD RB VB IN NN PRP VBD ...,...,0.000000,0.156607,0.089490,0.075507,0.081100,0.134683,0.125845,0.028184,0.009252,0.017659
2,ATLANTIC/TeenBrainsArePerfectlyCapable.txt,Teen Brains Are Perfectly Capable,ATLANTIC,2023-04-30,https://www.theatlantic.com/science/archive/20...,Teenagers have plenty of cognitive control. Th...,17450,"[(Teenagers, NNS), (have, VBP), (plenty, NN), ...","[NNS, VBP, NN, IN, JJ, NN, ., PRP, RB, VBP, JJ...",NNS VBP NN IN JJ NN . PRP RB VBP JJ NN RB VBP ...,...,0.016278,0.184493,0.057256,0.108151,0.053439,0.228513,0.153956,0.043598,0.040691,0.056240
3,ATLANTIC/TheFishHadGillsFullOf.txt,The Fish Had Gills Full of Ash and Gas Bubblin...,ATLANTIC,2023-04-29,https://www.theatlantic.com/science/archive/20...,How volcanoes kill fish is something out of a ...,5091,"[(How, WRB), (volcanoes, NNS), (kill, VB), (fi...","[WRB, NNS, VB, JJ, VBZ, NN, IN, IN, DT, NN, NN...",WRB NNS VB JJ VBZ NN IN IN DT NN NN DT NN VBD ...,...,0.000000,0.103028,0.197096,0.049274,0.103028,0.112360,0.098548,0.004514,0.019759,0.037714
4,ATLANTIC/LushPrairiesCouldReallyBeGreen.txt,Lush Prairies Could Really Be ‘Green Deserts’,ATLANTIC,2023-04-23,https://www.theatlantic.com/science/archive/20...,Climate change is stripping plants of their nu...,12049,"[(Climate, NNP), (change, NN), (is, VBZ), (str...","[NNP, NN, VBZ, VBG, NNS, IN, PRP$, NNS, ., DT,...",NNP NN VBZ VBG NNS IN PRP$ NNS . DT MD VB DT N...,...,0.006365,0.087562,0.095522,0.069651,0.089552,0.139767,0.125372,0.028078,0.024140,0.020943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,REUTERS/RightAgainEinsteinStudyShowsHow.txt,"Right again, Einstein! Study shows how antimat...",REUTERS,2023-09-27,https://www.reuters.com/science/right-again-ei...,An artist's conceptual rendering of antihydrog...,6261,"[(An, DT), (artist, NN), ('s, POS), (conceptua...","[DT, NN, POS, JJ, NN, IN, NN, NNS, VBG, RP, DT...",DT NN POS JJ NN IN NN NNS VBG RP DT NN IN DT J...,...,0.000000,0.113181,0.113181,0.054639,0.117084,0.058737,0.081959,0.035399,0.012912,0.004107
896,REUTERS/NasaAsteroidSampleParachutesSafelyOnto...,NASA asteroid sample parachutes safely onto Ut...,REUTERS,2023-09-24,https://www.reuters.com/science/nasas-first-as...,Sept 24 (Reuters) - A NASA space capsule carry...,6277,"[(Sept, $), (24, CD), ((, (), (Reuters, NNPS),...","[$, CD, (, NNPS, ), :, DT, NNP, NN, NN, VBG, D...",$ CD ( NNPS ) : DT NNP NN NN VBG DT JJS NN NN ...,...,0.000000,0.059013,0.125402,0.084831,0.106961,0.011102,0.047948,0.014868,0.004067,0.003882
897,REUTERS/MexicoResearchersShowProgressOnDrive.txt,Mexico researchers show progress on drive to r...,REUTERS,2023-09-22,https://www.reuters.com/markets/commodities/me...,[1/6]Corn cobs are pictured in a corn field at...,4909,"[([, RB), (1/6, CD), (], JJ), (Corn, NNP), (co...","[RB, CD, JJ, NNP, NN, VBP, VBN, IN, DT, NN, NN...",RB CD JJ NNP NN VBP VBN IN DT NN NN IN NNP NNP...,...,0.000000,0.116111,0.088245,0.078956,0.069667,0.032620,0.116111,0.009361,0.005122,0.004888
898,REUTERS/ZambiaFindShowsHumansHaveBuilt.txt,Zambia find shows humans have built with wood ...,REUTERS,2023-09-20,https://www.reuters.com/science/zambia-find-sh...,[1/4]Researchers uncover wooden artefacts on t...,5854,"[([, RB), (1/4, CD), (], JJ), (Researchers, NN...","[RB, CD, JJ, NNPS, RB, JJ, NNS, IN, DT, NNS, I...",RB CD JJ NNPS RB JJ NNS IN DT NNS IN DT NNP NN...,...,0.000000,0.068576,0.153287,0.096813,0.141186,0.036426,0.080677,0.024392,0.000000,0.008491


In [None]:
# preview of the fiction POS frame
fiction_pos_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL,TEXT,LENGTH,POS TAG TOKENS,POS TAGS,POS TAGS STRING,...,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WRB
0,ATLANTIC/SheWhoRemembers.txt,She Who Remembers,ATLANTIC,2023-10-16,https://www.theatlantic.com/magazine/archive/2...,A short story\nThe Georgia men wake everyone i...,19220,"[(A, DT), (short, JJ), (story, NN), (The, DT),...","[DT, JJ, NN, DT, NNP, NNS, VBP, NN, IN, DT, JJ...",DT JJ NN DT NNP NNS VBP NN IN DT JJ NN . DT NN...,...,0.004069,0.106524,0.066344,0.058321,0.049634,0.142823,0.171970,0.007094,0.015647,0.024951
1,ATLANTIC/TheComebacker.txt,The Comebacker,ATLANTIC,2023-08-12,https://www.theatlantic.com/magazine/archive/2...,"A short story\nThe day was cold, cold even for...",35629,"[(A, DT), (short, JJ), (story, NN), (The, DT),...","[DT, JJ, NN, DT, NN, VBD, JJ, ,, JJ, RB, IN, N...","DT JJ NN DT NN VBD JJ , JJ RB IN NNP IN NNP NN...",...,0.002246,0.122258,0.386375,0.059200,0.057387,0.072103,0.051403,0.011748,0.012957,0.028608
2,ATLANTIC/ThePosting.txt,The Posting,ATLANTIC,2023-06-27,https://www.theatlantic.com/books/archive/2023...,A short story\nEverything overheard in those d...,53978,"[(A, DT), (short, JJ), (story, NN), (Everythin...","[DT, JJ, NN, NNP, NN, IN, DT, NNS, ., JJ, IN, ...","DT JJ NN NNP NN IN DT NNS . JJ IN DT NNS , PRP...",...,0.002060,0.097209,0.310787,0.088930,0.085336,0.058151,0.015512,0.009619,0.018194,0.033521
3,ATLANTIC/LatenightradioTalkshowHostTellsAll.txt,Late-Night-Radio Talk-Show Host Tells All,ATLANTIC,2023-05-29,https://www.theatlantic.com/books/archive/2023...,A short story\nDo I have rivals? Competitors? ...,12021,"[(A, DT), (short, JJ), (story, NN), (Do, NNP),...","[DT, JJ, NN, NNP, PRP, VBP, NNS, ., NNS, ., RB...",DT JJ NN NNP PRP VBP NNS . NNS . RB : DT NNS N...,...,0.000000,0.118601,0.220259,0.040315,0.104972,0.127006,0.056381,0.015034,0.025792,0.018982
4,ATLANTIC/TheRenovation.txt,The Renovation,ATLANTIC,2023-04-27,https://www.theatlantic.com/books/archive/2023...,A short story\nI didn’t know by what accident ...,39933,"[(A, DT), (short, JJ), (story, NN), (I, PRP), ...","[DT, JJ, NN, PRP, VBP, JJ, NN, VBN, IN, WP, VB...",DT JJ NN PRP VBP JJ NN VBN IN WP VBD DT NNS VB...,...,0.006267,0.144389,0.275826,0.080645,0.088461,0.095509,0.060481,0.015607,0.021086,0.026603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,THE_SUN/WhenTheyCameToUs.txt,WhenTheyCameToUs,THE_SUN,2016-08-01,https://www.thesunmagazine.org/issues/488/when...,They Arrive On A Warm Summer Night With No Bre...,33019,"[(They, PRP), (Arrive, VBP), (On, IN), (A, NNP...","[PRP, VBP, IN, NNP, NNP, NNP, NNP, IN, NNP, NN...",PRP VBP IN NNP NNP NNP NNP IN NNP NNP PRP VBD ...,...,0.002652,0.130960,0.356334,0.076035,0.061047,0.108414,0.039638,0.011230,0.032512,0.035032
896,THE_SUN/TheUnifiedConspiracyTheory.txt,TheUnifiedConspiracyTheory,THE_SUN,2016-07-01,https://www.thesunmagazine.org/issues/487/the-...,"When Jack walked into the Nite Owl Diner, I al...",30691,"[(When, WRB), (Jack, NNP), (walked, VBD), (int...","[WRB, NNP, VBD, IN, DT, NNP, NNP, NNP, ,, PRP,...","WRB NNP VBD IN DT NNP NNP NNP , PRP RB VBP JJ ...",...,0.000851,0.187005,0.370492,0.090291,0.076966,0.091370,0.038148,0.015258,0.020860,0.031909
897,THE_SUN/TheyWere.txt,TheyWere,THE_SUN,2016-07-01,https://www.thesunmagazine.org/issues/487/they...,He was. She was. They met and together they: d...,3048,"[(He, PRP), (was, VBD), (., .), (She, PRP), (w...","[PRP, VBD, ., PRP, VBD, ., PRP, VBD, CC, RB, P...",PRP VBD . PRP VBD . PRP VBD CC RB PRP : NN NN ...,...,0.000000,0.026087,0.515227,0.026262,0.137264,0.019674,0.019894,0.000000,0.013651,0.020094
898,THE_SUN/DaysOfHumanSacrifice.txt,DaysOfHumanSacrifice,THE_SUN,2016-06-01,https://www.thesunmagazine.org/issues/486/days...,"When I was in fifth grade, my parents attached...",23536,"[(When, WRB), (I, PRP), (was, VBD), (in, IN), ...","[WRB, PRP, VBD, IN, JJ, NN, ,, PRP$, NNS, VBD,...","WRB PRP VBD IN JJ NN , PRP$ NNS VBD DT NN NN I...",...,0.005954,0.137823,0.368349,0.061939,0.068243,0.067645,0.027527,0.012456,0.026616,0.026118


In [None]:
### tag counts
# each call returns (Counter of tag occurrences, list of the distinct tags sorted in reverse order),
# both computed from the 'POS TAGS' column
explainer_tag_counts, explainer_sorted_tag_counts = count_pos_tags(explainer_pos_df)
fiction_tag_counts, fiction_sorted_tag_counts = count_pos_tags(fiction_pos_df)

In [None]:
# number of occurrences of every POS tag in the explainer corpus
explainer_tag_counts

Counter({'DT': 114534,
         'JJ': 103887,
         'NN': 191954,
         'VBZ': 34136,
         'IN': 149744,
         'NNS': 98666,
         'VBP': 30750,
         '.': 51760,
         'PRP$': 12641,
         'VBG': 27742,
         ',': 70190,
         'PRP': 33910,
         'WDT': 8568,
         'MD': 14130,
         'VB': 42816,
         'CD': 21258,
         'TO': 30280,
         'RB': 49274,
         'CC': 39998,
         ':': 4335,
         'WRB': 6444,
         'VBD': 34567,
         'NNP': 106722,
         'VBN': 27744,
         'RP': 4142,
         'EX': 2179,
         'JJR': 6230,
         'WP': 4463,
         '(': 2612,
         ')': 2626,
         'RBR': 3169,
         'JJS': 3054,
         'RBS': 1009,
         'NNPS': 2039,
         'PDT': 718,
         'FW': 1021,
         'WP$': 154,
         'UH': 154,
         "''": 3848,
         'POS': 2852,
         '$': 377,
         '``': 3594,
         '#': 52,
         'SYM': 9})

In [None]:
# NOTE(review): dead cell — the code below is disabled by wrapping it in a string
# literal, so running the cell only echoes that string. It also passes
# refine_df_columns its arguments in (df, list) order, the reverse of the
# function's (list_of_titles, df) signature. The tag-count cell before it and the
# common-tags cell after it cover the same steps; consider deleting this cell.
'''
### tag counts
explainer_tag_counts, explainer_sorted_tag_counts = count_pos_tags(explainer_pos_df)
fiction_tag_counts, fiction_sorted_tag_counts = count_pos_tags(fiction_pos_df)

# making sure we only take tags that both corpora have, though both should have nearly all
common_tags = list(set(explainer_sorted_tag_counts).intersection(set(fiction_sorted_tag_counts)))

# dfs with only common words as column titles
explainer_common_tags_df = refine_df_columns(explainer_pos_df, common_tags)
fiction_common_tags_df = refine_df_columns(fiction_pos_df, common_tags)
'''

'\n### tag counts\nexplainer_tag_counts, explainer_sorted_tag_counts = count_pos_tags(explainer_pos_df)\nfiction_tag_counts, fiction_sorted_tag_counts = count_pos_tags(fiction_pos_df)\n\n# making sure we only take tags that both corpora have, though both should have nearly all\ncommon_tags = list(set(explainer_sorted_tag_counts).intersection(set(fiction_sorted_tag_counts)))\n\n# dfs with only common words as column titles\nexplainer_common_tags_df = refine_df_columns(explainer_pos_df, common_tags)\nfiction_common_tags_df = refine_df_columns(fiction_pos_df, common_tags)\n'

In [None]:
### finding the common POS tags and refining the dataframes

# the tag columns start after the 10 leading columns (FILENAME ... 'POS TAGS STRING')
N_NON_TAG_COLS = 10

# common tags
explainer_pos_tags = explainer_pos_df.columns[N_NON_TAG_COLS:]
fiction_pos_tags = fiction_pos_df.columns[N_NON_TAG_COLS:]

# tags present in both frames, kept in the explainer frame's column order; a plain
# set intersection comes out in an arbitrary order that can change between runs,
# which would reorder the feature columns built below
fiction_pos_tag_set = set(fiction_pos_tags)
common_tags = [tag for tag in explainer_pos_tags if tag in fiction_pos_tag_set]

# dfs with only the common tags as column titles
explainer_common_tags_df = refine_df_columns(common_tags, explainer_pos_df)
fiction_common_tags_df = refine_df_columns(common_tags, fiction_pos_df)

In [None]:
# preview of the explainer frame reduced to the tag columns shared with the fiction frame
explainer_common_tags_df

Unnamed: 0,PRP,VBN,DT,NN,RP,EX,NNPS,RB,UH,WDT,...,RBR,JJS,MD,NNS,WRB,VBD,VBZ,FW,CD,JJ
0,0.233260,0.092665,0.383442,0.616702,0.007321,0.008124,0.000000,0.182135,0.000000,0.051524,...,0.003739,0.003571,0.054623,0.354684,0.043716,0.115032,0.118228,0.000000,0.041632,0.306753
1,0.139828,0.081100,0.391518,0.562109,0.016017,0.007110,0.017593,0.282453,0.000000,0.028184,...,0.016362,0.021879,0.073115,0.276859,0.017659,0.089490,0.125845,0.006006,0.028028,0.444653
2,0.279920,0.053439,0.328270,0.643816,0.010202,0.025878,0.000000,0.202306,0.016278,0.043598,...,0.019356,0.001422,0.046060,0.255745,0.056240,0.057256,0.153956,0.005465,0.026779,0.358807
3,0.152302,0.103028,0.470344,0.492741,0.030787,0.017082,0.000000,0.161261,0.000000,0.004514,...,0.005242,0.000000,0.027026,0.389713,0.037714,0.197096,0.098548,0.000000,0.053873,0.367316
4,0.113432,0.089552,0.304476,0.630842,0.018237,0.015178,0.002504,0.131342,0.006365,0.028078,...,0.016301,0.004448,0.036020,0.421888,0.020943,0.095522,0.125372,0.000000,0.081773,0.376117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,0.074153,0.117084,0.355154,0.690794,0.013412,0.009922,0.034372,0.120987,0.000000,0.035399,...,0.000000,0.017448,0.035321,0.269293,0.004107,0.113181,0.081959,0.000000,0.027380,0.355154
896,0.077454,0.106961,0.405713,0.601192,0.012675,0.000000,0.023202,0.132779,0.000000,0.014868,...,0.008632,0.016489,0.011126,0.280310,0.003882,0.125402,0.047948,0.000000,0.162646,0.346700
897,0.092889,0.069667,0.278667,0.654868,0.010640,0.000000,0.035061,0.111467,0.000000,0.009361,...,0.000000,0.020763,0.009341,0.297245,0.004888,0.088245,0.116111,0.000000,0.097750,0.311178
898,0.064542,0.141186,0.431624,0.621216,0.000000,0.000000,0.025376,0.121016,0.000000,0.024392,...,0.009441,0.013525,0.012169,0.354981,0.008491,0.153287,0.080677,0.000000,0.097028,0.326744


In [None]:
### logistic regression with l2 regularization trained on the tf-idf values of the POS tags

explainer_data = convert_df_to_data(explainer_common_tags_df)
fiction_data = convert_df_to_data(fiction_common_tags_df)

# explainer rows first, then fiction rows, with labels in the same order
total_data = explainer_data + fiction_data
total_labels = ['e'] * len(explainer_data) + ['f'] * len(fiction_data)

# a fixed random_state makes the shuffle, and therefore the reported accuracy,
# identical on every run
x_train, x_test, y_train, y_test = train_test_split(total_data, total_labels, random_state = 42)

model = create_and_train_model(x_train, y_train)

report, accuracy = test_model(x_test, y_test, model)

In [None]:
# classification report of the POS-tag model, as the dict returned by test_model
report

{'e': {'precision': 0.947136563876652,
  'recall': 0.9862385321100917,
  'f1-score': 0.9662921348314607,
  'support': 218},
 'f': {'precision': 0.9865470852017937,
  'recall': 0.9482758620689655,
  'f1-score': 0.967032967032967,
  'support': 232},
 'accuracy': 0.9666666666666667,
 'macro avg': {'precision': 0.9668418245392228,
  'recall': 0.9672571970895286,
  'f1-score': 0.9666625509322139,
  'support': 450},
 'weighted avg': {'precision': 0.9674548770931696,
  'recall': 0.9666666666666667,
  'f1-score': 0.9666740749886817,
  'support': 450}}

In [None]:
# accuracy (in percent) of the POS-tag model on the test split
print(f'The model was {accuracy}% accurate at distinguishing literary fiction from science explainers')

The model was 96.67% accurate at distinguishing literary fiction from science explainers


# Top words in each corpus

In [None]:
# the first 20 entries of the ranked explainer word list
print('The top 20 words used in science explainers: ')
print(explainer_top_words[:20].tolist())

The top 20 words used in science explainers: 
['said', 'says', 'study', 'species', 'people', 'climate', 'university', 'reuters', 'like', 'scientists', 'new', 'water', 'canada', 'researchers', 'health', 'years', 'cbc', 'research', 'science', 'news']


In [None]:
# the first 20 entries of the ranked fiction word list
print('The top 20 words used in short fiction: ')
print(fiction_top_words[:20].tolist())

The top 20 words used in short fiction: 
['said', 'like', 'just', 'mother', 'time', 'know', 'didn', 'says', 'don', 'father', 'did', 'man', 'say', 'way', 'people', 'thought', 'day', 'house', 'little', 'think']
