<a href="https://colab.research.google.com/github/faithrts/Science_Explainers/blob/main/science_explainer_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [7]:
### importing libraries

# basic libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# to download files
from google.colab import files

import os
import re
import math
import codecs

# for stats
from scipy.stats import chi2

# sklearn libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# NLTK
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import SnowballStemmer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Importing databases

In [8]:
### cloning git repos

!git clone https://github.com/faithrts/Science_Explainers
!git clone https://github.com/dhmit/gender_novels
#!git clone https://github.com/faithrts/Short_Fiction

fatal: destination path 'Science_Explainers' already exists and is not an empty directory.


In [9]:
### saving databases into dataframes

explainer_df = pd.read_csv('Science_Explainers/science_explainers_database.csv')
fiction_df = pd.read_csv('short_fiction_database.csv')

In [10]:
### unzipping short fiction files

!unzip txt_files.zip

Archive:  txt_files.zip
   creating: txt_files/
  inflating: __MACOSX/._txt_files    
   creating: txt_files/ATLANTIC/
  inflating: __MACOSX/txt_files/._ATLANTIC  
   creating: txt_files/TIN/
  inflating: __MACOSX/txt_files/._TIN  
  inflating: txt_files/.DS_Store     
  inflating: __MACOSX/txt_files/._.DS_Store  
   creating: txt_files/LIT/
  inflating: __MACOSX/txt_files/._LIT  
   creating: txt_files/GRANTA/
  inflating: __MACOSX/txt_files/._GRANTA  
   creating: txt_files/NEW_YORKER/
  inflating: __MACOSX/txt_files/._NEW_YORKER  
   creating: txt_files/NARRATIVE/
  inflating: __MACOSX/txt_files/._NARRATIVE  
   creating: txt_files/TIN_HOUSE/
  inflating: __MACOSX/txt_files/._TIN_HOUSE  
   creating: txt_files/AGNI/
  inflating: __MACOSX/txt_files/._AGNI  
   creating: txt_files/NEW/
  inflating: __MACOSX/txt_files/._NEW  
   creating: txt_files/HARPERS/
  inflating: __MACOSX/txt_files/._HARPERS  
   creating: txt_files/.ipynb_checkpoints/
  inflating: __MACOSX/txt_files/._.ipynb_ch

## Helper functions

### Data processing

In [11]:
### turns all column names to upper case
def uppercase_columns(df):
  columns = df.columns
  new_columns = [column.upper() for column in columns]
  df.columns = new_columns

In [12]:
### counts the word count of the text and adds it as a column
def count_text_length(df):
  df['LENGTH'] = ''

  for index, row in df.iterrows():
    text = row['TEXT']
    text_length = len(text)
    row['LENGTH'] = text_length

In [13]:
def load_text_content(df, path):

  # adds new column to the dataframe
  df['TEXT'] = ''

  for index, row in df.iterrows():
    cur_filename = row['FILENAME']

    # renaming files with weird accent characters in their names
    if 'í' in cur_filename and os.path.isfile(path + cur_filename.replace('í', 'í')):
      os.rename(path + cur_filename.replace('í', 'í'), path + cur_filename)
    if 'é' in cur_filename and os.path.isfile(path + cur_filename.replace('é', 'é')):
      os.rename(path + cur_filename.replace('é', 'é'), path + cur_filename)

    cur_article = codecs.open(path + cur_filename, 'r', encoding = 'utf8').read()

    # saving the text in the dataframe
    df.at[index, 'TEXT'] = cur_article

  return df

In [14]:
### custom pre-processor to eliminte numbers and instances of "_", "\", and "—"
def my_preprocessor(text):
    text = text.lower()
    text = re.sub('([0-9—_\\\\])', '', text)
    return text

In [15]:
### makes all the column names UPPERCASE
def col_names_to_uppercase(df):
  new_columns = [name.upper() for name in df.columns]
  df.columns = new_columns

  return df

In [21]:
class StemWords(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass

  def fit(self, X, y=None):
    return self

  def transform(self, list_of_passages):
    # initializes the stemmer
    snowball_stemmer = SnowballStemmer('english')
    new_list_of_passages = []

    for passage in list_of_passages:
      # breaks the passage up into its component words
      words = nltk.word_tokenize(passage)
      new_words = [snowball_stemmer.stem(word) for word in words]

      new_passage = ' '.join(new_words)
      new_list_of_passages.append(new_passage)

    return new_list_of_passages

In [22]:
def refine_df_columns(list_of_titles, df):

  # the new df with only the columns to keep
  df_copy = df[list_of_titles]

  return df_copy

### Adding features to dataframes

In [16]:
def add_dtm(df, focus_col):

  # using CountVectorizer to make a DTM based on the words in the corpus
  vectorizer = CountVectorizer(input = 'content', preprocessor = my_preprocessor, stop_words = 'english', min_df = 5, encoding = 'utf8')
  dtm = vectorizer.fit_transform(df[focus_col])
  words = vectorizer.get_feature_names_out()
  matrix = dtm.toarray()

  # combining the DTM with the metadata
  DTM = pd.DataFrame(matrix, columns = words)

  # attaching the DTM to the original dataframe
  dtm_both = pd.concat([df, DTM], axis=1)

  return dtm_both

In [17]:
def add_tf_idf(df, focus_col, keep_punc = False):

  # using TfidfVectorizer to add the tf-idf values of each word to the dataframe
  if keep_punc:
    vectorizer = TfidfVectorizer(input = 'content', stop_words = 'english', min_df = 5, encoding = 'utf8')
  else:
    vectorizer = TfidfVectorizer(input = 'content', preprocessor = my_preprocessor, stop_words = 'english', min_df = 5, encoding = 'utf8')

  tf_idf = vectorizer.fit_transform(df[focus_col])
  words = vectorizer.get_feature_names_out()

  # converting sparse matrix to an array of arrays
  matrix = tf_idf.toarray()

  # combining the tf-idf matrix with the metadata (associated words)
  TF_IDF = pd.DataFrame(matrix, columns = words)

  # attaches the tf-idf to the original dataframe
  tf_idf_both = pd.concat([df, TF_IDF], axis = 1)

  return tf_idf_both

In [18]:
### assumes the POS tags are in a column called 'POS TAGS'
def count_pos_tags(df):
  # concatenates all lists of POS tags into one big lists
  all_tags = df['POS TAGS'].sum()

  # counts each POS tag occurrence
  tag_counts = Counter(all_tags)

  # sorts the POS tags
  sorted_tag_counts = sorted(tag_counts, reverse = True)

  return tag_counts, sorted_tag_counts

In [19]:
### assumes the text content is in a column called 'TEXT'
def add_pos_tags(df):

  new_df = df.copy()

  new_df['POS TAG TOKENS'] = ''
  new_df['POS TAGS'] = ''
  new_df['POS TAGS STRING'] = ''

  for index, row in new_df.iterrows():
    cur_text = row['TEXT']
    tokenized_text = word_tokenize(cur_text)
    POS_tags = pos_tag(tokenized_text)
    tags_only = [tag for word,tag in POS_tags]

    row['POS TAG TOKENS'] = POS_tags
    row['POS TAGS'] = tags_only
    row['POS TAGS STRING'] = ' '.join(tags_only)

  return new_df

### For models

In [23]:
def convert_df_to_data(df):
  return df.values.tolist()

In [24]:
def create_and_train_model(x_train, y_train):

  # logistic regression with l2 regularization
  model = LogisticRegression(penalty = 'l2')

  # fitting the model
  model = model.fit(x_train, y_train)

  return model

In [25]:
def test_model(x_test, y_test, model):
  # predicted labels
  y_pred = model.predict(x_test)

  report = classification_report(y_test, y_pred, target_names = ['e', 'f'], output_dict = True)
  accuracy = round(report.get('accuracy') * 100, 2)

  return report, accuracy

### Analysis

In [108]:
### assumes the matrix starts after the 5th column
def sort_top_words(matrix_df, df_type, normalize = True):
  # isolates for the matrix
  if df_type == 'explainer':
    matrix = matrix_df.iloc[:, 7:]
  elif df_type == 'fiction':
    matrix = matrix_df.iloc[:, 10:]
  else:
    return

  # sums the values, result is a Pandas series
  sum_values = matrix.sum()

  if normalize:
    # divides the sums by the number of words
    sum_values = sum_values/len(sum_values)

  sorted_dict = sum_values.sort_values(ascending = False).to_dict()

  return sorted_dict

In [80]:
# function from https://github.com/dhmit/gender_novels/blob/master/gender_novels/analysis/dunning.py
def dunn_individual_word(total_words_in_corpus_1, total_words_in_corpus_2,
                         count_of_word_in_corpus_1,
                         count_of_word_in_corpus_2):
    '''
    applies dunning log likelihood to compare individual word in two counter objects

    :param word: desired word to compare
    :param m_corpus: c.filter_by_gender('male')
    :param f_corpus: c. filter_by_gender('female')
    :return: log likelihoods and p value
    >>> total_words_m_corpus = 8648489
    >>> total_words_f_corpus = 8700765
    >>> wordcount_female = 1000
    >>> wordcount_male = 50
    >>> dunn_individual_word(total_words_m_corpus,total_words_f_corpus,wordcount_male,wordcount_female)
    -1047.8610274053995
    '''
    a = count_of_word_in_corpus_1
    b = count_of_word_in_corpus_2
    c = total_words_in_corpus_1
    d = total_words_in_corpus_2

    e1 = c * (a + b) / (c + d)
    e2 = d * (a + b) / (c + d)

    dunning_log_likelihood = 2 * (a * math.log(a / e1) + b * math.log(b / e2))

    if count_of_word_in_corpus_1 * math.log(count_of_word_in_corpus_1 / e1) < 0:
        dunning_log_likelihood = -dunning_log_likelihood

    p = 1 - chi2.cdf(abs(dunning_log_likelihood),1)

    return dunning_log_likelihood

In [81]:
# function from https://github.com/dhmit/gender_novels/blob/master/gender_novels/analysis/dunning.py
def dunning_total(counter1, counter2, filename_to_pickle=None):
    '''
    runs dunning_individual on words shared by both counter objects
    (-) end of spectrum is words for counter_2
    (+) end of spectrum is words for counter_1
    the larger the magnitude of the number, the more distinctive that word is in its
    respective counter object

    use filename_to_pickle to store the result so it only has to be calculated once and can be
    used for multiple analyses.

    >>> from collections import Counter
    >>> female_counter = Counter({'he': 1,  'she': 10, 'and': 10})
    >>> male_counter =   Counter({'he': 10, 'she': 1,  'and': 10})
    >>> results = dunning_total(female_counter, male_counter)

    # Results is a dict that maps from terms to results
    # Each result dict contains the dunning score...
    >>> results['he']['dunning']
    -8.547243830635558

    # ... counts for corpora 1 and 2 as well as total count
    >>> results['he']['count_total'], results['he']['count_corp1'], results['he']['count_corp2']
    (11, 1, 10)

    # ... and the same for frequencies
    >>> results['he']['freq_total'], results['he']['freq_corp1'], results['he']['freq_corp2']
    (0.2619047619047619, 0.047619047619047616, 0.47619047619047616)

    :return: dict

    '''

    total_words_counter1 = 0
    total_words_counter2 = 0

    #get word total in respective counters
    for word1 in counter1:
        total_words_counter1 += counter1[word1]
    for word2 in  counter2:
        total_words_counter2 += counter2[word2]

    #dictionary where results will be returned
    dunning_result = {}
    for word in counter1:
        counter1_wordcount = counter1[word]
        if word in counter2:
            counter2_wordcount = counter2[word]


            if counter1_wordcount + counter2_wordcount < 10:
                continue

            dunning_word = dunn_individual_word( total_words_counter1,  total_words_counter2,
                                                 counter1_wordcount,counter2_wordcount)

            dunning_result[word] = {
                'dunning': dunning_word,
                'count_total': counter1_wordcount + counter2_wordcount,
                'count_corp1': counter1_wordcount,
                'count_corp2': counter2_wordcount,
                'freq_total': (counter1_wordcount + counter2_wordcount) / (total_words_counter1 +
                                                                           total_words_counter2),
                'freq_corp1': counter1_wordcount / total_words_counter1,
                'freq_corp2': counter2_wordcount / total_words_counter2
            }

    return dunning_result

In [137]:
def print_top_20_words(counts_dict, corpus):

  print('The top 20 words used in ' + corpus + ":")

  counter = 0
  for key in explainer_counts:
    if counter == 20:
      break
    print(str(key) + ': ' + str(explainer_counts[key]))
    counter += 1

# Tests

In [26]:
### adding the text of each article as a column in the dataframe

# science explainers
explainer_df = load_text_content(explainer_df, 'Science_Explainers/txt_files/')
count_text_length(explainer_df)

# fiction
fiction_df = load_text_content(fiction_df, 'txt_files/')
count_text_length(fiction_df)

## Vocabulary test

In [27]:
### adding the tf-idf values to the dataframes

explainer_tfidf_df = add_tf_idf(explainer_df, 'TEXT')
fiction_tfidf_df = add_tf_idf(fiction_df, 'TEXT')

In [122]:
### finding the top words of each corpora based on TF-IDF, and taking the words at their intersection

explainer_top_words_dict = sort_top_words(explainer_tfidf_df, 'explainer')
fiction_top_words_dict = sort_top_words(fiction_tfidf_df, 'fiction')

# top words across BOTH corpora
both_top_words_dict = {}
for key in explainer_top_words_dict:
  if key in fiction_top_words_dict:
    both_top_words_dict[key] = explainer_top_words_dict[key] + fiction_top_words_dict[key]

# taking the top 1000 words from each corpus
explainer_top_words = list(explainer_top_words_dict.keys())[:1000]
fiction_top_words = list(fiction_top_words_dict.keys())[:1000]

# TO DO: RE-SORT
common_words = list(set(explainer_top_words).intersection(set(fiction_top_words)))

# dfs with only the common words as column titles
explainer_common_words_df = refine_df_columns(common_words, explainer_tfidf_df)
fiction_common_words_df = refine_df_columns(common_words, fiction_tfidf_df)

In [31]:
### logistic regression with l2 regularization trained on word frequency

explainer_data = convert_df_to_data(explainer_common_words_df)
fiction_data = convert_df_to_data(fiction_common_words_df)

total_data = explainer_data + fiction_data
total_labels = ['e'] * len(explainer_data) + ['f'] * len(fiction_data)

x_train, x_test, y_train, y_test = train_test_split(total_data, total_labels)

model = create_and_train_model(x_train, y_train)

report, accuracy = test_model(x_test, y_test, model)

In [32]:
report

{'e': {'precision': 0.871900826446281,
  'recall': 0.9377777777777778,
  'f1-score': 0.9036402569593148,
  'support': 225},
 'f': {'precision': 0.9326923076923077,
  'recall': 0.8622222222222222,
  'f1-score': 0.8960739030023095,
  'support': 225},
 'accuracy': 0.9,
 'macro avg': {'precision': 0.9022965670692944,
  'recall': 0.9,
  'f1-score': 0.8998570799808121,
  'support': 450},
 'weighted avg': {'precision': 0.9022965670692943,
  'recall': 0.9,
  'f1-score': 0.8998570799808121,
  'support': 450}}

In [33]:
print('The model was ' + str(accuracy) + '% accurate at distinguishing literary fiction from science explainers')

The model was 90.0% accurate at distinguishing literary fiction from science explainers


## Part-of-Speech (POS) test

In [34]:
### POS tags

# adds POS tags to dataframes
explainer_pos_df = add_pos_tags(explainer_df)
fiction_pos_df = add_pos_tags(fiction_df)

In [35]:
### counting each tag occurrence per article
explainer_pos_df = add_tf_idf(explainer_pos_df, 'POS TAGS STRING', keep_punc = True)
fiction_pos_df = add_tf_idf(fiction_pos_df, 'POS TAGS STRING', keep_punc = True)

# sets all column titles (POS tags) to uppercase
explainer_pos_df = col_names_to_uppercase(explainer_pos_df)
fiction_pos_df = col_names_to_uppercase(fiction_pos_df)

In [36]:
explainer_pos_df.columns

Index(['FILENAME', 'TITLE', 'SOURCE', 'DATE PUBLISHED', 'URL', 'TEXT',
       'LENGTH', 'POS TAG TOKENS', 'POS TAGS', 'POS TAGS STRING', 'CC', 'CD',
       'DT', 'EX', 'FW', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS',
       'PDT', 'POS', 'PRP', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'UH', 'VB', 'VBD',
       'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WRB'],
      dtype='object')

In [37]:
explainer_pos_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL,TEXT,LENGTH,POS TAG TOKENS,POS TAGS,POS TAGS STRING,...,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WRB
0,ATLANTIC/HowToSuccessfullySmashYourFace.txt,How to Successfully Smash Your Face Against a ...,ATLANTIC,2022-07-14,https://www.theatlantic.com/science/archive/20...,A new study refutes the widespread idea that w...,7105,"[(A, DT), (new, JJ), (study, NN), (refutes, VB...","[DT, JJ, NN, VBZ, DT, JJ, NN, IN, NNS, VBP, JJ...",DT JJ NN VBZ DT JJ NN IN NNS VBP JJ NNS . IN P...,...,0.000000,0.178939,0.115032,0.063907,0.092665,0.189154,0.118228,0.051524,0.028190,0.043716
1,ATLANTIC/WillCovidsSpringLullLast.txt,Will COVID’s Spring Lull Last?,ATLANTIC,2023-05-01,https://www.theatlantic.com/science/archive/20...,Things look calm right now. They may even stay...,8165,"[(Things, NNS), (look, VBP), (calm, JJ), (righ...","[NNS, VBP, JJ, RB, RB, ., PRP, MD, RB, VB, IN,...",NNS VBP JJ RB RB . PRP MD RB VB IN NN PRP VBD ...,...,0.000000,0.156607,0.089490,0.075507,0.081100,0.134683,0.125845,0.028184,0.009252,0.017659
2,ATLANTIC/TeenBrainsArePerfectlyCapable.txt,Teen Brains Are Perfectly Capable,ATLANTIC,2023-04-30,https://www.theatlantic.com/science/archive/20...,Teenagers have plenty of cognitive control. Th...,17450,"[(Teenagers, NNS), (have, VBP), (plenty, NN), ...","[NNS, VBP, NN, IN, JJ, NN, ., PRP, RB, VBP, JJ...",NNS VBP NN IN JJ NN . PRP RB VBP JJ NN RB VBP ...,...,0.016278,0.184493,0.057256,0.108151,0.053439,0.228513,0.153956,0.043598,0.040691,0.056240
3,ATLANTIC/TheFishHadGillsFullOf.txt,The Fish Had Gills Full of Ash and Gas Bubblin...,ATLANTIC,2023-04-29,https://www.theatlantic.com/science/archive/20...,How volcanoes kill fish is something out of a ...,5091,"[(How, WRB), (volcanoes, NNS), (kill, VB), (fi...","[WRB, NNS, VB, JJ, VBZ, NN, IN, IN, DT, NN, NN...",WRB NNS VB JJ VBZ NN IN IN DT NN NN DT NN VBD ...,...,0.000000,0.103028,0.197096,0.049274,0.103028,0.112360,0.098548,0.004514,0.019759,0.037714
4,ATLANTIC/LushPrairiesCouldReallyBeGreen.txt,Lush Prairies Could Really Be ‘Green Deserts’,ATLANTIC,2023-04-23,https://www.theatlantic.com/science/archive/20...,Climate change is stripping plants of their nu...,12049,"[(Climate, NNP), (change, NN), (is, VBZ), (str...","[NNP, NN, VBZ, VBG, NNS, IN, PRP$, NNS, ., DT,...",NNP NN VBZ VBG NNS IN PRP$ NNS . DT MD VB DT N...,...,0.006365,0.087562,0.095522,0.069651,0.089552,0.139767,0.125372,0.028078,0.024140,0.020943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,REUTERS/RightAgainEinsteinStudyShowsHow.txt,"Right again, Einstein! Study shows how antimat...",REUTERS,2023-09-27,https://www.reuters.com/science/right-again-ei...,An artist's conceptual rendering of antihydrog...,6261,"[(An, DT), (artist, NN), ('s, POS), (conceptua...","[DT, NN, POS, JJ, NN, IN, NN, NNS, VBG, RP, DT...",DT NN POS JJ NN IN NN NNS VBG RP DT NN IN DT J...,...,0.000000,0.113181,0.113181,0.054639,0.117084,0.058737,0.081959,0.035399,0.012912,0.004107
896,REUTERS/NasaAsteroidSampleParachutesSafelyOnto...,NASA asteroid sample parachutes safely onto Ut...,REUTERS,2023-09-24,https://www.reuters.com/science/nasas-first-as...,Sept 24 (Reuters) - A NASA space capsule carry...,6277,"[(Sept, $), (24, CD), ((, (), (Reuters, NNPS),...","[$, CD, (, NNPS, ), :, DT, NNP, NN, NN, VBG, D...",$ CD ( NNPS ) : DT NNP NN NN VBG DT JJS NN NN ...,...,0.000000,0.059013,0.125402,0.084831,0.106961,0.011102,0.047948,0.014868,0.004067,0.003882
897,REUTERS/MexicoResearchersShowProgressOnDrive.txt,Mexico researchers show progress on drive to r...,REUTERS,2023-09-22,https://www.reuters.com/markets/commodities/me...,[1/6]Corn cobs are pictured in a corn field at...,4909,"[([, RB), (1/6, CD), (], JJ), (Corn, NNP), (co...","[RB, CD, JJ, NNP, NN, VBP, VBN, IN, DT, NN, NN...",RB CD JJ NNP NN VBP VBN IN DT NN NN IN NNP NNP...,...,0.000000,0.116111,0.088245,0.078956,0.069667,0.032620,0.116111,0.009361,0.005122,0.004888
898,REUTERS/ZambiaFindShowsHumansHaveBuilt.txt,Zambia find shows humans have built with wood ...,REUTERS,2023-09-20,https://www.reuters.com/science/zambia-find-sh...,[1/4]Researchers uncover wooden artefacts on t...,5854,"[([, RB), (1/4, CD), (], JJ), (Researchers, NN...","[RB, CD, JJ, NNPS, RB, JJ, NNS, IN, DT, NNS, I...",RB CD JJ NNPS RB JJ NNS IN DT NNS IN DT NNP NN...,...,0.000000,0.068576,0.153287,0.096813,0.141186,0.036426,0.080677,0.024392,0.000000,0.008491


In [38]:
fiction_pos_df

Unnamed: 0,FILENAME,TITLE,SOURCE,DATE PUBLISHED,URL,TEXT,LENGTH,POS TAG TOKENS,POS TAGS,POS TAGS STRING,...,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WRB
0,ATLANTIC/SheWhoRemembers.txt,She Who Remembers,ATLANTIC,2023-10-16,https://www.theatlantic.com/magazine/archive/2...,A short story\nThe Georgia men wake everyone i...,19220,"[(A, DT), (short, JJ), (story, NN), (The, DT),...","[DT, JJ, NN, DT, NNP, NNS, VBP, NN, IN, DT, JJ...",DT JJ NN DT NNP NNS VBP NN IN DT JJ NN . DT NN...,...,0.004069,0.106524,0.066344,0.058321,0.049634,0.142823,0.171970,0.007094,0.015647,0.024951
1,ATLANTIC/TheComebacker.txt,The Comebacker,ATLANTIC,2023-08-12,https://www.theatlantic.com/magazine/archive/2...,"A short story\nThe day was cold, cold even for...",35629,"[(A, DT), (short, JJ), (story, NN), (The, DT),...","[DT, JJ, NN, DT, NN, VBD, JJ, ,, JJ, RB, IN, N...","DT JJ NN DT NN VBD JJ , JJ RB IN NNP IN NNP NN...",...,0.002246,0.122258,0.386375,0.059200,0.057387,0.072103,0.051403,0.011748,0.012957,0.028608
2,ATLANTIC/ThePosting.txt,The Posting,ATLANTIC,2023-06-27,https://www.theatlantic.com/books/archive/2023...,A short story\nEverything overheard in those d...,53978,"[(A, DT), (short, JJ), (story, NN), (Everythin...","[DT, JJ, NN, NNP, NN, IN, DT, NNS, ., JJ, IN, ...","DT JJ NN NNP NN IN DT NNS . JJ IN DT NNS , PRP...",...,0.002060,0.097209,0.310787,0.088930,0.085336,0.058151,0.015512,0.009619,0.018194,0.033521
3,ATLANTIC/LatenightradioTalkshowHostTellsAll.txt,Late-Night-Radio Talk-Show Host Tells All,ATLANTIC,2023-05-29,https://www.theatlantic.com/books/archive/2023...,A short story\nDo I have rivals? Competitors? ...,12021,"[(A, DT), (short, JJ), (story, NN), (Do, NNP),...","[DT, JJ, NN, NNP, PRP, VBP, NNS, ., NNS, ., RB...",DT JJ NN NNP PRP VBP NNS . NNS . RB : DT NNS N...,...,0.000000,0.118601,0.220259,0.040315,0.104972,0.127006,0.056381,0.015034,0.025792,0.018982
4,ATLANTIC/TheRenovation.txt,The Renovation,ATLANTIC,2023-04-27,https://www.theatlantic.com/books/archive/2023...,A short story\nI didn’t know by what accident ...,39933,"[(A, DT), (short, JJ), (story, NN), (I, PRP), ...","[DT, JJ, NN, PRP, VBP, JJ, NN, VBN, IN, WP, VB...",DT JJ NN PRP VBP JJ NN VBN IN WP VBD DT NNS VB...,...,0.006267,0.144389,0.275826,0.080645,0.088461,0.095509,0.060481,0.015607,0.021086,0.026603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,THE_SUN/WhenTheyCameToUs.txt,WhenTheyCameToUs,THE_SUN,2016-08-01,https://www.thesunmagazine.org/issues/488/when...,They Arrive On A Warm Summer Night With No Bre...,33019,"[(They, PRP), (Arrive, VBP), (On, IN), (A, NNP...","[PRP, VBP, IN, NNP, NNP, NNP, NNP, IN, NNP, NN...",PRP VBP IN NNP NNP NNP NNP IN NNP NNP PRP VBD ...,...,0.002652,0.130960,0.356334,0.076035,0.061047,0.108414,0.039638,0.011230,0.032512,0.035032
896,THE_SUN/TheUnifiedConspiracyTheory.txt,TheUnifiedConspiracyTheory,THE_SUN,2016-07-01,https://www.thesunmagazine.org/issues/487/the-...,"When Jack walked into the Nite Owl Diner, I al...",30691,"[(When, WRB), (Jack, NNP), (walked, VBD), (int...","[WRB, NNP, VBD, IN, DT, NNP, NNP, NNP, ,, PRP,...","WRB NNP VBD IN DT NNP NNP NNP , PRP RB VBP JJ ...",...,0.000851,0.187005,0.370492,0.090291,0.076966,0.091370,0.038148,0.015258,0.020860,0.031909
897,THE_SUN/TheyWere.txt,TheyWere,THE_SUN,2016-07-01,https://www.thesunmagazine.org/issues/487/they...,He was. She was. They met and together they: d...,3048,"[(He, PRP), (was, VBD), (., .), (She, PRP), (w...","[PRP, VBD, ., PRP, VBD, ., PRP, VBD, CC, RB, P...",PRP VBD . PRP VBD . PRP VBD CC RB PRP : NN NN ...,...,0.000000,0.026087,0.515227,0.026262,0.137264,0.019674,0.019894,0.000000,0.013651,0.020094
898,THE_SUN/DaysOfHumanSacrifice.txt,DaysOfHumanSacrifice,THE_SUN,2016-06-01,https://www.thesunmagazine.org/issues/486/days...,"When I was in fifth grade, my parents attached...",23536,"[(When, WRB), (I, PRP), (was, VBD), (in, IN), ...","[WRB, PRP, VBD, IN, JJ, NN, ,, PRP$, NNS, VBD,...","WRB PRP VBD IN JJ NN , PRP$ NNS VBD DT NN NN I...",...,0.005954,0.137823,0.368349,0.061939,0.068243,0.067645,0.027527,0.012456,0.026616,0.026118


In [39]:
### tag counts
explainer_tag_counts, explainer_sorted_tag_counts = count_pos_tags(explainer_pos_df)
fiction_tag_counts, fiction_sorted_tag_counts = count_pos_tags(fiction_pos_df)

In [40]:
### finding the common POS tags and refining the dataframes

# common tags
explainer_pos_tags = explainer_pos_df.columns[10:]
fiction_pos_tags = fiction_pos_df.columns[10:]
common_tags = list(set(explainer_pos_tags).intersection(set(fiction_pos_tags)))

# dfs with only common words as column titles
explainer_common_tags_df = refine_df_columns(common_tags, explainer_pos_df)
fiction_common_tags_df = refine_df_columns(common_tags, fiction_pos_df)

In [41]:
### logistic regression with l2 regularization trained tag counts

explainer_data = convert_df_to_data(explainer_common_tags_df)
fiction_data = convert_df_to_data(fiction_common_tags_df)

total_data = explainer_data + fiction_data
total_labels = ['e'] * len(explainer_data) + ['f'] * len(fiction_data)

x_train, x_test, y_train, y_test = train_test_split(total_data, total_labels)

model = create_and_train_model(x_train, y_train)

report, accuracy = test_model(x_test, y_test, model)

In [42]:
report

{'e': {'precision': 0.9404255319148936,
  'recall': 0.9866071428571429,
  'f1-score': 0.962962962962963,
  'support': 224},
 'f': {'precision': 0.986046511627907,
  'recall': 0.9380530973451328,
  'f1-score': 0.961451247165533,
  'support': 226},
 'accuracy': 0.9622222222222222,
 'macro avg': {'precision': 0.9632360217714003,
  'recall': 0.9623301201011378,
  'f1-score': 0.962207105064248,
  'support': 450},
 'weighted avg': {'precision': 0.9633374017263181,
  'recall': 0.9622222222222222,
  'f1-score': 0.9622037456958092,
  'support': 450}}

In [43]:
print('The model was ' + str(accuracy) + '% accurate at distinguishing literary fiction from science explainers')

The model was 96.22% accurate at distinguishing literary fiction from science explainers


# More tests

In [128]:
explainer_dtm = add_dtm(explainer_df, 'TEXT')
fiction_dtm = add_dtm(fiction_df, 'TEXT')

In [129]:
explainer_counts = sort_top_words(explainer_dtm, 'explainer', normalize = False)
fiction_counts = sort_top_words(fiction_dtm, 'explainer', normalize = False)

In [140]:
dunning_total(explainer_counts, fiction_counts)

{'said': {'dunning': -960.408593735282,
  'count_total': 18660,
  'count_corp1': 4238,
  'count_corp2': 14422,
  'freq_total': 0.010899379915234833,
  'freq_corp1': 0.007492292026136395,
  'freq_corp2': 0.01258051459556027},
 'says': {'dunning': 551.3478408178496,
  'count_total': 5935,
  'count_corp1': 2836,
  'count_corp2': 3099,
  'freq_total': 0.0034666570094811756,
  'freq_corp1': 0.005013718779170085,
  'freq_corp2': 0.0027033015345750435},
 'study': {'dunning': 5199.322769976812,
  'count_total': 2973,
  'count_corp1': 2825,
  'count_corp2': 148,
  'freq_total': 0.0017365410765269645,
  'freq_corp1': 0.004994272056119707,
  'freq_corp2': 0.00012910249342275136},
 'people': {'dunning': 400.85432363796167,
  'count_total': 6048,
  'count_corp1': 2749,
  'count_corp2': 3299,
  'freq_total': 0.00353266075709219,
  'freq_corp1': 0.004859912878680734,
  'freq_corp2': 0.0028777643635247074},
 'university': {'dunning': 4579.479462658199,
  'count_total': 2817,
  'count_corp1': 2627,
  '

In [138]:
print_top_20_words(explainer_counts, 'science explainers')

The top 20 words used in science explainers:
said: 4238
says: 2836
study: 2825
people: 2749
university: 2627
like: 2604
new: 2512
researchers: 2213
scientists: 2129
years: 1951
research: 1915
species: 1810
time: 1748
climate: 1737
just: 1682
science: 1521
health: 1516
water: 1466
world: 1325
year: 1258


In [139]:
print_top_20_words(fiction_counts, 'short stories')

The top 20 words used in short stories:
said: 4238
says: 2836
study: 2825
people: 2749
university: 2627
like: 2604
new: 2512
researchers: 2213
scientists: 2129
years: 1951
research: 1915
species: 1810
time: 1748
climate: 1737
just: 1682
science: 1521
health: 1516
water: 1466
world: 1325
year: 1258
