<a href="https://colab.research.google.com/github/faithrts/Science_Explainers/blob/main/analysis/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [81]:
### importing libraries

# basic libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# to download files
from google.colab import files

import os
import re
import ast
import math
import codecs

# for word2vec
from gensim.models import Word2Vec

# scipy
from scipy.stats import chi2  ## for stats
from scipy import spatial     ## for cosine similarity

# sklearn libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.decomposition import LatentDirichletAllocation

# NLTK
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import SnowballStemmer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Importing databases

In [2]:
### cloning git repos

!git clone https://github.com/faithrts/Science_Explainers
#!git clone https://github.com/dhmit/gender_novels
#!git clone https://github.com/faithrts/Short_Fiction

Cloning into 'Science_Explainers'...
remote: Enumerating objects: 1155, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 1155 (delta 13), reused 22 (delta 8), pack-reused 1124[K
Receiving objects: 100% (1155/1155), 95.91 MiB | 4.92 MiB/s, done.
Resolving deltas: 100% (179/179), done.
Updating files: 100% (917/917), done.


In [3]:
### importing dataframes:
# [explainer/fiction]_dtm_df
# [explainer/fiction]_tfidf_df
# [explainer/fiction]_pos_df

explainer_dtm_df = pd.read_csv('Science_Explainers/analysis/explainer_dtm.csv')
explainer_tfidf_df = pd.read_csv('Science_Explainers/analysis/explainer_tfidf.csv')
explainer_pos_df = pd.read_csv('Science_Explainers/analysis/explainer_pos.csv')

fiction_dtm_df = pd.read_csv('Science_Explainers/analysis/fiction_dtm.csv')
fiction_tfidf_df = pd.read_csv('Science_Explainers/analysis/fiction_tfidf.csv')
fiction_pos_df = pd.read_csv('Science_Explainers/analysis/fiction_pos.csv')

## Helper functions

### Data processing

In [4]:
### makes all the column names UPPERCASE
def col_names_to_uppercase(df):
  new_columns = [name.upper() for name in df.columns]
  df.columns = new_columns

  return df

In [5]:
class StemWords(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass

  def fit(self, X, y=None):
    return self

  def transform(self, list_of_passages):
    # initializes the stemmer
    snowball_stemmer = SnowballStemmer('english')
    new_list_of_passages = []

    for passage in list_of_passages:
      # breaks the passage up into its component words
      words = nltk.word_tokenize(passage)
      new_words = [snowball_stemmer.stem(word) for word in words]

      new_passage = ' '.join(new_words)
      new_list_of_passages.append(new_passage)

    return new_list_of_passages

In [6]:
def refine_df_columns(list_of_titles, df):

  # the new df with only the columns to keep
  df_copy = df[list_of_titles]

  return df_copy

### Modelling

In [7]:
### custom pre-processor to eliminte numbers and instances of "_", "\", and "—"
def my_preprocessor(text):
    text = text.lower()
    text = re.sub('([0-9—_\\\\])', '', text)
    return text

In [8]:
def convert_df_to_data(df):
  return df.values.tolist()

In [82]:
def create_and_train_log_reg_l2(x_train, y_train, max_iterations = 5000):

  # logistic regression with l2 regularization
  model = LogisticRegression(penalty = 'l2', max_iter = max_iterations)

  # fitting the model
  model = model.fit(x_train, y_train)

  return model

In [83]:
def create_and_train_lda(x_train, topics = 10, max_iterations = 5000):

  # logistic regression with l2 regularization
  model = LatentDirichletAllocation(n_components = topics, random_state = 11)

  # fitting the model
  model = model.fit_transform(x_train)

  return model

In [101]:
def test_predictive_model(x_test, y_test, model):
  # predicted labels
  y_pred = model.predict(x_test)

  # gets performance in terms of precision, recall, accuracy, etc.
  report = classification_report(y_test, y_pred, target_names = ['e', 'f'], output_dict = True)
  accuracy = round(report.get('accuracy') * 100, 2)

  comparison = y_pred == y_test
  incorrect_indices = [index for index, value in enumerate(comparison) if value == False]

  return report, incorrect_indices

In [96]:
def train_test_split_helper(explainer_feature_df, fiction_feature_df, ran = 11):

  # adds the file identity to the end of each datapoint for later identification of miscategorized files
  if 'FILENAME' not in explainer_feature_df.columns:
    explainer_feature_df['FILENAME'] = explainer_dtm_df['FILENAME']
  if 'FILENAME' not in fiction_feature_df.columns:
    fiction_feature_df['FILENAME'] = fiction_dtm_df['FILENAME']

  # converts the corpora's word counts for each document into lists of feature values
  explainer_data = convert_df_to_data(explainer_feature_df)
  fiction_data = convert_df_to_data(fiction_feature_df)

  # combines the explainer and fiction data
  total_data = explainer_data + fiction_data
  # creates target labels for the data
  total_labels = ['e'] * len(explainer_data) + ['f'] * len(fiction_data)

  # splits the data into training and testing (random state to keep results consistent across re-runs)
  x_train, x_test, y_train, y_test = train_test_split(total_data, total_labels, random_state = ran)

  # retrieves the filenames for later identification (removes from data before input to model)
  filenames_train = [datapoint.pop() for datapoint in x_train]
  filenames_test = [datapoint.pop() for datapoint in x_test]

  return x_train, x_test, y_train, y_test, filenames_train, filenames_test

### Analysis

In [11]:
### assumes the matrix starts after the 5th column
def sort_top_words(matrix_df, normalize = True):

  # sums the values, result is a Pandas series
  sum_values = matrix_df.sum()

  if normalize:
    # divides the sums by the number of words
    sum_values = sum_values/len(sum_values)

  sorted_dict = sum_values.sort_values(ascending = False).to_dict()

  return sorted_dict

In [12]:
# function from https://github.com/dhmit/gender_novels/blob/master/gender_novels/analysis/dunning.py
def dunn_individual_word(total_words_in_corpus_1, total_words_in_corpus_2,
                         count_of_word_in_corpus_1,
                         count_of_word_in_corpus_2):
    '''
    applies dunning log likelihood to compare individual word in two counter objects

    :param word: desired word to compare
    :param m_corpus: c.filter_by_gender('male')
    :param f_corpus: c. filter_by_gender('female')
    :return: log likelihoods and p value
    >>> total_words_m_corpus = 8648489
    >>> total_words_f_corpus = 8700765
    >>> wordcount_female = 1000
    >>> wordcount_male = 50
    >>> dunn_individual_word(total_words_m_corpus,total_words_f_corpus,wordcount_male,wordcount_female)
    -1047.8610274053995
    '''
    a = count_of_word_in_corpus_1
    b = count_of_word_in_corpus_2
    c = total_words_in_corpus_1
    d = total_words_in_corpus_2

    e1 = c * (a + b) / (c + d)
    e2 = d * (a + b) / (c + d)

    dunning_log_likelihood = 2 * (a * math.log(a / e1) + b * math.log(b / e2))

    if count_of_word_in_corpus_1 * math.log(count_of_word_in_corpus_1 / e1) < 0:
        dunning_log_likelihood = -dunning_log_likelihood

    p = 1 - chi2.cdf(abs(dunning_log_likelihood),1)

    return dunning_log_likelihood

In [13]:
# function from https://github.com/dhmit/gender_novels/blob/master/gender_novels/analysis/dunning.py
def dunning_total(counter1, counter2, filename_to_pickle=None):
    '''
    runs dunning_individual on words shared by both counter objects
    (-) end of spectrum is words for counter_2
    (+) end of spectrum is words for counter_1
    the larger the magnitude of the number, the more distinctive that word is in its
    respective counter object

    use filename_to_pickle to store the result so it only has to be calculated once and can be
    used for multiple analyses.

    >>> from collections import Counter
    >>> female_counter = Counter({'he': 1,  'she': 10, 'and': 10})
    >>> male_counter =   Counter({'he': 10, 'she': 1,  'and': 10})
    >>> results = dunning_total(female_counter, male_counter)

    # Results is a dict that maps from terms to results
    # Each result dict contains the dunning score...
    >>> results['he']['dunning']
    -8.547243830635558

    # ... counts for corpora 1 and 2 as well as total count
    >>> results['he']['count_total'], results['he']['count_corp1'], results['he']['count_corp2']
    (11, 1, 10)

    # ... and the same for frequencies
    >>> results['he']['freq_total'], results['he']['freq_corp1'], results['he']['freq_corp2']
    (0.2619047619047619, 0.047619047619047616, 0.47619047619047616)

    :return: dict

    '''

    total_words_counter1 = 0
    total_words_counter2 = 0

    #get word total in respective counters
    for word1 in counter1:
        total_words_counter1 += counter1[word1]
    for word2 in  counter2:
        total_words_counter2 += counter2[word2]

    #dictionary where results will be returned
    dunning_result = {}
    for word in counter1:
        counter1_wordcount = counter1[word]
        if word in counter2:
            counter2_wordcount = counter2[word]


            if counter1_wordcount + counter2_wordcount < 10:
                continue

            dunning_word = dunn_individual_word( total_words_counter1,  total_words_counter2,
                                                 counter1_wordcount,counter2_wordcount)

            dunning_result[word] = {
                'dunning': dunning_word,
                'count_total': counter1_wordcount + counter2_wordcount,
                'count_corp1': counter1_wordcount,
                'count_corp2': counter2_wordcount,
                'freq_total': (counter1_wordcount + counter2_wordcount) / (total_words_counter1 +
                                                                           total_words_counter2),
                'freq_corp1': counter1_wordcount / total_words_counter1,
                'freq_corp2': counter2_wordcount / total_words_counter2
            }

    return dunning_result

In [14]:
def run_dunning_log_likelihood_test(count_dict_1, count_dict_2):

  # runs the comparison
  dunning_comparison = dunning_total(count_dict_1, count_dict_2)

  # extracting the dunning values for each word
  dunning_values = {}
  for term in dunning_comparison:
    dunning_values[term] = dunning_comparison[term]['dunning']

  return dunning_comparison, dunning_values

In [15]:
def create_top_dunning_dict(dunning_values, length = ''):
  # the sorted keys of the dunning_values dictionary
  sorted_keys_desc = sorted(dunning_values, key = dunning_values.get, reverse = True)
  sorted_keys_asc = sorted(dunning_values, key = dunning_values.get, reverse = False)

  top_dunning_dict_desc = {}
  top_dunning_dict_asc = {}

  # if no dictionary length specified or specified length too long, do all
  if length == '' or length > len(sorted_keys_desc):
    length = len(sorted_keys_desc)

  for i in range(length):
    cur_word_desc = sorted_keys_desc[i]
    cur_word_asc = sorted_keys_asc[i]

    top_dunning_dict_desc[cur_word_desc] = dunning_values[cur_word_desc]
    top_dunning_dict_asc[cur_word_asc] = dunning_values[cur_word_asc]

  return top_dunning_dict_desc, top_dunning_dict_asc

In [88]:
def compute_cosine_similarity(vector_1, vector_2):
  return 1 - spatial.distance.cosine(vector_1, vector_2)

In [95]:
def find_closest_vectors(list_of_vectors):

  # a list to contain each vector's similarity to every other
  vector_sim_list = [[1]*len(list_of_vectors) for i in range(len(list_of_vectors))]

  # the indices and similarity value of the most similar vectors
  closest_vector_indices = []
  closest_sim = float('inf')

  # iterates through the vectors
  for i in range(len(list_of_vectors)):
    for j in range(len(list_of_vectors)):
      if i > j:
        continue

      vector_1 = list_of_vectors[i]
      vector_2 = list_of_vectors[j]

      cos_sim = compute_cosine_similarity(vector_1, vector_2)

      if cos_sim < closest_sim:
        closest_sim = cos_sim
        closest_vector_indices = [i, j]

      vector_sim_list[i][j] = cos_sim
      vector_sim_list[j][i] = cos_sim

  return vector_sim_list, closest_vector_indices

# Tests

## Vocabulary test

In [16]:
### finding the top words of the two corpora based on TF-IDF, and taking the words at their intersection

# getting a dictionary of the top words in each corpus
explainer_top_words_dict = sort_top_words(explainer_dtm_df.iloc[:, 6:])
fiction_top_words_dict = sort_top_words(fiction_dtm_df.iloc[:, 6:])

# the top 1000 words from each corpus
explainer_top_words = list(explainer_top_words_dict.keys())[:1000]
fiction_top_words = list(fiction_top_words_dict.keys())[:1000]

# TO DO: RE-SORT?
common_words = list(set(explainer_top_words).intersection(set(fiction_top_words)))

# dfs with only the common words as column titles
explainer_common_words_df = refine_df_columns(common_words, explainer_dtm_df)
fiction_common_words_df = refine_df_columns(common_words, fiction_dtm_df)

In [102]:
### logistic regression with l2 regularization trained on word frequency

# gets training data, testing data, and filenames (of each datapoint in order)
x_train, x_test, y_train, y_test, filenames_train, filenames_test = train_test_split_helper(explainer_common_words_df, fiction_common_words_df, ran = 11)

# creates a model
model = create_and_train_log_reg_l2(x_train, y_train)

# performance report
common_words_report, common_words_incorrect_indices = test_predictive_model(x_test, y_test, model)
accuracy = round(common_words_report.get('accuracy') * 100, 2)

print('The model was ' + str(accuracy) + '% accurate at distinguishing literary fiction from science explainers')

The model was 96.22% accurate at distinguishing literary fiction from science explainers


In [50]:
### finding miscategorized texts
common_words_miscategorized_files = [filenames_test[i] for i in common_words_incorrect_indices]
common_words_miscategorized_files

['GLOBE_AND_MAIL/ScreenTime101HowMuchScreen.txt',
 'MASSIVE_SCI/AButterflysWingsAreThePerfect.txt',
 'CNN/MessyEatingHabitsMightRevealElusive.txt',
 'CBC/TheDarkSkyBluesLightPollution.txt',
 'LIT_HUB/TheLastIsland.txt',
 'CNN/TheMaleLonelinessEpidemicAndHow.txt',
 'NEW_YORKER/TheSecretSource.txt',
 'NATIONAL_OBSERVER/GiantJurassiceraInsectRediscoveredHangingOutside.txt',
 'NATIONAL_OBSERVER/TheFinalDaysOfTheBahama.txt',
 'MASSIVE_SCI/DarkMatterMakesUpAQuarter.txt',
 'ELECTRIC/PoetryFlashFictionAndGraphicNarrative.txt',
 'GRANTA/TheUnfolding.txt',
 'NPR/TakeAPeekAtWhatNasa.txt',
 'CBC/HowIslandStudiesResearchBecameA.txt',
 'NATIONAL_OBSERVER/AnishinaabeArtistReinventsTheLoungeChair.txt',
 'ATLANTIC/ForTheLichens.txt',
 'MASSIVE_SCI/NeuronsDieWithGrace.txt']

In [107]:
### finding the two closest miscategorized texts

# the feature vectors of the miscategorized texts
miscategorized_datapoints = [x_test[i] for i in common_words_incorrect_indices]

# computing cosine similarity
vector_sim_list, closest_vector_indices = find_closest_vectors(miscategorized_datapoints)

common_words_closest_1 = common_words_miscategorized_files[closest_vector_indices[0]]
common_words_closest_2 = common_words_miscategorized_files[closest_vector_indices[1]]

print(f'The two closests miscategorized texts are:\n\t{common_words_closest_1}\n\t{common_words_closest_2}')

The two closests miscategorized texts are:
	CNN/MessyEatingHabitsMightRevealElusive.txt
	MASSIVE_SCI/NeuronsDieWithGrace.txt


## Part-of-Speech (POS) test

In [51]:
### finding the common POS tags and refining the dataframes

# finding the POS tags that both corpora have in common (should be most)
explainer_pos_tags = explainer_pos_df.columns[10:]
fiction_pos_tags = fiction_pos_df.columns[10:]
common_tags = list(set(explainer_pos_tags).intersection(set(fiction_pos_tags)))

# dfs with only common words as column titles
explainer_common_tags_df = refine_df_columns(common_tags, explainer_pos_df)
fiction_common_tags_df = refine_df_columns(common_tags, fiction_pos_df)

In [115]:
### logistic regression with l2 regularization on tag counts

# gets training data, testing data, and filenames (of each datapoint in order)
x_train, x_test, y_train, y_test, filenames_train, filenames_test = train_test_split_helper(explainer_common_tags_df, fiction_common_tags_df, ran = 11)

# creates a model
model = create_and_train_log_reg_l2(x_train, y_train, max_iterations = 5000)

# performance report
common_tags_report, common_tags_incorrect_indices = test_predictive_model(x_test, y_test, model)
accuracy = round(common_tags_report.get('accuracy') * 100, 2)

print('The model was ' + str(accuracy) + '% accurate at distinguishing literary fiction from science explainers')

The model was 97.33% accurate at distinguishing literary fiction from science explainers


In [116]:
### finding miscategorized texts
common_tags_miscategorized_files = [filenames_test[i] for i in common_tags_incorrect_indices]
common_tags_miscategorized_files

['GLOBE_AND_MAIL/HowAScientistGotAToronto.txt',
 'AGNI/BerlinStory.txt',
 'LIT_HUB/TheMarchers.txt',
 'NPR/HowManyFriendsDoAmericansHave.txt',
 'NATIONAL_OBSERVER/GiantJurassiceraInsectRediscoveredHangingOutside.txt',
 'NATIONAL_GEOGRAPHIC/YourCatCanRecognizeYourVoice.txt',
 'TIN_HOUSE/AsCloseAsWeDare.txt',
 'NATIONAL_OBSERVER/IfYouSmellLikeAFlower.txt',
 'CBC/HowIslandStudiesResearchBecameA.txt',
 'NATIONAL_OBSERVER/InAFamedGameParkNear.txt',
 'NATIONAL_OBSERVER/AnishinaabeArtistReinventsTheLoungeChair.txt',
 'ATLANTIC/ForTheLichens.txt']

In [117]:
### finding the two closest miscategorized texts

# the feature vectors of the miscategorized texts
miscategorized_datapoints = [x_test[i] for i in common_tags_incorrect_indices]

# computing cosine similarity
vector_sim_list, closest_vector_indices = find_closest_vectors(miscategorized_datapoints)

common_tags_closest_1 = common_tags_miscategorized_files[closest_vector_indices[0]]
common_tags_closest_2 = common_tags_miscategorized_files[closest_vector_indices[1]]

print(f'The two closests miscategorized texts are:\n\t{common_tags_closest_1}\n\t{common_tags_closest_2}')

The two closests miscategorized texts are:
	LIT_HUB/TheMarchers.txt
	TIN_HOUSE/AsCloseAsWeDare.txt


# More tests

## Dunning log likelihood on word counts



In [65]:
### comparing the statistical frequency of words in each corpus

explainer_pure_counts_dict = sort_top_words(explainer_dtm_df.iloc[:, 6:], normalize = False)
fiction_pure_counts_dict = sort_top_words(fiction_dtm_df.iloc[:, 6:], normalize = False)

dunning_comparison_words, dunning_values_words, = run_dunning_log_likelihood_test(explainer_pure_counts_dict, fiction_pure_counts_dict)
top_dunning_dict_words_desc, top_dunning_dict_words_asc = create_top_dunning_dict(dunning_values_words, length = 30)

In [66]:
print('Top 30 words stastistically associated with science explainers:\n')
top_dunning_dict_words_desc

Top 30 words stastistically associated with science explainers:



{'study': 5104.343204609907,
 'researchers': 4714.655534641639,
 'university': 4506.952314390828,
 'scientists': 4340.23863716685,
 'climate': 3560.3324034697393,
 'species': 3518.9428318879777,
 'research': 3517.719043635208,
 'science': 2671.5113001982268,
 'health': 2504.5615093243623,
 'canada': 2150.2713843493316,
 'data': 2112.520313626611,
 'cells': 1479.308425307749,
 'national': 1448.183178707155,
 'disease': 1312.2113784926116,
 'including': 1303.176969471153,
 'published': 1301.2496937442756,
 'studies': 1299.1320890845639,
 'humans': 1280.3984441644582,
 'risk': 1269.2652873705413,
 'human': 1223.3963828111303,
 'industry': 1209.5360687461855,
 'based': 1145.9677530204356,
 'carbon': 1135.8097013755698,
 'covid': 1129.6442305695898,
 'change': 1126.5420864028968,
 'animals': 1095.4982146606594,
 'bacteria': 1079.0795576187668,
 'team': 1019.4589134300767,
 'brain': 1004.6561554202101,
 'effects': 992.1736876925908}

In [67]:
print('Top 30 words stastistically associated with short stories:\n')
top_dunning_dict_words_asc

Top 30 words stastistically associated with short stories:



{'mother': -3023.28931675129,
 'father': -2657.4010680871115,
 'man': -2364.381056203116,
 'didn': -2338.610516809078,
 'eyes': -1522.0459201106764,
 'room': -1492.0554550144768,
 'woman': -1491.5692496877564,
 'house': -1471.8983915010847,
 'like': -1433.9349075468945,
 'did': -1384.9876569887242,
 'asked': -1381.43744392407,
 'know': -1323.8277319116664,
 'door': -1322.2731998061631,
 'felt': -1300.7665203797783,
 'knew': -1285.1860549975902,
 'head': -1261.4412310307446,
 'wanted': -1245.733858584267,
 'went': -1239.6604893059957,
 'boy': -1220.1553355084038,
 'hand': -1217.958686327311,
 'looked': -1181.0098110927838,
 'tell': -1168.5345348550711,
 'face': -1128.137764731308,
 'got': -1113.6636462202891,
 'said': -1037.9499078796698,
 'night': -985.4249468693233,
 'couldn': -985.330580615611,
 'girl': -981.3402547096658,
 'hands': -974.9604799873531,
 'let': -971.7541887497721}

In [72]:
### preparing to do predictive modelling again

# the top 60 defining words from the respective corpora
top_60_words = list(top_dunning_dict_words_desc.keys()) + list(top_dunning_dict_words_asc.keys())

# dfs with only common words as column titles
explainer_top_words_df = refine_df_columns(top_60_words, explainer_dtm_df)
fiction_top_words_df = refine_df_columns(top_60_words, fiction_dtm_df)

In [118]:
### logistic regression with l2 regularization on top statistical words

# gets training data, testing data, and filenames (of each datapoint in order)
x_train, x_test, y_train, y_test, filenames_train, filenames_test = train_test_split_helper(explainer_top_words_df, fiction_top_words_df, ran = 55)

# creates a model
model = create_and_train_log_reg_l2(x_train, y_train)

# performance report
top_words_report, top_words_incorrect_indices = test_predictive_model(x_test, y_test, model)
accuracy = round(top_words_report.get('accuracy') * 100, 2)

print('The model was ' + str(accuracy) + '% accurate at distinguishing literary fiction from science explainers')

The model was 98.67% accurate at distinguishing literary fiction from science explainers


In [119]:
### finding miscategorized texts
top_words_miscategorized_files = [filenames_test[i] for i in top_words_incorrect_indices]
top_words_miscategorized_files

['NPR/TheNewWorldsHottestPepperPepper.txt',
 'ELECTRIC/ThisApocalypseBroughtToUsBy.txt',
 'CBC/ANewBookLaysOutWhy.txt',
 'PARIS_REVIEW/FromAnUnfinishedNovel.txt',
 'NPR/2PeopleWereHurtInA.txt',
 'TIN_HOUSE/TheMissAprilHouses.txt']

In [120]:
### finding the two closest miscategorized texts

# the feature vectors of the miscategorized texts
miscategorized_datapoints = [x_test[i] for i in top_words_incorrect_indices]

# computing cosine similarity
vector_sim_list, closest_vector_indices = find_closest_vectors(miscategorized_datapoints)

top_words_closest_1 = top_words_miscategorized_files[closest_vector_indices[0]]
top_words_closest_2 = top_words_miscategorized_files[closest_vector_indices[1]]

print(f'The two closests miscategorized texts are:\n\t{top_words_closest_1}\n\t{top_words_closest_2}')

The two closests miscategorized texts are:
	CBC/ANewBookLaysOutWhy.txt
	NPR/2PeopleWereHurtInA.txt


## Dunning log likelihood + topic modelling

## Dunning log likelihood on verbs

## Dunning log likelihood on POS tags

In [76]:
### comparing the statistical frequency of words in each corpus

explainer_pure_pos_dict = sort_top_words(explainer_pos_df.iloc[:, 9:], normalize = False)
fiction_pure_pos_dict = sort_top_words(fiction_pos_df.iloc[:, 9:], normalize = False)

dunning_comparison_pos, dunning_values_pos, = run_dunning_log_likelihood_test(explainer_pure_pos_dict, fiction_pure_pos_dict)
top_dunning_dict_pos_desc, top_dunning_dict_pos_asc = create_top_dunning_dict(dunning_values_pos, length = 15)

In [77]:
### POS tags associated with science explainer corpus
print('Top 15 POS tags stastistically associated with science explainers:\n')
top_dunning_dict_pos_desc

Top 15 POS tags stastistically associated with science explainers:



{'NNS': 26057.40560076355,
 'NNP': 11880.227142034673,
 'CD': 10358.688977236981,
 'POS': 5866.264038410277,
 'JJ': 3695.9474049878736,
 'VBZ': 2706.8421883201554,
 'WDT': 2395.65120182971,
 'JJR': 2373.730070614861,
 'NNPS': 1938.1229419804401,
 'IN': 1882.0549391403838,
 'RBR': 1018.112661011126,
 'JJS': 742.0706699665345,
 'RBS': 689.8235226748163,
 'VBN': 538.1044151086189,
 'MD': 258.905485093258}

In [78]:
### POS tags associated with short fiction corpus
print('Top 15 POS tags stastistically associated with short stories:\n')
top_dunning_dict_pos_asc

Top 15 POS tags stastistically associated with short stories:



{'PRP': -58501.13846990699,
 'VBD': -33769.209531262546,
 'PRP$': -15658.14752117962,
 'RP': -1875.6255851084898,
 'RB': -1667.2321902912172,
 'WP': -751.8584655212762,
 'UH': -646.7086309936711,
 'WRB': -487.07678088331113,
 'PDT': -308.55895399069243,
 'CC': -216.6937950232268,
 'VB': -114.69331468380369,
 'EX': -24.95786316926302,
 'WP$': -6.8933092378428,
 'VBP': -0.05825205682171486,
 'NN': 0.49129599942045843}