<a href="https://colab.research.google.com/github/faithrts/Science_Explainers/blob/main/analysis/word_and_pos_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
### importing libraries

# basic libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# to download files
from google.colab import files

import os
import re
import ast
import math
import codecs
import pickle

# for word2vec
from gensim.models import Word2Vec

# scipy
from scipy.stats import chi2  ## for stats
from scipy import spatial     ## for cosine similarity

# sklearn libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.decomposition import LatentDirichletAllocation

# NLTK
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import SnowballStemmer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# silencing the warnings
pd.options.mode.chained_assignment = None  # default='warn'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Importing datasets

In [None]:
### cloning git repos

!git clone https://github.com/faithrts/Science_Explainers
#!git clone https://github.com/faithrts/Short_Fiction

Cloning into 'Science_Explainers'...
remote: Enumerating objects: 1993, done.[K
remote: Counting objects: 100% (202/202), done.[K
remote: Compressing objects: 100% (151/151), done.[K
remote: Total 1993 (delta 102), reused 116 (delta 40), pack-reused 1791[K
Receiving objects: 100% (1993/1993), 100.63 MiB | 6.15 MiB/s, done.
Resolving deltas: 100% (890/890), done.
Filtering content: 100% (16/16), 847.55 MiB | 53.13 MiB/s, done.
Encountered 2 file(s) that should have been pointers, but weren't:
	dataset/article_urls.csv
	dataset/science_explainers_dataset.csv


In [None]:
### importing dataframes of preprocessed data
### (reads one CSV per dataset/data-type pair from the cloned Science_Explainers repo)

dataset_ids = ['explainer', 'fiction', 'news', 'sci_paper']
data_types = ['sw_dtm', 'dtm', 'tfidf', 'pos']

for dataset in dataset_ids:
  for d_type in data_types:

    # relative path, so the repo must have been cloned into the working directory
    path_to_csv = f'Science_Explainers/dataset/preprocessed_data/{dataset}_{d_type}.csv'

    # builds a notebook-level variable name from the two ids and binds the
    # loaded DataFrame to it through globals(), e.g. this is equivalent to
    # explainer_dtm_df = pd.read_csv('Science_Explainers/dataset/preprocessed_data/explainer_dtm.csv')
    # NOTE: 4 datasets x 4 data types = 16 variables named <dataset>_<d_type>_df
    globals()[f'{dataset}_{d_type}_df'] = pd.read_csv(path_to_csv)

## Helper functions

### Data processing

In [None]:
### makes all the column names UPPERCASE
def col_names_to_uppercase(df):
  '''
  Upper-cases every column label of a DataFrame.

  The frame is modified in place (its ``columns`` attribute is reassigned)
  and the very same object is handed back, so no copy is made.

  :param df: DataFrame whose column labels all support ``.upper()``
  :return: the same DataFrame, now with upper-cased column labels
  '''
  df.columns = [label.upper() for label in df.columns]
  return df

In [None]:
def refine_df_columns(df, list_of_titles):
  '''
  Selects a subset of columns from a DataFrame.

  :param df: the DataFrame to select from
  :param list_of_titles: the column labels to keep, in the order wanted
  :return: the result of indexing ``df`` with ``list_of_titles``
  '''
  return df[list_of_titles]

### Modelling

In [None]:
### custom pre-processor to eliminate numbers and instances of "_" and "\"
def my_preprocessor(text):
    '''
    Lower-cases the text, then removes every digit, underscore and backslash.

    :param text: the raw string to clean
    :return: the cleaned, lower-cased string
    '''
    lowered = text.lower()
    return re.sub('([0-9_\\\\])', '', lowered)

In [None]:
def convert_df_to_data(df):
  '''
  Converts a DataFrame into a plain nested list, one inner list per row.

  :param df: the DataFrame to convert
  :return: ``df.values`` as a list of lists
  '''
  rows = df.values.tolist()
  return rows

### Analysis

In [None]:
### sums every column of the frame it is given; it does no column slicing
### itself, so callers must drop any non-count columns first (e.g. with .iloc)
def sort_top_words(matrix_df, normalize = True):
  '''
  Totals each column of a matrix and ranks the columns by that total.

  :param matrix_df: DataFrame whose columns are all summable
  :param normalize: when truthy, every column total is divided by the number
                    of columns (the length of the summed Series)
  :return: dict mapping column label to its (optionally normalized) total,
           ordered from the largest value to the smallest
  '''
  # column-wise totals, as a pandas Series indexed by column label
  totals = matrix_df.sum()

  if normalize:
    totals = totals / len(totals)

  ranked = totals.sort_values(ascending = False)
  return ranked.to_dict()

In [None]:
# function adapted from https://github.com/dhmit/gender_novels/blob/master/gender_novels/analysis/dunning.py
def dunn_individual_word(total_words_in_corpus_1, total_words_in_corpus_2,
                         count_of_word_in_corpus_1,
                         count_of_word_in_corpus_2):
    '''
    applies dunning log likelihood to one word, given its count in two corpora

    the sign of the result says which corpus the word leans towards:
    (+) the word is over-represented in corpus 1
    (-) the word is over-represented in corpus 2

    a count of zero contributes nothing to the statistic (0 * log(0) is taken
    to be 0), so a word that is missing from one corpus no longer raises a
    math domain error

    :param total_words_in_corpus_1: total number of words in corpus 1
    :param total_words_in_corpus_2: total number of words in corpus 2
    :param count_of_word_in_corpus_1: occurrences of the word in corpus 1
    :param count_of_word_in_corpus_2: occurrences of the word in corpus 2
    :return: the signed log likelihood; 0.0 when the word occurs in neither
             corpus or when both corpora are empty
    >>> total_words_m_corpus = 8648489
    >>> total_words_f_corpus = 8700765
    >>> wordcount_female = 1000
    >>> wordcount_male = 50
    >>> dunn_individual_word(total_words_m_corpus,total_words_f_corpus,wordcount_male,wordcount_female)
    -1047.8610274053995
    '''
    a = count_of_word_in_corpus_1
    b = count_of_word_in_corpus_2
    c = total_words_in_corpus_1
    d = total_words_in_corpus_2

    # nothing to compare: avoids dividing by zero / taking log(0) below
    if a + b == 0 or c + d == 0:
        return 0.0

    # expected counts if the word were equally frequent in both corpora
    e1 = c * (a + b) / (c + d)
    e2 = d * (a + b) / (c + d)

    # observed * log(observed / expected), with the zero-count limit of 0
    term_1 = a * math.log(a / e1) if a > 0 else 0.0
    term_2 = b * math.log(b / e2) if b > 0 else 0.0

    dunning_log_likelihood = 2 * (term_1 + term_2)

    # negative when corpus 1 has fewer occurrences than expected; a word that
    # is entirely absent from corpus 1 belongs to corpus 2 by definition
    if a == 0 or term_1 < 0:
        dunning_log_likelihood = -dunning_log_likelihood

    return dunning_log_likelihood

In [None]:
# function from https://github.com/dhmit/gender_novels/blob/master/gender_novels/analysis/dunning.py
def dunning_total(counter1, counter2, filename_to_pickle=None):
    '''
    scores every word found in both counters with dunn_individual_word

    (+) scores lean towards counter1
    (-) scores lean towards counter2
    the further a score is from zero, the more distinctive the word is for
    the counter it leans towards

    words that appear in only one counter are skipped, and so are shared
    words whose combined count is below 10

    filename_to_pickle is accepted but is not used by this implementation:
    nothing is read from or written to disk

    >>> from collections import Counter
    >>> female_counter = Counter({'he': 1,  'she': 10, 'and': 10})
    >>> male_counter =   Counter({'he': 10, 'she': 1,  'and': 10})
    >>> results = dunning_total(female_counter, male_counter)

    # each word maps to a dict holding its dunning score...
    >>> results['he']['dunning']
    -8.547243830635558

    # ... its total count and its count in each counter
    >>> results['he']['count_total'], results['he']['count_corp1'], results['he']['count_corp2']
    (11, 1, 10)

    # ... and the matching relative frequencies
    >>> results['he']['freq_total'], results['he']['freq_corp1'], results['he']['freq_corp2']
    (0.2619047619047619, 0.047619047619047616, 0.47619047619047616)

    :param counter1: mapping from word to count for the first corpus
    :param counter2: mapping from word to count for the second corpus
    :param filename_to_pickle: unused
    :return: dict mapping each scored word to its result dict
    '''

    # size of each corpus = sum of all of its word counts
    corpus1_size = sum(counter1[term] for term in counter1)
    corpus2_size = sum(counter2[term] for term in counter2)
    combined_size = corpus1_size + corpus2_size

    scored_terms = {}
    for term in counter1:
        # only words present in both counters are compared
        if term not in counter2:
            continue

        count1 = counter1[term]
        count2 = counter2[term]
        shared_count = count1 + count2

        # too rare overall to give a meaningful score
        if shared_count < 10:
            continue

        scored_terms[term] = {
            'dunning': dunn_individual_word(corpus1_size, corpus2_size, count1, count2),
            'count_total': shared_count,
            'count_corp1': count1,
            'count_corp2': count2,
            'freq_total': shared_count / combined_size,
            'freq_corp1': count1 / corpus1_size,
            'freq_corp2': count2 / corpus2_size
        }

    return scored_terms

In [None]:
def run_dunning_log_likelihood_test(count_dict_1, count_dict_2):
  '''
  Runs dunning_total on two count dicts and also pulls out the bare scores.

  :param count_dict_1: mapping from term to count for the first corpus
  :param count_dict_2: mapping from term to count for the second corpus
  :return: (the full dunning_total result, a dict mapping each term to just
           its 'dunning' score)
  '''
  full_results = dunning_total(count_dict_1, count_dict_2)

  # keeps only the score out of each per-term result dict
  scores_only = {term: stats['dunning'] for term, stats in full_results.items()}

  return full_results, scores_only

In [None]:
def create_top_dunning_dict(dunning_values, length = ''):
  '''
  Builds the two ends of a Dunning score ranking.

  :param dunning_values: dict mapping each term to its Dunning score
  :param length: how many terms to keep at each end; '' (the default), None,
                 or a number larger than the dict all mean "keep every term";
                 a negative number keeps nothing
  :return: (dict of the `length` highest-scoring terms, highest first,
            dict of the `length` lowest-scoring terms, lowest first)
  '''
  # the keys of the dunning_values dictionary, ranked by score both ways
  sorted_keys_desc = sorted(dunning_values, key = dunning_values.get, reverse = True)
  sorted_keys_asc = sorted(dunning_values, key = dunning_values.get, reverse = False)

  # if no length specified or specified length too long, do all
  # (None is checked first so it is never compared against an int)
  if length is None or length == '' or length > len(sorted_keys_desc):
    length = len(sorted_keys_desc)

  # a negative slice bound would count from the end, so clamp it to "nothing"
  length = max(length, 0)

  top_dunning_dict_desc = {word: dunning_values[word] for word in sorted_keys_desc[:length]}
  top_dunning_dict_asc = {word: dunning_values[word] for word in sorted_keys_asc[:length]}

  return top_dunning_dict_desc, top_dunning_dict_asc

# Tests

## Dunning log likelihood on word counts



In [None]:
from tabulate import tabulate

In [None]:
def compare_corpora(matrix_df_1, matrix_df_2, verbose = False, filename = ''):
  '''
  Compares two count matrices with the Dunning log likelihood test and
  reports the 30 highest- and 30 lowest-scoring columns.

  :param matrix_df_1: count matrix of corpus 1 (count columns only)
  :param matrix_df_2: count matrix of corpus 2 (count columns only)
  :param verbose: when truthy, prints both tables with a corpus label
  :param filename: when not '', both tables are written to this path
                   (corpus 1 first), overwriting any existing file
  :return: None
  '''
  # raw (un-normalized) column totals for each corpus
  totals_1 = sort_top_words(matrix_df_1, normalize = False)
  totals_2 = sort_top_words(matrix_df_2, normalize = False)

  _, scores = run_dunning_log_likelihood_test(totals_1, totals_2)
  leaning_to_1, leaning_to_2 = create_top_dunning_dict(scores, length = 30)

  rows_1 = leaning_to_1.items()
  rows_2 = leaning_to_2.items()

  if verbose:
    print('Corpus 1:')
    print(tabulate(rows_1, headers = ['word', 'score']))

    print('')

    print('Corpus 2:')
    print(tabulate(rows_2, headers = ['word', 'score']))

  # no file requested, nothing more to do
  if filename == '':
    return

  table_1 = tabulate(rows_1, headers = ['Word', 'Score'])
  table_2 = tabulate(rows_2, headers = ['Word', 'Score'])

  with open(filename, 'w+') as cur_file:
    for chunk in (table_1, '\n', table_2):
      cur_file.write(chunk)

In [None]:
# explainers (corpus 1) vs. fiction (corpus 2), using the POS frames from
# column 9 onward; the two tables are written to pos_fiction.txt
compare_corpora(explainer_pos_df.iloc[:, 9:], fiction_pos_df.iloc[:, 9:], filename = 'pos_fiction.txt')

In [None]:
# explainers (corpus 1) vs. news (corpus 2); the tables go to pos_news.txt
# NOTE(review): the explainer frame is sliced from column 9 but the news frame
# from column 6 -- presumably the two CSVs carry a different number of leading
# non-count columns; confirm against the files
compare_corpora(explainer_pos_df.iloc[:, 9:], news_pos_df.iloc[:, 6:], filename = 'pos_news.txt')

In [None]:
# explainers (corpus 1) vs. scientific papers (corpus 2); the tables go to
# pos_sci_paper.txt
# NOTE(review): the explainer frame is sliced from column 9 but the sci_paper
# frame from column 6 -- presumably the two CSVs carry a different number of
# leading non-count columns; confirm against the files
compare_corpora(explainer_pos_df.iloc[:, 9:], sci_paper_pos_df.iloc[:, 6:], filename = 'pos_sci_paper.txt')

## Dunning log likelihood on POS tags

In [None]:
### comparing how often each column of the POS frames (column 9 onward)
### occurs in the explainer corpus versus the fiction corpus

# raw column totals, one dict per corpus
explainer_pure_pos_dict = sort_top_words(explainer_pos_df.iloc[:, 9:], normalize = False)
fiction_pure_pos_dict = sort_top_words(fiction_pos_df.iloc[:, 9:], normalize = False)

# explainers are passed first, so they are "corpus 1" of the test
dunning_comparison_pos, dunning_values_pos, = run_dunning_log_likelihood_test(explainer_pure_pos_dict, fiction_pure_pos_dict)
# keeps the 15 highest (desc) and the 15 lowest (asc) scores
top_dunning_dict_pos_desc, top_dunning_dict_pos_asc = create_top_dunning_dict(dunning_values_pos, length = 15)

In [None]:
### POS tags associated with science explainer corpus
### (the 15 highest scores; explainers were passed as the first corpus, and
### the positive end of the score belongs to the first corpus)
print('Top 15 POS tags (science explainers):\n')
top_dunning_dict_pos_desc

Top 15 POS tags (science explainers):



{'NNS': 21760.6281917015,
 'CD': 6613.039143529279,
 'NNP': 5130.4600554325625,
 'POS': 4741.982076744441,
 'JJ': 3369.6783182861036,
 'VBZ': 3218.556339050927,
 'JJR': 2417.7164420526924,
 'WDT': 2335.607703333897,
 'IN': 1667.5106853350699,
 'NNPS': 1604.582961501133,
 'RBR': 1026.4570937808144,
 'JJS': 655.310095308324,
 'VBN': 513.2617484916518,
 'RBS': 470.53268677068615,
 'MD': 332.5171186622274}

In [None]:
### POS tags associated with short fiction corpus
### (the 15 lowest scores; fiction was passed as the second corpus, and the
### negative end of the score belongs to the second corpus)
print('Top 15 POS tags (short stories):\n')
top_dunning_dict_pos_asc

Top 15 POS tags (short stories):



{'PRP': -46320.77891296836,
 'VBD': -26336.026567843386,
 'PRP$': -13040.276514577181,
 'RP': -1575.2704498059793,
 'RB': -1065.6819441864463,
 'UH': -529.9823236867871,
 'WP': -358.9196823132338,
 'WRB': -274.0735662347638,
 'PDT': -246.87920259144562,
 'CC': -189.31592856545194,
 'FW': -85.6488511310958,
 'VB': -22.21861838888094,
 'WP$': -4.113230055418221,
 'NN': -2.5493220451248817,
 'EX': -0.5988371577817659}