<a href="https://colab.research.google.com/github/faithrts/Science_Explainers/blob/main/analysis/analysis_setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [2]:
### importing libraries

# basic libraries
import pandas as pd
from collections import Counter

# to download files
# from google.colab import files

import os
import re
import codecs

# sklearn libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin

# NLTK
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import SnowballStemmer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Importing datasets

In [3]:
### cloning git repos

!git clone https://github.com/faithrts/Science_Explainers
#!git clone https://github.com/dhmit/gender_novels
#!git clone https://github.com/faithrts/Short_Fiction

Cloning into 'Science_Explainers'...
remote: Enumerating objects: 1912, done.[K
remote: Counting objects: 100% (121/121), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 1912 (delta 73), reused 54 (delta 26), pack-reused 1791[K
Receiving objects: 100% (1912/1912), 97.86 MiB | 6.05 MiB/s, done.
Resolving deltas: 100% (861/861), done.
Updating files: 100% (20/20), done.


In [4]:
### saving datasets into dataframes

# the explainer metadata comes from the Science_Explainers repository cloned above
explainer_df = pd.read_csv('Science_Explainers/dataset/science_explainers_dataset.csv')
# NOTE(review): 'short_fiction_dataset.csv' is looked up in the working directory and the
# Short_Fiction clone above is commented out, so this line raises FileNotFoundError unless
# the file is supplied some other way (presumably a manual upload) — confirm the intended source
fiction_df = pd.read_csv('short_fiction_dataset.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'short_fiction_dataset.csv'

In [None]:
### unzipping science explainer files

!unzip Science_Explainers/dataset/science_txt_files.zip

In [None]:
### unzipping short fiction files

!unzip fiction_txt_files.zip

## Helper functions

### Data processing

In [5]:
def uppercase_columns(df):
  """Rename every column of `df` to its upper-case form, in place.

  The dataframe is modified directly and nothing is returned.
  """
  df.columns = [label.upper() for label in df.columns]

In [6]:
def count_text_length(df):
  """Add a 'LENGTH' column holding the length of each row's 'TEXT' value.

  The length is `len(text)`, i.e. the number of characters, not the number
  of words. The dataframe is modified in place and nothing is returned.
  """
  # assigning the whole column at once: writing to the rows yielded by
  # iterrows() only changes temporary copies and never reaches the dataframe
  df['LENGTH'] = [len(text) for text in df['TEXT']]

In [7]:
def load_text_content(df, path):
  """Read the text file of every row and store its contents in a 'TEXT' column.

  Args:
    df: dataframe with a 'FILENAME' column naming one file per row.
    path: folder prefix that is concatenated directly with each filename,
      so it has to end with a path separator (e.g. 'science_txt_files/').

  Returns:
    The same dataframe object, modified in place, with the new 'TEXT' column.

  Raises:
    FileNotFoundError: if a file cannot be found under any spelling tried.
  """
  import unicodedata  # local import: only needed for the filename repair below

  # adds new column to the dataframe
  df['TEXT'] = ''

  for index, cur_filename in df['FILENAME'].items():
    cur_path = path + cur_filename

    # accented characters can be stored composed (NFC) or decomposed (NFD);
    # when the file on disk uses the other spelling, rename it to the
    # spelling used in the dataframe so the two agree from now on
    if not os.path.isfile(cur_path):
      for form in ('NFC', 'NFD'):
        candidate = path + unicodedata.normalize(form, cur_filename)
        if candidate != cur_path and os.path.isfile(candidate):
          os.rename(candidate, cur_path)
          break

    # the with-block closes the file handle as soon as the text is read
    with codecs.open(cur_path, 'r', encoding = 'utf8') as cur_file:
      # saving the text in the dataframe
      df.at[index, 'TEXT'] = cur_file.read()

  return df

In [8]:
def my_preprocessor(text):
    """Custom pre-processor: lowercases the text, then eliminates digits and
    every instance of "_", "\\", and "—"."""
    return re.sub('([0-9—_\\\\])', '', text.lower())

In [9]:
def col_names_to_uppercase(df):
  """Make all the column names of `df` UPPERCASE, in place, and return `df`."""
  df.columns = [label.upper() for label in df.columns]
  return df

In [10]:
class StemWords(BaseEstimator, TransformerMixin):
  """Transformer step that replaces every word of every passage with its
  English Snowball stem."""

  def __init__(self):
    pass

  def fit(self, X, y=None):
    # nothing to learn: the transformer is stateless
    return self

  def transform(self, list_of_passages):
    """Return a new list in which each passage is tokenized, stemmed token
    by token, and joined back together with single spaces."""
    stemmer = SnowballStemmer('english')

    return [
      ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(passage))
      for passage in list_of_passages
    ]

In [11]:
def refine_df_columns(list_of_titles, df):
  """Return the selection `df[list_of_titles]`, i.e. only the columns to keep."""
  return df[list_of_titles]

### Adding features to dataframes

In [12]:
def add_dtm(df, focus_col, keep_symbols = False):
  """Append a document-term matrix built from `df[focus_col]` to `df`.

  Args:
    df: dataframe with one document per row.
    focus_col: name of the column holding the text to vectorize.
    keep_symbols: when True the text is not lowercased and a token is a run
      of capital letters optionally followed by '$' characters; when False
      the text is cleaned with `my_preprocessor` and English stop words are
      removed.

  Returns:
    A new dataframe: the columns of `df` followed by one count column per
    term kept by the vectorizer (`min_df = 5`).
  """
  # using CountVectorizer to make a DTM based on the words in the corpus
  if keep_symbols:
    # raw string so `\$` reaches the regex engine without an invalid-escape warning
    vectorizer = CountVectorizer(lowercase = False, token_pattern = r'[A-Z]+\$*', min_df = 5)
  else:
    vectorizer = CountVectorizer(preprocessor = my_preprocessor, stop_words = 'english', min_df = 5)

  dtm = vectorizer.fit_transform(df[focus_col])
  words = vectorizer.get_feature_names_out()

  # converting the sparse matrix to a dense frame labelled with the terms;
  # reusing df's index keeps each count row attached to its own document,
  # because concat aligns on the index and a fresh 0..n-1 index would not
  # match a filtered or re-indexed df
  DTM = pd.DataFrame(dtm.toarray(), columns = words, index = df.index)

  # attaching the DTM to the original dataframe
  return pd.concat([df, DTM], axis = 1)

In [13]:
def add_tf_idf(df, focus_col):
  """Append the tf-idf value of each term in `df[focus_col]` to `df`.

  Args:
    df: dataframe with one document per row.
    focus_col: name of the column holding the text to vectorize.

  Returns:
    A new dataframe: the columns of `df` followed by one tf-idf column per
    term kept by the vectorizer (`my_preprocessor` cleaning, English stop
    words removed, `min_df = 5`).
  """
  # using TfidfVectorizer to compute the tf-idf values of each word
  vectorizer = TfidfVectorizer(preprocessor = my_preprocessor, stop_words = 'english', min_df = 5)

  tf_idf = vectorizer.fit_transform(df[focus_col])
  words = vectorizer.get_feature_names_out()

  # converting the sparse matrix to a dense frame labelled with the terms;
  # reusing df's index keeps each tf-idf row attached to its own document,
  # because concat aligns on the index and a fresh 0..n-1 index would not
  # match a filtered or re-indexed df
  TF_IDF = pd.DataFrame(tf_idf.toarray(), columns = words, index = df.index)

  # attaches the tf-idf to the original dataframe
  return pd.concat([df, TF_IDF], axis = 1)

In [14]:
def count_pos_tags(df):
  """Count how often each POS tag occurs across all rows of `df`.

  Assumes the POS tags are in a column called 'POS TAGS', holding one
  iterable of tags per row.

  Returns:
    A tuple `(tag_counts, sorted_tag_counts)`: a Counter mapping each tag to
    its number of occurrences, and a list of the distinct tags in reverse
    alphabetical order (the tags themselves are sorted, not their counts).
  """
  # feeding the rows to the Counter one at a time avoids concatenating all
  # the lists into one big list, and also copes with an empty dataframe
  tag_counts = Counter()
  for tags in df['POS TAGS']:
    tag_counts.update(tags)

  # sorts the POS tags
  sorted_tag_counts = sorted(tag_counts, reverse = True)

  return tag_counts, sorted_tag_counts

In [44]:
def add_pos_tags(df, focus_col):
  """Return a copy of `df` with three POS-tag columns computed from `focus_col`.

  'POS TAG TOKENS' holds the (word, tag) pairs returned by `pos_tag`,
  'POS TAGS' holds only the tags, and 'POS TAGS STRING' holds the tags
  joined by single spaces. The input dataframe itself is not modified.
  """
  tagged_df = df.copy()

  # placeholder columns, filled in row by row below
  for col_name in ('POS TAG TOKENS', 'POS TAGS', 'POS TAGS STRING'):
    tagged_df[col_name] = ''

  for row_label, passage in df[focus_col].items():
    word_tag_pairs = pos_tag(word_tokenize(passage))
    tag_sequence = [tag for _, tag in word_tag_pairs]

    tagged_df.at[row_label, 'POS TAG TOKENS'] = word_tag_pairs
    tagged_df.at[row_label, 'POS TAGS'] = tag_sequence
    tagged_df.at[row_label, 'POS TAGS STRING'] = ' '.join(tag_sequence)

  return tagged_df

In [16]:
def create_3_dfs(df, focus_col):
    """Build the word-count, tf-idf and POS-tag dataframes for `df`.

    Returns a tuple `(dtm_df, tfidf_df, pos_df)`. In all three the
    `focus_col` column is dropped; `pos_df` is the output of `add_pos_tags`
    extended with a document-term matrix over its 'POS TAGS STRING' column.
    """
    text_col = [focus_col]

    dtm_df = add_dtm(df, focus_col).drop(columns = text_col)
    tfidf_df = add_tf_idf(df, focus_col).drop(columns = text_col)

    # the tags are counted like words, keeping symbols such as '$' in the tokens
    tags_only_df = add_pos_tags(df, focus_col).drop(columns = text_col)
    pos_df = add_dtm(tags_only_df, 'POS TAGS STRING', keep_symbols = True)

    return dtm_df, tfidf_df, pos_df

# Loading content

In [18]:
# loads the dataset that is later passed to create_3_dfs as cur_df
# NOTE(review): the file is read from the working directory and no visible cell creates it — confirm how it is supplied
cur_df = pd.read_csv('scientific_papers_dataset.csv')

In [None]:
### adding the text of each article as a column in the dataframe
# load_text_content reads the file named in each row's 'FILENAME' column from the given
# folder into a 'TEXT' column; count_text_length then works on that 'TEXT' column
# NOTE(review): the folder names presumably match what the unzip cells extract — confirm

# science explainers
explainer_df = load_text_content(explainer_df, 'science_txt_files/')
count_text_length(explainer_df)

# fiction
fiction_df = load_text_content(fiction_df, 'fiction_txt_files/')
count_text_length(fiction_df)

## Extending dataframes

In [None]:
### adding the DTM, TF-IDF, and POS tag count to the dataframes
# each call returns three new dataframes (word counts, tf-idf values, POS tag counts)
# built from the 'TEXT' column, which is dropped from the results

explainer_dtm_df, explainer_tfidf_df, explainer_pos_df = create_3_dfs(explainer_df, 'TEXT')
fiction_dtm_df, fiction_tfidf_df, fiction_pos_df = create_3_dfs(fiction_df, 'TEXT')

In [45]:
### adding the DTM, TF-IDF, and POS tag count to the given dataframe
# NOTE(review): cur_df is not passed through load_text_content in any visible cell,
# so this assumes the CSV it was read from already has a 'TEXT' column — confirm

cur_dtm_df, cur_tfidf_df, cur_pos_df = create_3_dfs(cur_df, 'TEXT')

# Downloading files

In [47]:
### pickling dataframes

import pickle

dfs_to_download = ['cur_dtm_df', 'cur_tfidf_df', 'cur_pos_df']

for df_name in dfs_to_download:
  # 'cur_dtm_df' -> 'cur_dtm.pkl'
  filename = df_name.split('_df')[0] + '.pkl'

  with open(filename, 'wb') as cur_file:  # pickle needs a binary file
    # looking the variable up by name so that the dataframe itself is
    # serialized, not the string holding its name
    pickle.dump(globals()[df_name], cur_file)


In [None]:
### downloading csv of dataframes

# files.download only exists inside Google Colab; when the module is missing
# the CSVs are still written to the working directory, just not downloaded
try:
  from google.colab import files
except ImportError:
  files = None

dfs_to_download = ['explainer_dtm_df', 'fiction_dtm_df',
                   'explainer_tfidf_df', 'fiction_tfidf_df',
                   'explainer_pos_df', 'fiction_pos_df']

for df_name in dfs_to_download:
  # 'explainer_dtm_df' -> 'explainer_dtm.csv'
  filename = df_name.split('_df')[0] + '.csv'

  # plain namespace lookup instead of eval: the names are simple variables
  globals()[df_name].to_csv(filename, index = False, escapechar='\\')

  if files is not None:
    files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>