# Processing text, extracting and visualising descriptive statistics

In [1]:
import os, re, glob, pickle, string
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import wordnet 


import spacy
# Load spaCy's 'en_core_web_sm' pipeline and keep a handle on its English stop-word set.
nlp = spacy.load('en_core_web_sm')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS #312 stopwords

# Export the Java binary location through the JAVAHOME environment variable
# (presumably read by NLTK's Stanford wrapper imported just below — TODO confirm).
# NOTE(review): this is a hard-coded, Linux-style JVM path, while the nltk_data
# output of this cell points at a /Users/... home directory — verify the path
# actually exists on the machine running the notebook.
java_path = r'/usr/lib/jvm/java-8-oracle/jre/bin/java'
os.environ['JAVAHOME'] = java_path
from nltk.parse.stanford import StanfordParser
# NOTE(review): this cell's output carries the warning
# "Please use nltk.parse.corenlp.CoreNLPParser instead." — the parser built here
# is flagged as deprecated by NLTK. Both jar paths are relative to the notebook's
# working directory.
scp = StanfordParser(path_to_jar='./supportdata/CoreNLP/stanford-corenlp-3.9.2.jar',
           path_to_models_jar='./supportdata/CoreNLP/stanford-english-corenlp-2018-10-05-models.jar')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn 

from pg_dataextraction import GutenbergCorpusBuilder, Author
import utils_tokeniser_vectoriser

[nltk_data] Downloading package punkt to /Users/k1000mbp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/k1000mbp/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/k1000mbp/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Please use nltk.parse.corenlp.CoreNLPParser instead.


In [2]:
# a convenience function to load a previously pickled GutenbergCorpusBuilder object
def pickleloader(filename):
    """Load and return the object stored in the pickle file ``filename``.

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    OSError
        If the file cannot be opened for reading.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while unpickling, so only call
    this on files you created yourself or otherwise trust.
    """
    # open the file for binary reading; the context manager guarantees the
    # handle is closed both on normal return and if unpickling raises
    with open(filename, 'rb') as fileObject:
        return pickle.load(fileObject)

# Load the previously pickled corpus object into PGcorpus.
PGcorpus = pickleloader("PG-eng-author-min3v2019424.pickle")
# NOTE: run/re-run the cells that provide the GutenbergCorpusBuilder and Author
# classes before loading the pickle file; the unpickler needs those class
# definitions to rebuild the attribute structure of the stored objects.

### 0. Load the data

In [3]:
# Read the MongoDB JSON exports for authors and books into pandas dataframes.
authors_df = pd.read_json("./data/mongo_dumps/jsondump_authors_mongo.json")
books_df = pd.read_json("./data/mongo_dumps/jsondump_books_mongo.json")

# books_df holds the final selection of authors/books for the corpus, so the
# author table is restricted to the authornums that occur in books_df and the
# two tables are then joined on authornum.

# 1-3. index a fresh authors frame by "authornum" (keeping the column as well;
#      authors_df itself is left untouched) and keep only the authors that
#      appear in books_df, in the order they first occur there
corpus_authors_df = (
    authors_df
    .set_index("authornum", drop=False)
    .loc[list(books_df["authornum"].unique())]
)

# 4. books_df gets the same authornum index so both frames align on it
books_df = books_df.set_index("authornum", drop=False)

# 5. column-wise inner join on the shared authornum index, authors on the left
#    (per the original note, every authornum in books_df is expected to be
#    present in corpus_authors_df).
# NOTE(review): books_df can carry the same authornum on several rows; confirm
# that pd.concat aligns a repeated index as intended on the pandas version in use.
corpus_authorbook_df = pd.concat([corpus_authors_df, books_df], axis=1, join="inner")

# 7. get the book txt file names
def parse_book_filename(filename):
    """Split a sample-book path into its author number and book number.

    File names are expected to look like ``<authornum>_<booknum>.txt``
    (e.g. ``a4501_16534.txt``); any directory part of ``filename`` is ignored.

    Parameters
    ----------
    filename : str
        Path to a book text file.

    Returns
    -------
    dict
        ``{"authornum": str, "booknum": str, "filename": filename}``.

    Raises
    ------
    ValueError
        If the file name does not contain an ``_``-separated author and book part.
    """
    stem = os.path.basename(filename)
    # drop the ".txt" suffix as a whole; str.rstrip(".txt") would instead strip
    # any trailing '.', 't' or 'x' characters and can eat into the name itself
    if stem.endswith(".txt"):
        stem = stem[:-len(".txt")]
    parts = stem.split("_")
    if len(parts) < 2:
        raise ValueError(
            "unexpected book file name (want <authornum>_<booknum>.txt): {!r}".format(filename)
        )
    return {"authornum": parts[0], "booknum": parts[1], "filename": filename}


filenames = glob.glob("./data/booksample_txt/*")
files = [parse_book_filename(filename) for filename in filenames]
# explicit columns keep the frame well-formed even when no files are found
files_df = pd.DataFrame(files, columns=["authornum", "booknum", "filename"])


# 8. attach each book's file path to its author/book row by merging on booknum
# (DataFrame.merge performs an inner join by default). Per the original note the
# two frames are expected to have the same number of rows, and booknum is int64
# on the author/book side, whereas files_df holds it as a string parsed from the
# file name — so files_df's booknum is cast to int64 before the merge.
files_df = files_df.astype({"booknum": "int64"})
corpus_authorbook_df = corpus_authorbook_df.merge(files_df, on="booknum")


# 9. add the book titles to the dataframe
# Each row's books_info is keyed by booknum (as a string) and holds the title,
# so the title is a direct lookup of the row's own booknum. Looking it up per
# row keeps `titles` aligned with the dataframe and reports exactly which
# booknum is missing instead of failing later with a length mismatch.
titles = []
for books_info, booknum in zip(corpus_authorbook_df["books_info"],
                               corpus_authorbook_df["booknum"]):
    key = str(booknum)
    if key not in books_info:
        raise KeyError("booknum {} has no entry in its row's books_info".format(key))
    titles.append(books_info[key])
corpus_authorbook_df["booktitle"] = titles

# 10. discard books_info and the other columns that are no longer needed, and
# give the surviving authornum column (suffixed "_y" by the merge) its plain name
corpus_authorbook_df = (
    corpus_authorbook_df
    .drop(columns=["books_info", "_id", "authornum_x"])
    .rename(columns={'authornum_y':'authornum'})
)
corpus_authorbook_df.head(2)

Unnamed: 0,authorabstracts,authorname,literarymovements,wiki_info,booknum,selected_sents,authornum,filename,booktitle
0,"{'en': 'Christian Nephi Anderson (January 22, ...",Anderson Nephi,[lds fiction],{'en': 'http://en.wikipedia.org/wiki/Nephi_And...,16534,[When did Joseph visit Jackson county the seco...,a4501,./data/booksample_txt/a4501_16534.txt,A Young Folks' History of the Church of Jesus ...
1,"{'en': 'Christian Nephi Anderson (January 22, ...",Anderson Nephi,[lds fiction],{'en': 'http://en.wikipedia.org/wiki/Nephi_And...,17249,[It has been revealed to and tried by men in v...,a4501,./data/booksample_txt/a4501_17249.txt,Added Upon\rA Story (English) (as Author)


### 1. Preprocessing and Feature Engineering

##### 1a. Concreteness score 

In [None]:
# Load the concreteness ratings spreadsheet (Brysbaert et al.).
concrete_df = pd.read_excel("./supportdata/ConcretenessRatings/Concreteness_ratings_Brysbaert_et_al_BRM.xlsx")
# Some "words" in the ratings are really two-word expressions (e.g. baking soda);
# they are flagged with Bigram == 1. Preview the first two of them.
concrete_df.loc[concrete_df["Bigram"] == 1].head(2)

In [None]:
# show every rating row flagged as a bigram (Bigram == 1)
concrete_df.loc[concrete_df["Bigram"] == 1]

### 2. Visualisation and descriptive statistics

### 3. Exporting the data