### Import Packages

In [13]:
import os
import json
import string
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import PlaintextCorpusReader, stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Create Document Corpus

In [14]:
corpus_dir = "./Literature-original"
corpus = PlaintextCorpusReader(corpus_dir, ".*\.txt")
file_names = corpus.fileids()
file_names

['Chronicles of Narnia. Prince Caspian.txt',
 'Chronicles of Narnia. The Horse and His Boy.txt',
 'Chronicles of Narnia. The Last Battle.txt',
 'Chronicles of Narnia. The Lion, the Witch and the Wardrobe.txt',
 'Chronicles of Narnia. The Magicians Nephew.txt',
 'Chronicles of Narnia. The Silver Chair.txt',
 'Chronicles of Narnia. The Voyage of the Dawn Treader.txt',
 'Fantastic Beasts and Where to Find Them.txt',
 'Fantastic Beasts. The Crimes of Grindelwald.txt',
 'Fantastic Beasts. The Secrets of Dumbledore.txt',
 'Harry Potter and the Chamber of Secrets.txt',
 'Harry Potter and the Deathly Hallows Part 1.txt',
 'Harry Potter and the Deathly Hallows Part 2.txt',
 'Harry Potter and the Goblet of Fire.txt',
 'Harry Potter and the Half-Blood Prince.txt',
 'Harry Potter and the Order of the Phoenix.txt',
 'Harry Potter and the Philosophers Stone.txt',
 'Harry Potter and the Prisoner of Azkaban.txt',
 'Twilight Saga. Breaking Dawn Part 1.txt',
 'Twilight Saga. Breaking Dawn Part 2.txt',
 

### Corpus documents preprocessing

In [15]:
documents = {}
for file_name in file_names:
    documents[file_name] = corpus.raw(file_name)
print(json.dumps(documents, indent=4, ensure_ascii=False))

{
    "Chronicles of Narnia. Prince Caspian.txt": "Peter, Susan, Edmund, and Lucy Pevensie are magically whisked away from a British railway station to a beach near an old and ruined castle. They determine the ruin is Cair Paravel, where they once ruled as the kings and queens of Narnia. They discover the treasure vault where Peter's sword and shield, Susan's bow and arrows, and Lucy's dagger and bottle of magical cordial are stored. Susan's horn for summoning help is missing, as she left it in the woods the day they returned to England after their prior visit to Narnia. Although only a year has passed in England, 1300 years have passed in Narnia.[a]\r\n\r\nThe children rescue Trumpkin the dwarf from soldiers who are about to drown him. Trumpkin tells the children Narnia's history since their disappearance: Telmarines conquered Narnia, which is now ruled by King Miraz and his wife, Queen Prunaprismia. Miraz usurped the throne by killing his brother, King Caspian IX, the father of Princ

In [16]:
lengths = {}
for file_name in documents:
    lengths[file_name] = {
        "pre": len(word_tokenize(documents[file_name]))
    } 
    
print(json.dumps(lengths, indent=4, ensure_ascii=False))

{
    "Chronicles of Narnia. Prince Caspian.txt": {
        "pre": 657
    },
    "Chronicles of Narnia. The Horse and His Boy.txt": {
        "pre": 850
    },
    "Chronicles of Narnia. The Last Battle.txt": {
        "pre": 1101
    },
    "Chronicles of Narnia. The Lion, the Witch and the Wardrobe.txt": {
        "pre": 793
    },
    "Chronicles of Narnia. The Magicians Nephew.txt": {
        "pre": 1250
    },
    "Chronicles of Narnia. The Silver Chair.txt": {
        "pre": 1275
    },
    "Chronicles of Narnia. The Voyage of the Dawn Treader.txt": {
        "pre": 1203
    },
    "Fantastic Beasts and Where to Find Them.txt": {
        "pre": 765
    },
    "Fantastic Beasts. The Crimes of Grindelwald.txt": {
        "pre": 761
    },
    "Fantastic Beasts. The Secrets of Dumbledore.txt": {
        "pre": 635
    },
    "Harry Potter and the Chamber of Secrets.txt": {
        "pre": 844
    },
    "Harry Potter and the Deathly Hallows Part 1.txt": {
        "pre": 796
    },
 

In [17]:
ps = PorterStemmer()

In [18]:
for file_name in documents:
    documents[file_name] = documents[file_name].lower()
    documents[file_name] = "".join([char for char in documents[file_name] if char not in string.punctuation])
    documents[file_name] = "".join([char for char in documents[file_name] if not char.isdigit()])
    documents[file_name] = " ".join([ps.stem(word) for word in word_tokenize(documents[file_name])])
    documents[file_name] = " ".join([word for word in word_tokenize(documents[file_name]) if word not in list(stopwords.words('english'))])
print(json.dumps(documents, indent=4, ensure_ascii=False))


{
    "Chronicles of Narnia. Prince Caspian.txt": "peter susan edmund luci pevensi magic whisk away british railway station beach near old ruin castl determin ruin cair paravel onc rule king queen narnia discov treasur vault peter sword shield susan bow arrow luci dagger bottl magic cordial store susan horn summon help miss left wood day return england prior visit narnia although onli year ha pass england year pass narniaa children rescu trumpkin dwarf soldier drown trumpkin tell children narnia histori sinc disappear telmarin conquer narnia rule king miraz hi wife queen prunaprismia miraz usurp throne kill hi brother king caspian ix father princ caspian miraz toler right heir princ caspian hi son wa born caspian escap miraz castl aid hi tutor doctor corneliu school lore old narnia gave queen susan horn caspian fled forest wa knock unconsci hi hors bolt awok den talk badger trufflehunt two dwarf nikabrik trumpkin accept caspian king badger dwarv took caspian meet mani creatur old narni

In [19]:
for file_name in documents:
    lengths[file_name]['post'] = len(word_tokenize(documents[file_name]))
print(json.dumps(lengths, indent=4, ensure_ascii=False))

{
    "Chronicles of Narnia. Prince Caspian.txt": {
        "pre": 657,
        "post": 339
    },
    "Chronicles of Narnia. The Horse and His Boy.txt": {
        "pre": 850,
        "post": 448
    },
    "Chronicles of Narnia. The Last Battle.txt": {
        "pre": 1101,
        "post": 562
    },
    "Chronicles of Narnia. The Lion, the Witch and the Wardrobe.txt": {
        "pre": 793,
        "post": 389
    },
    "Chronicles of Narnia. The Magicians Nephew.txt": {
        "pre": 1250,
        "post": 622
    },
    "Chronicles of Narnia. The Silver Chair.txt": {
        "pre": 1275,
        "post": 620
    },
    "Chronicles of Narnia. The Voyage of the Dawn Treader.txt": {
        "pre": 1203,
        "post": 595
    },
    "Fantastic Beasts and Where to Find Them.txt": {
        "pre": 765,
        "post": 416
    },
    "Fantastic Beasts. The Crimes of Grindelwald.txt": {
        "pre": 761,
        "post": 440
    },
    "Fantastic Beasts. The Secrets of Dumbledore.txt": {


In [20]:
lengths = pd.DataFrame.from_dict(lengths, 'index')

In [21]:
lengths['diff'] = lengths['pre'] - lengths['post']
lengths['pct'] = lengths['diff'] / lengths['pre']
lengths

Unnamed: 0,pre,post,diff,pct
Chronicles of Narnia. Prince Caspian.txt,657,339,318,0.484018
Chronicles of Narnia. The Horse and His Boy.txt,850,448,402,0.472941
Chronicles of Narnia. The Last Battle.txt,1101,562,539,0.489555
"Chronicles of Narnia. The Lion, the Witch and the Wardrobe.txt",793,389,404,0.509458
Chronicles of Narnia. The Magicians Nephew.txt,1250,622,628,0.5024
Chronicles of Narnia. The Silver Chair.txt,1275,620,655,0.513725
Chronicles of Narnia. The Voyage of the Dawn Treader.txt,1203,595,608,0.505403
Fantastic Beasts and Where to Find Them.txt,765,416,349,0.456209
Fantastic Beasts. The Crimes of Grindelwald.txt,761,440,321,0.421813
Fantastic Beasts. The Secrets of Dumbledore.txt,635,360,275,0.433071


### Create frequency matrix

In [22]:
docs = pd.DataFrame.from_dict(documents, orient='index')
docs.columns = ['content']
docs

Unnamed: 0,content
Chronicles of Narnia. Prince Caspian.txt,peter susan edmund luci pevensi magic whisk aw...
Chronicles of Narnia. The Horse and His Boy.txt,boy name shasta ha live life rememb southern p...
Chronicles of Narnia. The Last Battle.txt,western region narnia clever greedi ape shift ...
"Chronicles of Narnia. The Lion, the Witch and the Wardrobe.txt",peter susan edmund luci pevensi evacu london e...
Chronicles of Narnia. The Magicians Nephew.txt,stori begin london dure summer two children di...
Chronicles of Narnia. The Silver Chair.txt,eustac scrubb reform charact follow event voya...
Chronicles of Narnia. The Voyage of the Dawn Treader.txt,two youngest pevensi children luci edmund stay...
Fantastic Beasts and Where to Find Them.txt,british wizard magizoologist newton newt scama...
Fantastic Beasts. The Crimes of Grindelwald.txt,magic congress unit state america macusa trans...
Fantastic Beasts. The Secrets of Dumbledore.txt,albu dumbledor gellert grindelwald briefli mee...


In [23]:
cv = CountVectorizer()
matrix_tf = cv.fit_transform(docs['content'])
matrix_tf 

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 6019 stored elements and shape (23, 2503)>

In [24]:
sparsity_tf = 1-(matrix_tf.getnnz()/(matrix_tf.shape[0]*matrix_tf.shape[1]))
sparsity_tf

0.8954472024874498

In [25]:
tv = TfidfVectorizer()
matrix_tfidf = tv.fit_transform(docs['content'])
matrix_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6019 stored elements and shape (23, 2503)>

In [26]:
sparsity_tfidf = 1-(matrix_tfidf.getnnz()/(matrix_tfidf.shape[0]*matrix_tfidf.shape[1]))
sparsity_tfidf

0.8954472024874498

Directories for results

In [27]:
if not os.path.exists("./wodclouds"):
    os.mkdir("./wodclouds")
if not os.path.exists("./topic_modelling"):
    os.mkdir("./topic_modelling")
if not os.path.exists("./topic_modelling/topics"):
    os.mkdir("./topic_modelling/topics")
if not os.path.exists("./topic_modelling/documents"):
    os.mkdir("./topic_modelling/documents")
if not os.path.exists("./clustering"):
    os.mkdir("./clustering")
if not os.path.exists("./ngrams"):
    os.mkdir("./ngrams")

Worldclouds


Topic modeling

Clustering

N-grams