## Setup Libraries

In [223]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

import warnings
warnings.filterwarnings('ignore')
import urllib.request

import re

stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/affan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/affan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load Dataset

In [5]:
# download
urllib.request.urlretrieve('http://www.gutenberg.org/files/11/11-0.txt', 'alice.txt')

('alice.txt', <http.client.HTTPMessage at 0x7f9ae36bf7c0>)

### Split Content for Each Chapter
We want to extract only the contents for each chapter. After some quick look, we found a pattern that each chapter started by `CHAPTER XX` title. And in the end of all chapter there is `THE END`. We can use both of them as switch to start and stop writing chapter's content.
<br><br>
Note that here we iterate dataset line by line.

In [100]:
# start trigger
start = 'CHAPTER'
# stop trigger
stop = 'THE END'

# all chapters
all_chapters = []
# chapter story
chapters = []
# write switch
write, writenew = False, False

for line in open('./alice.txt', 'r', encoding='utf-8').readlines():

    # found 'THE END'
    if stop in line:
        # add the last chapter
        all_chapters.append(' '.join(chapters))
        break
    
    # found new 'CHAPTER'
    if start in line:
        
        # trigger to write
        write = True
        writenew = True

        # collect existing chapter
        if len(chapters) > 0:
            all_chapters.append(' '.join(chapters))            
        
        # reset chapters
        chapters = []

    # chapter's content switch writer
    else:
        writenew = False

    # write chapter's content
    if write:
        chapters.append(line)    

In [110]:
print(all_chapters[12:][0][:300])

CHAPTER I.
 Down the Rabbit-Hole
 
 
 Alice was beginning to get very tired of sitting by her sister on the
 bank, and of having nothing to do: once or twice she had peeped into
 the book her sister was reading, but it had no pictures or
 conversations in it, “and what is the use of a book,” thought


Since we also captured `CHAPTER XX` in table of contents, we need to discard it and leave only the contents.

In [111]:
# exclude 12 first record
all_chapters = all_chapters[12:]

## Preprocessing
Remove:
- Punctuation
- Extra whitespaces
- Non-Ascii
- Number

Preprocess:
- Case folding
- Tokenize
- Stopword removal
- Pos Tagging
- Lemmatize

In [114]:
# cleaner
def rm_punct(text):
    return re.sub(r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]', ' ', text)

def rm_whitespaces(text):
    return re.sub(r' +', ' ', text)

def rm_nonascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)

def rm_number(text):
    return re.sub(r'\d+', '', text)

def clean_pipeline(text):
    no_punct = rm_punct(text)
    no_number = rm_number(no_punct)
    no_whitespaces = rm_whitespaces(no_number)
    no_nonascii = rm_nonascii(no_whitespaces)
    return no_nonascii

In [313]:
# preprocesser
def casefold(text):
    return text.lower()

def tokenize(text):
    return word_tokenize(text)

def rm_stopwords(text):
    return [i for i in text if i not in stopwords]

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()    
    lemmas = [lemmatizer.lemmatize(t) for t in text]
    # make sure lemmas does not contains sotpwords
    return rm_stopwords(lemmas)

def preprocess_pipeline(text, join=True):
    tokens = tokenize(casefold(text))
    no_stopwords = rm_stopwords(tokens)
    lemmas = lemmatize(no_stopwords)

    return ' '.join(lemmas) if join else lemmas

In [340]:
clean_txt = [clean_pipeline(chapter) for chapter in all_chapters]
clean_txt = [preprocess_pipeline(chapter) for chapter in clean_txt]

clean_txt[0][:300]

'chapter rabbit hole alice beginning get tired sitting sister bank nothing twice peeped book sister reading picture conversation use book thought alice without picture conversation considering mind well could hot day made feel sleepy stupid whether pleasure making daisy chain would worth trouble gett'

## TF-IDF

In this task, we asked to find the top 10 important words from each chapter that is not `Alice` word.
<br><br>
So, first we need to remove `Alice` from all chapters.

In [203]:
# no alice
no_alice = [' '.join([word for word in row.split(' ') if word.lower() != 'alice']) for row in clean_txt]

In [204]:
tfidf = TfidfVectorizer()
vec = tfidf.fit_transform(no_alice).toarray()

df = pd.DataFrame(vec, columns=tfidf.get_feature_names())
df

Unnamed: 0,abide,able,absence,absurd,acceptance,accident,accidentally,account,accounting,accusation,...,youall,youare,youd,youll,young,youre,youth,youve,zealand,zigzag
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.013184,0.0,0.0,0.032442,0.0
1,0.0,0.031073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.031073,0.032146,0.021159,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.026332,0.030661,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.020878,0.024921,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025765,0.0,0.013328,0.0,0.0,0.010471,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.04785,0.01595,0.057114,0.140543,0.014463,0.0,0.023424
5,0.022229,0.0,0.0,0.01909,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015136,0.0,0.045167,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015728,0.0,0.0,...,0.0,0.0,0.018946,0.01247,0.01247,0.007442,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018725,0.0,0.0,...,0.0,0.0,0.022557,0.0,0.0,0.0,0.0,0.013463,0.0,0.0
8,0.0,0.0,0.0203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.010501,0.0,0.027646,0.016499,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017609,0.0,...,0.0,0.0,0.009109,0.0,0.0,0.007156,0.0,0.021745,0.0,0.0


### Top 10 Most Important Words from each Chapter

In [221]:
N = 10

for i, row in df.iterrows():
    top_10 = ', '.join(row.sort_values(ascending=False)[:N].index.tolist())
    print(f'Chapter {i+1}: {top_10}')        

Chapter 1: little, bat, door, rabbit, key, way, eat, hole, think, like
Chapter 2: mouse, pool, little, im, swam, cat, dear, said, foot, mabel
Chapter 3: said, mouse, dodo, race, prize, lory, dry, thimble, know, bird
Chapter 4: bill, little, window, rabbit, puppy, chimney, glove, bottle, fan, said
Chapter 5: caterpillar, said, pigeon, serpent, im, youth, egg, size, father, little
Chapter 6: said, footman, cat, baby, mad, duchess, wow, like, pig, cook
Chapter 7: hatter, dormouse, said, hare, march, tea, twinkle, time, draw, treacle
Chapter 8: queen, said, hedgehog, king, gardener, soldier, cat, five, rose, executioner
Chapter 9: turtle, said, mock, gryphon, duchess, moral, queen, went, school, say
Chapter 10: turtle, mock, gryphon, said, lobster, dance, soup, beautiful, join, whiting
Chapter 11: king, hatter, said, court, dormouse, witness, jury, queen, juror, officer
Chapter 12: said, king, jury, queen, sister, slate, dream, would, rabbit, fit


## What Alice do most often?

In [334]:
verbs_tag = ['VB', 'VBG', 'VBD', 'VBN', 'VBP', 'VBZ']
all_verbs = []
lemmatizer = WordNetLemmatizer()

for chapter in all_chapters:
    # split sentences for each chapter
    sent = sent_tokenize(casefold(chapter))

    # iterate for tokenized sentences
    for s in sent:
        # only process if sentence contains 'alice'
        if 'alice' in s:
            # clean sentence
            s_clean = clean_pipeline(s)
            # preprocess sentence
            s_prep = preprocess_pipeline(s_clean, join=False)            
            # pos tagging sentence
            s_tags = nltk.pos_tag(s_prep)
            # filter verbs based on predefined tags
            s_verbs = [verb for verb,tag in s_tags if tag in verbs_tag]
            # lemmatize filtered verbs
            s_lverbs = [lemmatizer.lemmatize(v, 'v') for v in s_verbs]
            # save lemmatized verbs
            all_verbs.append(s_lverbs)    

# check results
all_verbs[0][:5]

['begin', 'get', 'sit', 'peep', 'read']

In [339]:
from collections import Counter

# flatten
flatten_verbs = [x for l in all_verbs for x in l]

# get top 10
top10 = Counter(flatten_verbs).most_common(10)

print('Top 10 Alice do most often:')
for verb, freq in top10:
    print(f'{verb}, {freq} times.')

Top 10 Alice do most often:
say, 295 times.
go, 91 times.
think, 59 times.
get, 57 times.
look, 48 times.
come, 43 times.
know, 42 times.
begin, 41 times.
see, 33 times.
make, 32 times.
