#### 1.	Download Alice in Wonderland by Lewis Carroll from Project Gutenberg's website http://www.gutenberg.org/files/11/11-0.txt


In [61]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Fetch the NLTK resources this notebook relies on: WordNet (lemmatizer),
# the stop-word lists, and the Punkt models (tokenizer).
for resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(resource)

# English stop words as a set, so membership checks are constant-time.
english_stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikita\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikita\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [62]:
# Heading markers in reading order, used to cut the book into chapters.
# "THE END" is the closing sentinel that bounds the last chapter.
CHAPTERS = [
    "CHAPTER " + numeral
    for numeral in (
        "I", "II", "III", "IV", "V", "VI",
        "VII", "VIII", "IX", "X", "XI", "XII",
    )
] + ["THE END"]

In [63]:
# Read the whole book into memory as one string.
with open('res/alice_main.txt', mode='r', encoding='utf8') as book_file:
    full_text = book_file.read()

#### 2.	Perform any necessary preprocessing on the text, including converting to lower case, removing stop words, numbers / non-alphabetic characters, lemmatization.

In [64]:
# Split the book on the CHAPTERS markers and normalise every chapter body
# into one space-separated string of lower-cased, purely alphabetic,
# stop-word-free, lemmatized tokens.
# Result: chapters_data is a list of (chapter_name, tokens) tuples.
#
# NOTE(review): punctuation is stripped before stop-word filtering, so
# contractions lose their apostrophe and no longer match the stop-word list
# (tokens such as "dont" / "im" survive, as the TF-IDF output shows).
lemmatizer = WordNetLemmatizer()  # built once instead of once per chapter

chapters_data = []
chapter_start = full_text.index(CHAPTERS[0])
for marker, next_marker in zip(CHAPTERS, CHAPTERS[1:]):
    # Search for the closing marker only past the opening one, so a stray
    # earlier occurrence can never produce an empty or inverted slice.
    chapter_end = full_text.index(next_marker, chapter_start + len(marker))
    chapter = full_text[chapter_start:chapter_end]
    chapter_start = chapter_end

    # Line 1 is the heading and becomes the chapter name; line 2 is discarded
    # (presumably the chapter title - confirm against the input file);
    # everything after it is the body.
    chapter_name, _skipped_line, chapter_body = chapter.split("\n", 2)

    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    chapter_body = re.sub(r'\s+', ' ', chapter_body.lower())
    # Keep letters and whitespace only (drops digits and punctuation).
    chapter_body = "".join(
        char for char in chapter_body if char.isspace() or char.isalpha()
    )

    # english_stopwords is a set, so each lookup is O(1) rather than a scan
    # of the stop-word list.
    tokens = " ".join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(chapter_body)
        if token not in english_stopwords
    )
    chapters_data.append((chapter_name, tokens))



#### 3.	Find the top 10 most important words in each chapter (for example, by TF-IDF score), excluding "Alice".

In [65]:
# Print each chapter heading followed by its preprocessed tokens.
for chapter_name, chapter_tokens in chapters_data:
    print(chapter_name, chapter_tokens)

CHAPTER I. alice beginning get tired sitting sister bank nothing twice peeped book sister reading picture conversation use book thought alice without picture conversation considering mind well could hot day made feel sleepy stupid whether pleasure making daisychain would worth trouble getting picking daisy suddenly white rabbit pink eye ran close nothing remarkable alice think much way hear rabbit say oh dear oh dear shall late thought afterwards occurred ought wondered time seemed quite natural rabbit actually took watch waistcoatpocket looked hurried alice started foot flashed across mind never seen rabbit either waistcoatpocket watch take burning curiosity ran across field fortunately time see pop large rabbithole hedge another moment went alice never considering world get rabbithole went straight like tunnel way dipped suddenly suddenly alice moment think stopping found falling deep well either well deep fell slowly plenty time went look wonder going happen next first tried look ma

In [66]:
# Print the ten highest-weighted terms of every chapter.
#
# NOTE(review): each chapter is fitted as a one-document corpus, so the IDF
# factor is identical for every term and the ranking is effectively by term
# frequency; fitting once on all chapters would give chapter-specific weights.
# NOTE(review): stop_words is a literal word list here, so 'english' is just
# another word to drop, not scikit-learn's built-in English stop-word list.
for chapter_name, chapter_tokens in chapters_data:
    vectorizer = TfidfVectorizer(stop_words=['alice', 'english'])
    weights = vectorizer.fit_transform([chapter_tokens])
    # Vocabulary column indices ordered from highest to lowest weight.
    ranked_ids = np.argsort(weights.toarray()).flatten()[::-1]
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back only on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = np.array(vectorizer.get_feature_names())
    print("{}: {}".format(chapter_name, " ".join(vocabulary[ranked_ids][:10])))

CHAPTER I.: little like think way see one door said could time
CHAPTER II.: little mouse im said go dear thing foot went oh
CHAPTER III.: said mouse dodo know one soon bird lory prize dry
CHAPTER IV.: little said one rabbit bill thought sure heard get window
CHAPTER V.: said caterpillar im serpent pigeon little well minute ive know
CHAPTER VI.: said cat like little duchess much footman mad know baby
CHAPTER VII.: said hatter dormouse march hare time know well thing one
CHAPTER VIII.: said queen head king three cat like one hedgehog went
CHAPTER IX.: said turtle mock gryphon duchess queen went dont never say
CHAPTER X.: said gryphon turtle mock would lobster dance beautiful soup voice
CHAPTER XI.: said king hatter court dormouse one witness queen rabbit thought
CHAPTER XII.: said king would queen little jury know rabbit white one


### How would you name each chapter according to the identified tokens?

1) way little door

2) go dear

3) mouse & bird lory

4) little rabbit

5) caterpillar serpent pigeon

6) cat like little duchess

7) hatter dormouse

8) queen

9) turtle mock gryphon

10) lobster dance

11) king hatter

12) little jury

#### 4.	Find the Top 10 most used verbs in sentences with Alice. What does Alice do most often?

In [67]:
# Flatten all chapters into one list of preprocessed tokens.
text = " ".join(chapter_tokens for _, chapter_tokens in chapters_data).split()

In [68]:
# Count the verbs that share an n-token sliding window with "alice" and
# print the ten most frequent ones.
#
# NOTE(review): `text` is the stop-word-free, lemmatized token stream, so a
# window only approximates "a sentence with Alice"; a verb is also counted
# once per overlapping window that contains both it and "alice".
nltk.download('averaged_perceptron_tagger')
n = 4  # window width, in tokens
verbs = {}
# "+ 1" so the final window, the one ending on the last token, is included.
for start in range(len(text) - n + 1):
    window = text[start:start + n]
    # Test for "alice" before tagging so windows without it never reach
    # the (comparatively expensive) POS tagger.
    if 'alice' not in window:
        continue
    for word, tag in nltk.pos_tag(window):
        # Verb tags in the tagger's tag set start with "V" (VB, VBD, ...).
        if tag.startswith("V"):
            verbs[word] = verbs.get(word, 0) + 1

# Ascending sort followed by a reversal (not reverse=True) keeps the same
# relative order among verbs with equal counts as before.
for entry in sorted(verbs.items(), key=lambda item: item[1])[::-1][:10]:
    print(entry)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nikita\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


('said', 474)
('thought', 99)
('went', 44)
('looked', 43)
('replied', 39)
('began', 36)
('got', 24)
('say', 22)
('felt', 22)
('know', 21)
