# Text Analysis Using NLTK

## 1. Reading Sample Data

In [62]:
import nltk
from nltk.corpus import gutenberg
gutenberg_files = gutenberg.fileids()
gutenberg_files

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [63]:
gutenberg_doc = gutenberg.open('shakespeare-hamlet.txt').read()
gutenberg_doc



## 2. Tokenize the Text

### Alphabetical list of part-of-speech tags used in the Penn Treebank Project:
https://www.cis.upenn.edu/~treebank/

```
CC Coordinating conjunction
CD Cardinal number
DT Determiner
EX Existential there
FW Foreign word
IN Preposition or subordinating conjunction
JJ Adjective
JJR Adjective, comparative
JJS Adjective, superlative
LS List item marker
MD Modal
NN Noun, singular or mass
NNS Noun, plural
NNP Proper noun, singular
NNPS Proper noun, plural
PDT Predeterminer
POS Possessive ending
PRP Personal pronoun
PRP$ Possessive pronoun
RB Adverb
RBR Adverb, comparative
RBS Adverb, superlative
RP Particle
SYM Symbol
TO to
UH Interjection
VB Verb, base form
VBD Verb, past tense
VBG Verb, gerund or present participle
VBN Verb, past participle
VBP Verb, non­3rd person singular present
VBZ Verb, 3rd person singular present
WDT Wh­determiner
WP Wh­pronoun
WP$ Possessive wh­pronoun
WRB Wh­adverb
```

#### gutenberg_doc을 tokenize하세요

In [64]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

gutenberg_tokens = nltk.word_tokenize(gutenberg_doc)
gutenberg_tagged = nltk.pos_tag(gutenberg_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mobis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mobis\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [65]:
gutenberg_tokens

['[',
 'The',
 'Tragedie',
 'of',
 'Hamlet',
 'by',
 'William',
 'Shakespeare',
 '1599',
 ']',
 'Actus',
 'Primus',
 '.',
 'Scoena',
 'Prima',
 '.',
 'Enter',
 'Barnardo',
 'and',
 'Francisco',
 'two',
 'Centinels',
 '.',
 'Barnardo',
 '.',
 'Who',
 "'s",
 'there',
 '?',
 'Fran',
 '.',
 'Nay',
 'answer',
 'me',
 ':',
 'Stand',
 '&',
 'vnfold',
 'your',
 'selfe',
 'Bar',
 '.',
 'Long',
 'liue',
 'the',
 'King',
 'Fran',
 '.',
 'Barnardo',
 '?',
 'Bar',
 '.',
 'He',
 'Fran',
 '.',
 'You',
 'come',
 'most',
 'carefully',
 'vpon',
 'your',
 'houre',
 'Bar',
 '.',
 "'T",
 'is',
 'now',
 'strook',
 'twelue',
 ',',
 'get',
 'thee',
 'to',
 'bed',
 'Francisco',
 'Fran',
 '.',
 'For',
 'this',
 'releefe',
 'much',
 'thankes',
 ':',
 "'T",
 'is',
 'bitter',
 'cold',
 ',',
 'And',
 'I',
 'am',
 'sicke',
 'at',
 'heart',
 'Barn',
 '.',
 'Haue',
 'you',
 'had',
 'quiet',
 'Guard',
 '?',
 'Fran',
 '.',
 'Not',
 'a',
 'Mouse',
 'stirring',
 'Barn',
 '.',
 'Well',
 ',',
 'goodnight',
 '.',
 'If',
 'yo

In [66]:
gutenberg_tagged

[('[', 'IN'),
 ('The', 'DT'),
 ('Tragedie', 'NNP'),
 ('of', 'IN'),
 ('Hamlet', 'NNP'),
 ('by', 'IN'),
 ('William', 'NNP'),
 ('Shakespeare', 'NNP'),
 ('1599', 'CD'),
 (']', 'NNP'),
 ('Actus', 'NNP'),
 ('Primus', 'NNP'),
 ('.', '.'),
 ('Scoena', 'NNP'),
 ('Prima', 'NNP'),
 ('.', '.'),
 ('Enter', 'NNP'),
 ('Barnardo', 'NNP'),
 ('and', 'CC'),
 ('Francisco', 'NNP'),
 ('two', 'CD'),
 ('Centinels', 'NNP'),
 ('.', '.'),
 ('Barnardo', 'NNP'),
 ('.', '.'),
 ('Who', 'WP'),
 ("'s", 'VBZ'),
 ('there', 'RB'),
 ('?', '.'),
 ('Fran', 'NNP'),
 ('.', '.'),
 ('Nay', 'NNP'),
 ('answer', 'IN'),
 ('me', 'PRP'),
 (':', ':'),
 ('Stand', 'NNP'),
 ('&', 'CC'),
 ('vnfold', 'VB'),
 ('your', 'PRP$'),
 ('selfe', 'JJ'),
 ('Bar', 'NNP'),
 ('.', '.'),
 ('Long', 'NNP'),
 ('liue', 'VBD'),
 ('the', 'DT'),
 ('King', 'NNP'),
 ('Fran', 'NNP'),
 ('.', '.'),
 ('Barnardo', 'NNP'),
 ('?', '.'),
 ('Bar', 'NNP'),
 ('.', '.'),
 ('He', 'PRP'),
 ('Fran', 'NNP'),
 ('.', '.'),
 ('You', 'PRP'),
 ('come', 'VBP'),
 ('most', 'RBS'),
 ('ca

### Lemmatization

In [67]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mobis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### tokenize한 것을 lemmatize하세요

In [68]:
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

gutenberg_lemma = []
for token in gutenberg_tokens : 
    gutenberg_lemma.append(lemma.lemmatize(token))
    
gutenberg_lemma

['[',
 'The',
 'Tragedie',
 'of',
 'Hamlet',
 'by',
 'William',
 'Shakespeare',
 '1599',
 ']',
 'Actus',
 'Primus',
 '.',
 'Scoena',
 'Prima',
 '.',
 'Enter',
 'Barnardo',
 'and',
 'Francisco',
 'two',
 'Centinels',
 '.',
 'Barnardo',
 '.',
 'Who',
 "'s",
 'there',
 '?',
 'Fran',
 '.',
 'Nay',
 'answer',
 'me',
 ':',
 'Stand',
 '&',
 'vnfold',
 'your',
 'selfe',
 'Bar',
 '.',
 'Long',
 'liue',
 'the',
 'King',
 'Fran',
 '.',
 'Barnardo',
 '?',
 'Bar',
 '.',
 'He',
 'Fran',
 '.',
 'You',
 'come',
 'most',
 'carefully',
 'vpon',
 'your',
 'houre',
 'Bar',
 '.',
 "'T",
 'is',
 'now',
 'strook',
 'twelue',
 ',',
 'get',
 'thee',
 'to',
 'bed',
 'Francisco',
 'Fran',
 '.',
 'For',
 'this',
 'releefe',
 'much',
 'thankes',
 ':',
 "'T",
 'is',
 'bitter',
 'cold',
 ',',
 'And',
 'I',
 'am',
 'sicke',
 'at',
 'heart',
 'Barn',
 '.',
 'Haue',
 'you',
 'had',
 'quiet',
 'Guard',
 '?',
 'Fran',
 '.',
 'Not',
 'a',
 'Mouse',
 'stirring',
 'Barn',
 '.',
 'Well',
 ',',
 'goodnight',
 '.',
 'If',
 'yo

#### lemmatize한 것을 품사 tagging하세요

In [69]:
gutenberg_lemma_tagged = nltk.pos_tag(gutenberg_lemma)
gutenberg_lemma_tagged

[('[', 'IN'),
 ('The', 'DT'),
 ('Tragedie', 'NNP'),
 ('of', 'IN'),
 ('Hamlet', 'NNP'),
 ('by', 'IN'),
 ('William', 'NNP'),
 ('Shakespeare', 'NNP'),
 ('1599', 'CD'),
 (']', 'NNP'),
 ('Actus', 'NNP'),
 ('Primus', 'NNP'),
 ('.', '.'),
 ('Scoena', 'NNP'),
 ('Prima', 'NNP'),
 ('.', '.'),
 ('Enter', 'NNP'),
 ('Barnardo', 'NNP'),
 ('and', 'CC'),
 ('Francisco', 'NNP'),
 ('two', 'CD'),
 ('Centinels', 'NNP'),
 ('.', '.'),
 ('Barnardo', 'NNP'),
 ('.', '.'),
 ('Who', 'WP'),
 ("'s", 'VBZ'),
 ('there', 'RB'),
 ('?', '.'),
 ('Fran', 'NNP'),
 ('.', '.'),
 ('Nay', 'NNP'),
 ('answer', 'IN'),
 ('me', 'PRP'),
 (':', ':'),
 ('Stand', 'NNP'),
 ('&', 'CC'),
 ('vnfold', 'VB'),
 ('your', 'PRP$'),
 ('selfe', 'JJ'),
 ('Bar', 'NNP'),
 ('.', '.'),
 ('Long', 'NNP'),
 ('liue', 'VBD'),
 ('the', 'DT'),
 ('King', 'NNP'),
 ('Fran', 'NNP'),
 ('.', '.'),
 ('Barnardo', 'NNP'),
 ('?', '.'),
 ('Bar', 'NNP'),
 ('.', '.'),
 ('He', 'PRP'),
 ('Fran', 'NNP'),
 ('.', '.'),
 ('You', 'PRP'),
 ('come', 'VBP'),
 ('most', 'RBS'),
 ('ca

## 4. Removing Stopwords

#### stopwords를 제거하세요

In [70]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [71]:
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
stop_words

{'!',
 '"',
 "'",
 '(',
 ')',
 ',',
 '.',
 ':',
 ';',
 '?',
 '[',
 ']',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or

In [72]:
filtered_words = []
for word in gutenberg_lemma : 
    if word not in stop_words : 
        filtered_words.append(word)
        
filtered_words 

['The',
 'Tragedie',
 'Hamlet',
 'William',
 'Shakespeare',
 '1599',
 'Actus',
 'Primus',
 'Scoena',
 'Prima',
 'Enter',
 'Barnardo',
 'Francisco',
 'two',
 'Centinels',
 'Barnardo',
 'Who',
 "'s",
 'Fran',
 'Nay',
 'answer',
 'Stand',
 '&',
 'vnfold',
 'selfe',
 'Bar',
 'Long',
 'liue',
 'King',
 'Fran',
 'Barnardo',
 'Bar',
 'He',
 'Fran',
 'You',
 'come',
 'carefully',
 'vpon',
 'houre',
 'Bar',
 "'T",
 'strook',
 'twelue',
 'get',
 'thee',
 'bed',
 'Francisco',
 'Fran',
 'For',
 'releefe',
 'much',
 'thankes',
 "'T",
 'bitter',
 'cold',
 'And',
 'I',
 'sicke',
 'heart',
 'Barn',
 'Haue',
 'quiet',
 'Guard',
 'Fran',
 'Not',
 'Mouse',
 'stirring',
 'Barn',
 'Well',
 'goodnight',
 'If',
 'meet',
 'Horatio',
 'Marcellus',
 'Riuals',
 'Watch',
 'bid',
 'make',
 'hast',
 'Enter',
 'Horatio',
 'Marcellus',
 'Fran',
 'I',
 'thinke',
 'I',
 'heare',
 'Stand',
 "'s",
 'Hor',
 'Friends',
 'ground',
 'Mar',
 'And',
 'Leige-men',
 'Dane',
 'Fran',
 'Giue',
 'good',
 'night',
 'Mar',
 'O',
 'fa

## 실습 1

* 단어별로 카운트를 하여 가장 많이 사용된 순서로 정렬
* (참고) https://docs.python.org/3/library/collections.html#collections.Counter.most_common

In [73]:
from collections import Counter

counts = Counter(filtered_words)
counts.most_common()

[('I', 550),
 ('Ham', 337),
 ('And', 257),
 ('Lord', 211),
 ("'d", 200),
 ('King', 172),
 ('haue', 158),
 ('The', 133),
 ('That', 120),
 ("'s", 119),
 ('To', 109),
 ('shall', 104),
 ('But', 103),
 ('Hamlet', 99),
 ('Hor', 95),
 ('What', 95),
 ('come', 92),
 ('thou', 90),
 ('thy', 83),
 ('Enter', 82),
 ('wa', 79),
 ('For', 76),
 ('good', 76),
 ('Oh', 76),
 ('like', 73),
 ('know', 68),
 ('My', 67),
 ('selfe', 65),
 ('let', 65),
 ('It', 65),
 ('v', 64),
 ('As', 63),
 ('A', 62),
 ('Qu', 62),
 ('doe', 61),
 ('Laer', 60),
 ('make', 59),
 ('well', 59),
 ('thee', 58),
 ('Ile', 58),
 ('may', 56),
 ('would', 56),
 ('Ophe', 56),
 ('How', 55),
 ('must', 53),
 ('hath', 49),
 ('Pol', 49),
 ('You', 48),
 ('speake', 48),
 ('go', 48),
 ('If', 47),
 ('Why', 47),
 ('man', 47),
 ('Sir', 47),
 ('say', 46),
 ('Father', 46),
 ('No', 46),
 ('loue', 45),
 ('time', 44),
 ('So', 44),
 ('This', 44),
 ('Rosin', 43),
 ('He', 42),
 ('much', 42),
 ('With', 42),
 ('one', 42),
 ('heere', 42),
 ('Let', 42),
 ('see', 42)

## 실습 2
* 동사의 유니크리스트를 만들어 보세요

In [74]:
gutenberg_lemma_verbs = set()
for word, tagger in gutenberg_lemma_tagged:
    if tagger in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        gutenberg_lemma_verbs.add(word)
        
gutenberg_lemma_verbs

{'expell',
 'sodaine',
 'hedge',
 'fight',
 'rent',
 'rotten',
 'swift',
 'Miching',
 'done',
 'Seeming',
 'lasting',
 'pester',
 'lay',
 'cracke',
 'euer',
 'crye',
 'assaid',
 'commanded',
 'giue',
 'Larded',
 'Comrade',
 'foreknowing',
 'astonish',
 'scape',
 'speaks',
 'heard',
 'whet',
 'depart',
 'resolue',
 'i',
 'filme',
 'outface',
 'am',
 'allowed',
 'broke',
 'shooke',
 'Folded',
 'fearing',
 'withdrew',
 'swallowed',
 'Ophe',
 'reforme',
 'peruse',
 'fretted',
 "'Tis",
 'watcht',
 'please',
 'padling',
 'casuall',
 'proue',
 'vnsmirched',
 'greeued',
 'shriuing',
 'reform',
 'changeling',
 'disclaiming',
 'wrinkled',
 'are',
 'poure',
 'chide',
 'iowles',
 "circumscrib'd",
 'divulging',
 'Seed',
 'staid',
 'bedded',
 'spill',
 'couched',
 'threaten',
 'witching',
 'writ',
 'suffred',
 'list',
 'sowing',
 'whine',
 'speakes',
 'wrote',
 'hees',
 'muddy',
 'intreate',
 'ended',
 'Making',
 'flaming',
 'husband',
 're-word',
 'sends',
 'Flourish',
 'vnderstand',
 'prouoke',
 '

# Topic Modeling
## LDA 실습

In [90]:
import pandas as pd

data = pd.read_csv('data/imdb_movie_review.csv', header=None)

In [91]:
data.head()

Unnamed: 0,0
0,"Here is one film that lived up to its hype, an..."
1,"To my surprise, it did not disappoint."
2,"Now, several years later after having watched ..."
3,It is a truly spectacular adventure story all ...
4,"The following two films were very good, to be ..."


In [92]:
data.columns = ['review']

In [93]:
data.head()

Unnamed: 0,review
0,"Here is one film that lived up to its hype, an..."
1,"To my surprise, it did not disappoint."
2,"Now, several years later after having watched ..."
3,It is a truly spectacular adventure story all ...
4,"The following two films were very good, to be ..."


In [94]:
len(data)

55652

### tokenization

In [95]:
for i in range(len(data)) : 
    data.review[i] = nltk.word_tokenize(data.review[i])
    
data.head()

Unnamed: 0,review
0,"[Here, is, one, film, that, lived, up, to, its..."
1,"[To, my, surprise, ,, it, did, not, disappoint..."
2,"[Now, ,, several, years, later, after, having,..."
3,"[It, is, a, truly, spectacular, adventure, sto..."
4,"[The, following, two, films, were, very, good,..."


### lemmatization

In [96]:
li = []
for i in range(len(data)) : 
    for token in data.review[i] : 
        li.append(lemma.lemmatize(token))
        
    data.review[i] = li
    li = []
    
data.head()

Unnamed: 0,review
0,"[Here, is, one, film, that, lived, up, to, it,..."
1,"[To, my, surprise, ,, it, did, not, disappoint..."
2,"[Now, ,, several, year, later, after, having, ..."
3,"[It, is, a, truly, spectacular, adventure, sto..."
4,"[The, following, two, film, were, very, good, ..."


### stopwords

In [97]:
stop = stopwords.words('english')

filtered_words = []

for i in range(len(data)) : 
    for word in data.review[i] : 
        if word not in stop : 
            filtered_words.append(word)
            
    data.review[i] = filtered_words
    filtered_words = []
    
data.head()

Unnamed: 0,review
0,"[Here, one, film, lived, hype, ,, time, I, saw..."
1,"[To, surprise, ,, disappoint, .]"
2,"[Now, ,, several, year, later, watched, three,..."
3,"[It, truly, spectacular, adventure, story, way..."
4,"[The, following, two, film, good, ,, sure, ,, ..."


In [99]:
processed_docs = data.review
processed_docs.head()

0    [Here, one, film, lived, hype, ,, time, I, saw...
1                     [To, surprise, ,, disappoint, .]
2    [Now, ,, several, year, later, watched, three,...
3    [It, truly, spectacular, adventure, story, way...
4    [The, following, two, film, good, ,, sure, ,, ...
Name: review, dtype: object

### BoW

In [101]:
import gensim
from gensim import corpora

In [106]:
dictionary = corpora.Dictionary(processed_docs[:10])
corpus = [dictionary.doc2bow(processed_doc) for processed_doc in processed_docs[0:10]]
print(dictionary)
print(corpus)

Dictionary(115 unique tokens: [',', '.', 'Here', 'I', 'arrived']...)
[[(0, 4), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)], [(0, 1), (1, 1), (22, 1), (23, 1), (24, 1)], [(0, 2), (1, 1), (3, 1), (7, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1)], [(0, 1), (1, 1), (7, 1), (29, 1), (30, 1), (36, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1)], [(0, 2), (1, 1), (7, 1), (30, 1), (51, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1)], [(0, 2), (1, 1), (30, 1), (32, 2), (43, 1), (54, 1), (57, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 2), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1)

### LDA

In [107]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary)

In [108]:
results = pd.DataFrame(columns=["Topic", "Words"])

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
    results.append({'Topic': idx, 'Words': topic}, ignore_index=True)

Topic: 0 
Words: 0.051*"." + 0.051*"generated" + 0.051*"beyond" + 0.051*"description" + 0.051*"care" + 0.051*"computer-" + 0.051*"," + 0.051*"n't" + 0.051*"I" + 0.051*"Some"
Topic: 1 
Words: 0.009*"," + 0.009*"." + 0.009*"I" + 0.009*"film" + 0.009*"n't" + 0.009*"disappoint" + 0.009*"''" + 0.009*"three" + 0.009*"To" + 0.009*"first"
Topic: 2 
Words: 0.072*"," + 0.045*"." + 0.044*"three" + 0.041*"film" + 0.041*"hour" + 0.034*"jaw-dropping" + 0.034*"I" + 0.034*"recall" + 0.034*"one" + 0.034*"many"
Topic: 3 
Words: 0.009*"," + 0.009*"." + 0.009*"film" + 0.009*"I" + 0.009*"three" + 0.009*"To" + 0.009*"first" + 0.009*"n't" + 0.009*"''" + 0.009*"disappoint"
Topic: 4 
Words: 0.051*"film" + 0.051*"." + 0.051*"," + 0.035*"``" + 0.035*"''" + 0.035*"three" + 0.018*"I" + 0.018*"episode" + 0.018*"Fellowship" + 0.018*"watch"
Topic: 5 
Words: 0.098*"," + 0.051*"Fellowship" + 0.051*"``" + 0.051*"stunned" + 0.051*"visuals" + 0.051*"Anyway" + 0.051*"alone" + 0.051*"''" + 0.051*"." + 0.005*"I"
Topic: 6 
Wo

## Word2Vec

In [32]:
import nltk
import pandas as pd

In [33]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\mobis\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [36]:
from nltk.corpus import gutenberg

sentences = [list(s) for s in gutenberg.sents()]

In [48]:
sentences[100]

['You',
 'do',
 'not',
 'think',
 'I',
 'could',
 'mean',
 '_you_',
 ',',
 'or',
 'suppose',
 'Mr',
 '.',
 'Knightley',
 'to',
 'mean',
 '_you_',
 '.']

In [43]:
from gensim.models.word2vec import Word2Vec

In [44]:
model = Word2Vec(sentences)

In [45]:
model.wv.similarity('Jane','Emma')

0.72407174

In [49]:
model.wv.similarity('he','she')

0.592259

In [51]:
model.wv.most_similar("Jane")

[('Harriet', 0.864632785320282),
 ('Miss', 0.8424371480941772),
 ('Colonel', 0.8113393187522888),
 ('Charles', 0.7796239256858826),
 ('Lady', 0.7731125950813293),
 ('Edward', 0.7527492046356201),
 ('Louisa', 0.7490370273590088),
 ('Mary', 0.7317168116569519),
 ('Captain', 0.7260234355926514),
 ('Emma', 0.724071741104126)]

In [53]:
model.wv.most_similar(positive=['she','Emma'], negative='Jane' ,topn=1)

[('Anne', 0.47754430770874023)]

## Word Cloud