## 2.4 - Calculating Word Frequencies

#### Get the data

`$ python -m nltk.downloader movie_reviews`

In [1]:
from nltk.corpus import movie_reviews

In [5]:
# peek at the first ten file identifiers known to the corpus reader
movie_reviews.fileids()[:10]

[u'neg/cv000_29416.txt',
 u'neg/cv001_19502.txt',
 u'neg/cv002_17424.txt',
 u'neg/cv003_12683.txt',
 u'neg/cv004_12641.txt',
 u'neg/cv005_29357.txt',
 u'neg/cv006_17022.txt',
 u'neg/cv007_4992.txt',
 u'neg/cv008_29326.txt',
 u'neg/cv009_29417.txt']

In [6]:
# total number of files the corpus reader lists
len(movie_reviews.fileids())

2000

In [10]:
# check what kind of object fileids() returns
type(movie_reviews.fileids())

list

In [11]:
# number of files under each category, as a (pos, neg) pair
tuple(len(movie_reviews.fileids(label)) for label in ('pos', 'neg'))

(1000, 1000)

In [12]:
# one token sequence per file, grouped by category:
# movie_reviews.words is applied to every file id of the given category
pos_reviews = list(map(movie_reviews.words, movie_reviews.fileids('pos')))
neg_reviews = list(map(movie_reviews.words, movie_reviews.fileids('neg')))

#### Term frequencies

In [21]:
# Counter demo: it tallies how often each item of an iterable occurs
from collections import Counter

# a string iterates character by character, so the tally is per character
c = Counter('abcdefab')
top_three = c.most_common(3)
print(top_three)

# a list is tallied element by element
c = Counter(['abc', 'def', 'abc', 'ghi', 'abc'])
print(c)

# the elements of the list may be of mixed (heterogeneous) types
c = Counter([1, 2, 3, 1, 4, 2, 3, 4, 'a'])
print(c)

[('a', 2), ('b', 2), ('c', 1)]
Counter({'abc': 3, 'ghi': 1, 'def': 1})
Counter({1: 2, 2: 2, 3: 2, 4: 2, 'a': 1})


In [22]:
from collections import Counter

# pick a single positive review (index 10) and tally its tokens
one_review = pos_reviews[10]
c = Counter(one_review)

# the twenty most frequent tokens of that review
c.most_common(20)

[(u',', 45),
 (u'.', 44),
 (u'the', 44),
 (u'a', 39),
 (u'and', 20),
 (u'to', 17),
 (u'-', 17),
 (u'of', 17),
 (u'in', 16),
 (u'(', 15),
 (u')', 15),
 (u'"', 14),
 (u'is', 14),
 (u'for', 12),
 (u'that', 11),
 (u'with', 8),
 (u'his', 7),
 (u'it', 7),
 (u'i', 7),
 (u'he', 6)]

In [None]:
from nltk.corpus import stopwords
from string import punctuation

# English stop words plus each individual punctuation character.
# Only single-character punctuation tokens can match an entry of this list.
stop_list = stopwords.words('english') + list(punctuation)

# Set copy used for the membership tests below: a lookup in a set does not
# scan every entry the way `word not in stop_list` does on a list.
# stop_list itself stays a list so later cells can keep using it unchanged.
stop_set = set(stop_list)

# tokens of the sample review that are neither stop words nor punctuation
one_review_no_stop = [word for word in one_review if word not in stop_set]

c = Counter(one_review_no_stop)

# the twenty most frequent remaining tokens
c.most_common(20)

#### Frequencies across the whole collection

In [None]:
from itertools import chain

# Flatten the per-review token sequences into one long token list per class.
# chain.from_iterable iterates over the outer list directly instead of
# unpacking every review into a separate call argument as chain(*...) does.
all_positive = list(chain.from_iterable(pos_reviews))
all_negative = list(chain.from_iterable(neg_reviews))

# Display a short sample only: echoing the whole flattened list would flood
# the notebook output.
all_positive[:20]

In [None]:
# token counts accumulated over all positive reviews
total_freq = Counter()
total_freq.update(all_positive)

# the twenty most frequent tokens in the positive class
total_freq.most_common(20)

In [None]:
# Set copy of the stop list for the filters below: every token of the whole
# collection is tested for membership, and a set lookup avoids scanning the
# full list for each token.
stop_set = set(stop_list)

# token lists for each class with stop-list entries removed
all_positive_no_stop = [t for t in all_positive if t not in stop_set]
all_negative_no_stop = [t for t in all_negative if t not in stop_set]

total_freq_no_stop = Counter(all_positive_no_stop)

# the twenty most frequent remaining tokens in the positive class
total_freq_no_stop.most_common(20)

In [None]:
from nltk import FreqDist

# frequency distribution over every token of the positive reviews
# (stop words and punctuation included)
f = FreqDist(all_positive)

# the twenty most frequent tokens
f.most_common(20)

In [None]:
%matplotlib inline

f.plot(30)

In [None]:
# frequency distribution over the positive-review tokens left after
# filtering against the stop list
f = FreqDist(all_positive_no_stop)

# the twenty most frequent remaining tokens
f.most_common(20)

In [None]:
# f is the FreqDist assigned in the preceding cell (positive-review tokens
# with stop-list entries removed); the title labels the figure accordingly
f.plot(30, title='Positive reviews, stop words removed: 30 most frequent tokens')

In [None]:
# frequency distribution over every token of the negative reviews
# (stop words and punctuation included)
f = FreqDist(all_negative)

# plot the 30 most frequent tokens; the title tells this figure apart from
# the plots of the other token lists
f.plot(30, title='Negative reviews: 30 most frequent tokens')

In [None]:
# frequency distribution over the negative-review tokens left after
# filtering against the stop list
f = FreqDist(all_negative_no_stop)

# plot the 30 most frequent tokens; the title tells this figure apart from
# the plots of the other token lists
f.plot(30, title='Negative reviews, stop words removed: 30 most frequent tokens')

#### Zipf's Law

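Zipf's law states that the frequency of a word is roughly inversely proportional to its rank in the frequency table; compare this with the shape of the frequency plots above. See https://en.wikipedia.org/wiki/Zipf%27s_law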