In [3]:
import csv
import re
import glob
import math
import pickle
import json
from nltk.corpus import stopwords
import nltk
import zlib
from pprint import pprint
from collections import Counter

In [4]:
class TrackedWord:
    def __init__(self, word):
        self.word = word
        self.nearby = {}
        self.count = 1
    def __iadd__(self, other):
        self.count += other
        return self
    def addNear(self, N, word1):
        if N not in self.nearby:
            self.nearby[N] = Counter()
        if word1 != '':
            self.nearby[N][word1] += 1
    def near(self, N):
        '''Returns the words which are N away'''
        if N in self.nearby:
            return self.nearby[N]
        else:
            return Counter()
    def sumNear(self, N):
        '''Returns the sum of all words within the range 1-N'''
        if N in self.nearby:
            relevant = []
            for i in range(1, N+1):
                relevant.append(self.nearby[i])
            return reduce((lambda x, y: x + y), relevant)

In [5]:
class WordTrackModel:
    
    stops = set(stopwords.words("english")[:])
    english = set(json.load(open('data/english.json', 'r')))
    df = Counter()
    
    def __init__(self, name, f, data={}):
        self.name = name
        self.f = f
        self.text = self.loadfile(f)
        self.words = {}
        self.tracked = set([0])
        self.data = data
    
    def loadfile(self, f):
        lines = []
        text = []
        with open(f, mode='r') as infile:
            lines = infile.readlines()
        started = False
        ended = False
        for line in lines:
            if not started:
                if '*** START' in line or '***START' in line:
                    started = True
                continue
            if '*** END' in line or '***END' in line:
                break
            line = line.strip('\n')
            unicode(line, "ascii", errors="ignore")
            line =  re.sub("[^a-zA-Z]", " ", line)
            line = line.lower()
            text.extend(line.split())
        text = " ".join([w for w in text if w not in self.stops and w in self.english])
        self.df.update(set(text.split()))
        return zlib.compress(text)
    
    def wordlist(self):
        return zlib.decompress(self.text).split()
    
    def distTrack(self, N):
        words = self.wordlist()
        if N in self.tracked:
            return self
        for i, word in enumerate(words):
            if word not in self.words:
                self.words[word] = TrackedWord(word)
            for j in range(N, 0, -1):
                if j in self.tracked:
                    break
                wNback = words[i-j] if i-j >= 0 else ''
                wNfor = words[i+j] if i+j < len(words) else ''
                self.words[word].addNear(j, wNback)
                self.words[word].addNear(j, wNfor)
                self.words[word] += 1
        for i in range(N, 0, -1):
            if i not in self.tracked:
                self.tracked.add(i)
            else:
                break
        return self
                
    def near(self, word, N, most=0):
        if N not in self.tracked:
            self.distTrack(N)
        if word in self.words:
            return self.words[word].near(N) if most <= 0 else self.words[word].near(N).most_common(most)
        else:
            return Counter() if most <= 0 else []
        
    def sumNear(self, word, N, most=0):
        if N not in self.tracked:
            self.distTrack(N)
        if word in self.words:
            return self.words[word].sumNear(N) if most <= 0 else self.words[word].sumNear(N).most_common(most)
        else:
            return Counter() if most <= 0 else []
    def most_common(self, N=-1):
        if N > 0: return Counter(self.wordlist()).most_common(N)
        return Counter(self.wordlist()).most_common()

In [6]:
def getGutenbergMeta(f):
    data = {'title': 'xxx', 'author': 'xxx'}
    with open(f) as infile:
        for line in infile.readlines():
            if line.startswith('Title: '):
                data['title'] = unicode(line[len('Title: '):].strip('\n'), "ascii", errors="ignore")
            if line.startswith('Author: '):
                data['author'] = unicode(line[len('Author: '):].strip('\n'), "ascii", errors="ignore")
    return data

In [7]:
models = []
WordTrackModel.df = Counter() # This is to reset it for notebook's sake
for i, f in enumerate(glob.glob('data/*.txt')):
    data = getGutenbergMeta(f)
    print '['+str(i)+'] Currently processing ', data, '...'
    models.append(WordTrackModel(data['title'], f, data))

[0] Currently processing  {'author': u'James Joyce', 'title': u'Ulysses'} ...
[1] Currently processing  {'author': u'Oscar Wilde', 'title': u'The Importance of Being Earnest'} ...
[2] Currently processing  {'author': u'James M. Barrie', 'title': u'Peter Pan'} ...
[3] Currently processing  {'author': u'Rudyard Kipling', 'title': u'The Jungle Book'} ...
[4] Currently processing  {'author': u'Arthur Conan Doyle', 'title': u'The Return of Sherlock Holmes'} ...
[5] Currently processing  {'author': u'H. G. (Herbert George) Wells', 'title': u'The Time Machine'} ...
[6] Currently processing  {'author': u'Lucy Maud Montgomery', 'title': u'Anne of Green Gables'} ...
[7] Currently processing  {'author': u'L. Frank Baum', 'title': u'The Wonderful Wizard of Oz'} ...
[8] Currently processing  {'author': u'Mark Twain (Samuel Clemens)', 'title': u'The Adventures of Tom Sawyer, Complete'} ...
[9] Currently processing  {'author': u'Charles Dickens', 'title': u'A Christmas Carol'} ...
[10] Currently proc

In [8]:
for i, model in enumerate(models):
    print '['+str(i)+'] Currently processing ', model.data, '...'
    model.distTrack(3)

[0] Currently processing  {'author': u'James Joyce', 'title': u'Ulysses'} ...
[1] Currently processing  {'author': u'Oscar Wilde', 'title': u'The Importance of Being Earnest'} ...
[2] Currently processing  {'author': u'James M. Barrie', 'title': u'Peter Pan'} ...
[3] Currently processing  {'author': u'Rudyard Kipling', 'title': u'The Jungle Book'} ...
[4] Currently processing  {'author': u'Arthur Conan Doyle', 'title': u'The Return of Sherlock Holmes'} ...
[5] Currently processing  {'author': u'H. G. (Herbert George) Wells', 'title': u'The Time Machine'} ...
[6] Currently processing  {'author': u'Lucy Maud Montgomery', 'title': u'Anne of Green Gables'} ...
[7] Currently processing  {'author': u'L. Frank Baum', 'title': u'The Wonderful Wizard of Oz'} ...
[8] Currently processing  {'author': u'Mark Twain (Samuel Clemens)', 'title': u'The Adventures of Tom Sawyer, Complete'} ...
[9] Currently processing  {'author': u'Charles Dickens', 'title': u'A Christmas Carol'} ...
[10] Currently proc

In [9]:
clusters = {}
search = 'blood'
near = 2
maxnum = 10
for model in models:
    clusters[model.name] = model.near(search, near, maxnum)

In [10]:
counter = Counter()
clusters = {}
search = 'blood'
near = 2
maxnum = 10
for model in models:
    counter.update([w[0] for w in model.near(search, near, maxnum)])
    clusters[model.name] = model.near(search, near, maxnum)

In [11]:
def tf_idf(tf, df, n):
    return float(tf) * math.log(float(n)/df)

In [12]:
for model in models:
    pprint(model.name)
    pprint([w for w in sorted([(tf_idf(model.words[word].count, WordTrackModel.df[word], 40), word) for word in model.words], reverse=True)][:10])

u'Ulysses'
[(1794.1088392676168, 'bloom'),
 (610.0757953732937, 'j'),
 (561.693831439501, 'buck'),
 (554.317173405407, 'molly'),
 (546.5463719090694, 'lynch'),
 (534.8875208465207, 'corny'),
 (531.8854827020131, 'joe'),
 (482.43043766972187, 'ben'),
 (446.3867842806048, 'martin'),
 (425.3939828446667, 'paddy')]
u'The Importance of Being Earnest'
[(775.8946588504837, 'prism'),
 (569.1891319752583, 'chasuble'),
 (468.58863626681267, 'ernest'),
 (443.39664142887995, 'jack'),
 (78.10298841486718, 'miss'),
 (67.93291253006993, 'lady'),
 (62.46352283340588, 'lane'),
 (59.61065752815529, 'diary'),
 (51.29289136143595, 'cigarette'),
 (47.76641470867593, 'mamma')]
u'Peter Pan'
[(1189.7458839032238, 'peter'),
 (420.0150482105479, 'hook'),
 (382.22912543701165, 'tink'),
 (368.8879454113936, 'lagoon'),
 (172.6379186246152, 'tinker'),
 (170.60041871801462, 'pirate'),
 (166.85128838223616, 'darling'),
 (144.18111885132697, 'crocodile'),
 (128.63344965098494, 'john'),
 (114.35526307753203, 'liza')]
u

In [13]:
for title in clusters:
    print title
    for word in clusters[title]:
        print '\n\t', word

The Life and Adventures of Robinson Crusoe

	('shedding', 2)

	('flesh', 2)

	('chill', 2)

	('help', 1)

	('pulse', 1)

	('deliverance', 1)

	('human', 1)

	('go', 1)

	('beseeching', 1)

	('hole', 1)
The Iliad of Homer

	('stood', 5)

	('thus', 5)

	('thy', 5)

	('still', 4)

	('arms', 4)

	('hand', 4)

	('insatiate', 3)

	('ground', 3)

	('hector', 3)

	('good', 3)
The Adventures of Tom Sawyer, Complete

	('said', 3)

	('wart', 2)

	('blood', 2)

	('piece', 2)

	('summer', 1)

	('stick', 1)

	('win', 1)

	('church', 1)

	('potter', 1)

	('deep', 1)
The Importance of Being Earnest
Oliver Twist

	('upon', 3)

	('back', 2)

	('say', 2)

	('cold', 2)

	('lip', 2)

	('heart', 1)

	('walking', 1)

	('birth', 1)

	('burning', 1)

	('less', 1)
Adventures of Huckleberry Finn, Complete

	('picture', 1)

	('little', 1)

	('made', 1)

	('thrilling', 1)

	('name', 1)

	('pin', 1)

	('polly', 1)

	('journal', 1)

	('neck', 1)

	('n', 1)
The Return of Sherlock Holmes

	('said', 3)

	('lady', 2)

	

In [26]:
json.load(open('data/dictionary.json', 'r'))['houyhnhnm'.upper()]

u'One of the race of horses described by Swift in his imaginarytravels of Lemuel Gulliver. The Houyhnhnms were endowed with reasonand noble qualities; subject to them were Yahoos, a race of bruteshaving the form and all the worst vices of men.'

In [29]:
with open('data/'json.dumps([w for w in WordTrackModel.stops])

'["all", "just", "being", "over", "both", "through", "yourselves", "its", "before", "o", "hadn", "herself", "ll", "had", "should", "to", "only", "won", "under", "ours", "has", "do", "them", "his", "very", "they", "not", "during", "now", "him", "nor", "d", "did", "didn", "this", "she", "each", "further", "where", "few", "because", "doing", "some", "hasn", "are", "our", "ourselves", "out", "what", "for", "while", "re", "does", "above", "between", "mustn", "t", "be", "we", "who", "were", "here", "shouldn", "hers", "by", "on", "about", "couldn", "of", "against", "s", "isn", "or", "own", "into", "yourself", "down", "mightn", "wasn", "your", "from", "her", "their", "aren", "there", "been", "whom", "too", "wouldn", "themselves", "weren", "was", "until", "more", "himself", "that", "but", "don", "with", "than", "those", "he", "me", "myself", "ma", "these", "up", "will", "below", "ain", "can", "theirs", "my", "and", "ve", "then", "is", "am", "it", "doesn", "an", "as", "itself", "at", "have", "in