In [284]:
import csv
import re
import glob
import math
import pickle
from stops import stopwords
from collections import Counter

In [99]:
class TrackedWord:
    def __init__(self, word):
        self.word = word
        self.nearby = {}
        self.count = 1
    def __iadd__(self, other):
        self.count += other
        return self
    def addNear(self, N, word1):
        if N not in self.nearby:
            self.nearby[N] = Counter()
        if word1 != '':
            self.nearby[N][word1] += 1
    def near(self, N):
        '''Returns the words which are N away'''
        if N in self.nearby:
            return self.nearby[N]
        else:
            return Counter()
    def sumNear(self, N):
        '''Returns the sum of all words within the range 1-N'''
        if N in self.nearby:
            relevant = []
            for i in range(1, N+1):
                relevant.append(self.nearby[i])
            return reduce((lambda x, y: x + y), relevant)

In [307]:
class WordTrackModel:
    def __init__(self, name, f, data={}):
        self.name = name
        self.f = f
        self.text = self.loadfile(f)
        self.words = {}
        self.tracked = [0]
        self.data = data
    
    def loadfile(self, f):
        lines = []
        text = ''
        with open(f, mode='r') as infile:
            started = False
            ended = False
            for line in infile.readlines():
                if not started:
                    if '*** START' in line or '***START' in line:
                        started = True
                        continue
                    else:
                        continue
                if '*** END' in line or '***END' in line:
                    return text
                newline = ''
                for word in line.split():
                    if word not in stopwords:
                        newline += word + ' '
                text += self.cleanline(newline)
        return text
        
    def cleanline(self, line):
        line = line.strip('\n').strip(' ')
        line = re.sub('[.,?!#$\'\"\(\)\d\*_;]', '', line)
        line = line.replace('--', ' ')
        line = unicode(line, "ascii", errors="ignore")
        return line.lower()
    
    def distTrack(self, N):
        blocksplit = self.text.split()
        if N in self.tracked:
            return self
        for i, word in enumerate(blocksplit):
            if word not in self.words:
                self.words[word] = TrackedWord(word)
            for j in range(N, 0, -1):
                if j in self.tracked:
                    break
                wNback = blocksplit[i-j] if i-j >= 0 else ''
                wNfor = blocksplit[i+j] if i+j < len(blocksplit) else ''
                self.words[word].addNear(j, wNback)
                self.words[word].addNear(j, wNfor)
                self.words[word] += 1
        for i in range(N, 0, -1):
            if i not in self.tracked:
                self.tracked.append(i)
            else:
                break
        return self
                
    def getNear(self, word, N, most=0):
        if N not in self.tracked:
            self.distTrack(N)
        if word in self.words:
            return self.words[word].near(N) if most <= 0 else self.words[word].sumNear(N).most_common(most)
        else:
            return Counter() if most <= 0 else []
        
    def getSumNear(self, word, N, most=0):
        if N not in self.tracked:
            self.distTrack(N)
        if word in self.words:
            return self.words[word].sumNear(N) if most <= 0 else self.words[word].sumNear(N).most_common(most)
        else:
            return Counter() if most <= 0 else []
    def most_common(self, N=-1):
        sort = [(value.count, key) for key,value in self.words.items()]
        if N == -1:
            N = len(sort)
        sort = sorted(sort, reverse=True)[:N]
        sort = [(key, value) for value, key in sort]
        return sort
    def most_common(self, N=-1):
        sort = [(value.count, key) for key,value in self.words.items()]
        if N == -1:
            N = len(sort)
        sort = sorted(sort, reverse=False)[:N]
        sort = [(key, value) for value, key in sort]
        return sort

In [302]:
def getGutenbergMeta(f):
    data = {'title': 'xxx', 'author': 'xxx'}
    with open(f) as infile:
        for line in infile.readlines():
            if line.startswith('Title: '):
                data['title'] = unicode(line[len('Title: '):].strip('\n'), "ascii", errors="ignore")
            if line.startswith('Author: '):
                data['author'] = unicode(line[len('Author: '):].strip('\n'), "ascii", errors="ignore")
    return data

In [308]:
models = []

for i, f in enumerate(glob.glob('data/*.txt')):
    data = getGutenbergMeta(f)
    print '['+str(i)+'] Currently processing ', data, '...'
    models.append(WordTrackModel(data['title'], f, data))

[0] Currently processing  {'author': u'Charles Dickens', 'title': u'A Christmas Carol'} ...
[1] Currently processing  {'author': u'Lewis Carroll', 'title': u'Alices Adventures in Wonderland'} ...
[2] Currently processing  {'author': u'Lucy Maud Montgomery', 'title': u'Anne of Green Gables'} ...
[3] Currently processing  {'author': u'Friedrich Nietzsche', 'title': u'Beyond Good and Evil'} ...
[4] Currently processing  {'author': u'Oscar Wilde', 'title': u'The Picture of Dorian Gray'} ...
[5] Currently processing  {'author': u'Bram Stoker', 'title': u'Dracula'} ...
[6] Currently processing  {'author': u'Robert Louis Stevenson', 'title': u'Dr. Jekyll and Mr. Hyde'} ...
[7] Currently processing  {'author': u'James Joyce', 'title': u'Dubliners'} ...
[8] Currently processing  {'author': u'Jane Austen', 'title': u'Emma'} ...
[9] Currently processing  {'author': u'The Brothers Grimm', 'title': u'Grimms Fairy Tales'} ...
[10] Currently processing  {'author': u'Mary Wollstonecraft (Godwin) Shell

In [309]:
for i, model in enumerate(models):
    print '['+str(i)+']Currently processing ', model.data, '...'
    model.distTrack(3)

[0]Currently processing  {'author': u'Charles Dickens', 'title': u'A Christmas Carol'} ...
[1]Currently processing  {'author': u'Lewis Carroll', 'title': u'Alices Adventures in Wonderland'} ...
[2]Currently processing  {'author': u'Lucy Maud Montgomery', 'title': u'Anne of Green Gables'} ...
[3]Currently processing  {'author': u'Friedrich Nietzsche', 'title': u'Beyond Good and Evil'} ...
[4]Currently processing  {'author': u'Oscar Wilde', 'title': u'The Picture of Dorian Gray'} ...
[5]Currently processing  {'author': u'Bram Stoker', 'title': u'Dracula'} ...
[6]Currently processing  {'author': u'Robert Louis Stevenson', 'title': u'Dr. Jekyll and Mr. Hyde'} ...
[7]Currently processing  {'author': u'James Joyce', 'title': u'Dubliners'} ...
[8]Currently processing  {'author': u'Jane Austen', 'title': u'Emma'} ...
[9]Currently processing  {'author': u'The Brothers Grimm', 'title': u'Grimms Fairy Tales'} ...
[10]Currently processing  {'author': u'Mary Wollstonecraft (Godwin) Shelley', 'title

In [310]:
common = Counter()
for model in models:
    common.update([m[0] for m in model.most_common(100)])

In [311]:
[c for c in common.most_common()]

[(u'accounts', 13),
 (u'accidental', 12),
 (u'abide', 11),
 (u'abject', 11),
 (u'acceptance', 11),
 (u'accused', 11),
 (u'accomplish', 10),
 (u'accustomed', 10),
 (u'abode', 10),
 (u'accent', 10),
 (u'abstraction', 9),
 (u'abundant', 9),
 (u'accidentally', 9),
 (u'absorbing', 9),
 (u'accord', 9),
 (u'abandoned', 9),
 (u'abrupt', 8),
 (u'abused', 8),
 (u'actually', 8),
 (u'abandoning', 8),
 (u'abuse', 8),
 (u'accidents', 8),
 (u'abounding', 7),
 (u'active', 7),
 (u'aching', 7),
 (u'absent', 7),
 (u'according', 7),
 (u'accusation', 7),
 (u'accents', 7),
 (u'accosted', 7),
 (u'accompanying', 7),
 (u'abstracted', 7),
 (u'accept', 7),
 (u'abruptly', 7),
 (u'abroad', 7),
 (u'access', 7),
 (u'accompany', 7),
 (u'absurd', 7),
 (u'ability', 7),
 (u'abundance', 6),
 (u'ached', 6),
 (u'accident', 6),
 (u'advanced', 6),
 (u'advantage', 6),
 (u'aback', 6),
 (u'acknowledge', 6),
 (u'abhorred', 6),
 (u'accepted', 6),
 (u'abyss', 6),
 (u'abatement', 6),
 (u'abandon', 6),
 (u'acted', 6),
 (u'acquaintan