In [1]:
import csv
import re
import glob
import math
import json
import zlib
from pprint import pprint
from collections import Counter

In [2]:
class TrackedWord:
    def __init__(self, word):
        self.word = word
        self.nearby = {}
        self.count = 1
        self.weight = -1
    def __iadd__(self, other):
        self.count += other
        return self
    def addNear(self, N, word1):
        if N not in self.nearby:
            self.nearby[N] = Counter()
        if word1 != '':
            self.nearby[N][word1] += 1
    def near(self, N):
        '''Returns the words which are N away'''
        if N in self.nearby:
            return self.nearby[N]
        else:
            return Counter()
    def sumNear(self, N):
        '''Returns the sum of all words within the range 1-N'''
        if N in self.nearby:
            relevant = []
            for i in range(1, N+1):
                relevant.append(self.nearby[i])
            return reduce((lambda x, y: x + y), relevant)
    def getWeight(self):
        '''Returns tf-idf of the word'''
        if self.weight < 0:
            df = float(WordTrackModel.df[self.word])
            n = float(WordTrackModel.totalworks)
            self.weight = float(self.count) * math.log(n/df)
        return self.weight

In [3]:
class WordTrackModel:
    
    stops = set(json.load(open('data/nltkstopwords.json', 'r')))
    english = set(json.load(open('data/english.json', 'r')))
    totalworks = 0
    df = Counter()
    
    def __init__(self, name, f, data={}):
        self.name = name
        self.f = f
        self.text = self.loadfile(f)
        self.words = {}
        self.tracked = set([0])
        self.data = data
        WordTrackModel.totalworks += 1
    
    def loadfile(self, f):
        lines = []
        text = []
        with open(f, mode='r') as infile:
            lines = infile.readlines()
        started = False
        ended = False
        for line in lines:
            if not started:
                if '*** START' in line or '***START' in line:
                    started = True
                continue
            if '*** END' in line or '***END' in line:
                break
            line = line.strip('\n')
            line = unicode(line, "ascii", errors="ignore")
            line =  re.sub("[^a-zA-Z]", " ", line)
            line = line.lower()
            text.extend(line.split())
        text = " ".join([w for w in text if w not in self.stops and w in self.english])
        self.df.update(set(text.split()))
        return zlib.compress(text)
    
    def wordlist(self):
        return zlib.decompress(self.text).split()
    
    def distTrack(self, N):
        words = self.wordlist()
        if N in self.tracked:
            return self
        for i, word in enumerate(words):
            if word not in self.words:
                self.words[word] = TrackedWord(word)
            for j in range(N, 0, -1):
                if j in self.tracked:
                    break
                wNback = words[i-j] if i-j >= 0 else ''
                wNfor = words[i+j] if i+j < len(words) else ''
                self.words[word].addNear(j, wNback)
                self.words[word].addNear(j, wNfor)
                self.words[word] += 1
        for i in range(N, 0, -1):
            if i not in self.tracked:
                self.tracked.add(i)
            else:
                break
        return self
                
    def near(self, word, N, most=0):
        if N not in self.tracked:
            self.distTrack(N)
        if word in self.words:
            return self.words[word].near(N) if most <= 0 else self.words[word].near(N).most_common(most)
        else:
            return Counter() if most <= 0 else []
        
    def orderby(self, words, method, reverse=True):
        if method == 'frequency':
            return words.most_common()
        if method == 'importance':
            sortedwords = sorted([ (self.words[w].getWeight(), w) for w in words], reverse=reverse)
            return [(w[1], w[0]) for w in sortedwords]
        
    def sumNear(self, word, N, most=0, orderby='frequency'):
        if N not in self.tracked:
            self.distTrack(N)
        if word in self.words:
            nearby = self.words[word].sumNear(N)
            nearby = self.orderby(nearby, orderby)
            return nearby if most == 0 else nearby[:most]
        else:
            return []
    def most_common(self, N=-1):
        if N > 0: return Counter(self.wordlist()).most_common(N)
        return Counter(self.wordlist()).most_common()
    def most_important(self, maxnum=-1):
        words = set()
        for word in self.words:
            words.add((self.words[word].getWeight(), word))
        if maxnum > 0: return sorted(words, reverse=True)[:maxnum]
        return sorted(words, reverse=True)

In [60]:
class KnowledgeBase:
    def __init__(self, debugging=False):
        self.debugging = debugging
        self.models = self.loadModels()
        
    def loadModels(self):
        models = {}
        WordTrackModel.df = Counter() # This is to reset it for notebook's sake
        WordTrackModel.totalworks = 0 # This is to reset it for notebook's sake
        for i, f in enumerate(glob.glob('data/*.txt')):
            data = self.getGutenbergMeta(f)
            if self.debugging:
                print '['+str(i)+'] Currently processing ', data, '...'
            models[data['title']] = WordTrackModel(data['title'], f, data)
            models[data['title']].distTrack(3)
        return models
    def getGutenbergMeta(self, f):
        data = {'title': 'xxx', 'author': 'xxx'}
        with open(f) as infile:
            for line in infile.readlines():
                if line.startswith('Title: '):
                    data['title'] = unicode(line[len('Title: '):].strip('\n'), "ascii", errors="ignore")
                if line.startswith('Author: '):
                    data['author'] = unicode(line[len('Author: '):].strip('\n'), "ascii", errors="ignore")
        return data
    def similar_clusters(self, search, dist_away=2, orderby='importance', limit=20, exclusive=True):
        models = self.models.values()
        near = dist_away
        clusters = {}
        similarclusters = []
        for model in models:
            clusters[model.name] = set([w[0] for w in model.sumNear(search, near, limit, orderby=orderby)])
            similarclusters = []
            for cluster1 in clusters:
                for cluster2 in clusters:
                    if cluster1 != cluster2:
                        if len(clusters[cluster1] & clusters[cluster2]) > 2:
                            newclust = ((cluster1, clusters[cluster1]), (cluster2, clusters[cluster2]))
                            if (newclust[1], newclust[0]) not in similarclusters:
                                similarclusters.append(newclust)
        if not exclusive:
            return similarclusters
        else:
            return [((cluster[0][0], cluster[1][0]), cluster[0][1] & cluster[1][1] )for cluster in similarclusters]
    
    def most_similar_to(self, search, limit=10):
        models = self.models
        if isinstance(search, str): 
            search = [search]
        similars = {}
        for model in models:
            similars[model] = Counter()
            numwords = float(len(models[model].wordlist()))
            for word in search:
                if word in models[model].words:
                    similars[model].update({word: math.log(1+models[model].words[word].count/numwords)})
        sortedsums = sorted([(sum(similars[w].values()), w, similars[w].items()) for w in similars], reverse=True)[:limit]
        return sortedsums

In [61]:
kb = KnowledgeBase()

In [66]:
search = ['time', 'hate']
kb.most_similar_to(search)

[(0.05837405822584296, u'The Time Machine', [('time', 0.05837405822584296)]),
 (0.028962589359808447,
  u'Adventures of Huckleberry Finn, Complete',
  [('time', 0.028962589359808447)]),
 (0.02703688379890662, u'Metamorphosis', [('time', 0.02703688379890662)]),
 (0.02676508270507471,
  u'Peter Pan',
  [('hate', 0.00028318584260041954), ('time', 0.02648189686247429)]),
 (0.026332848231276483,
  u'The Adventures of Tom Sawyer, Complete',
  [('hate', 0.0011118277367785808), ('time', 0.0252210204944979)]),
 (0.026168570882515667,
  u'Alices Adventures in Wonderland',
  [('hate', 0.000838775437109871), ('time', 0.025329795445405794)]),
 (0.025293901167972345,
  u'Dracula',
  [('hate', 0.0007220216920166442), ('time', 0.0245718794759557)]),
 (0.023817057069381187,
  u'The Life and Adventures of Robinson Crusoe',
  [('hate', 0.00011489300601462612), ('time', 0.02370216406336656)]),
 (0.0231792841243361, u'A Christmas Carol', [('time', 0.0231792841243361)]),
 (0.023068266607834537,
  u'Through 

In [69]:
kb.similar_clusters('warm', exclusive=False)

[((u'A Christmas Carol',
   {'bright',
    'city',
    'comforter',
    'dear',
    'fire',
    'heart',
    'indoors',
    'lord',
    'pavement',
    'pedestrian',
    'pulse',
    'snug',
    'thermometer',
    'tried',
    'upon',
    'warehouse',
    'warmth',
    'weather',
    'wintry',
    'ye'}),
  (u'The Works of Edgar Allan Poe',
   {'air',
    'body',
    'carefully',
    'city',
    'damp',
    'done',
    'especially',
    'examining',
    'grow',
    'quite',
    'said',
    'something',
    'spoke',
    'stirring',
    'subject',
    'upon',
    'water',
    'weather',
    'well'})),
 ((u'A Christmas Carol',
   {'bright',
    'city',
    'comforter',
    'dear',
    'fire',
    'heart',
    'indoors',
    'lord',
    'pavement',
    'pedestrian',
    'pulse',
    'snug',
    'thermometer',
    'tried',
    'upon',
    'warehouse',
    'warmth',
    'weather',
    'wintry',
    'ye'}),
  (u'Frankenstein',
   {'affection',
    'although',
    'chill',
    'cottage',
    '

## Continuing On

The results I'm getting now are definitely interesting and hopefully useable. 