In [3]:
import csv
import re
import glob
import math
import pickle
import json
from nltk.corpus import stopwords
import nltk
import zlib
from pprint import pprint
from collections import Counter

In [173]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [145]:
class TrackedWord:
    def __init__(self, word):
        self.word = word
        self.nearby = {}
        self.count = 1
        self.weight = -1
    def __iadd__(self, other):
        self.count += other
        return self
    def addNear(self, N, word1):
        if N not in self.nearby:
            self.nearby[N] = Counter()
        if word1 != '':
            self.nearby[N][word1] += 1
    def near(self, N):
        '''Returns the words which are N away'''
        if N in self.nearby:
            return self.nearby[N]
        else:
            return Counter()
    def sumNear(self, N):
        '''Returns the sum of all words within the range 1-N'''
        if N in self.nearby:
            relevant = []
            for i in range(1, N+1):
                relevant.append(self.nearby[i])
            return reduce((lambda x, y: x + y), relevant)
    def getWeight(self):
        '''Returns tf-idf of the word'''
        if self.weight < 0:
            df = float(WordTrackModel.df[self.word])
            n = float(WordTrackModel.totalworks)
            self.weight = float(self.count) * math.log(n/df)
        return self.weight

In [158]:
class WordTrackModel:
    
    stops = set(stopwords.words("english")[:])
    english = set(json.load(open('data/english.json', 'r')))
    totalworks = 0
    df = Counter()
    
    def __init__(self, name, f, data={}):
        self.name = name
        self.f = f
        self.text = self.loadfile(f)
        self.words = {}
        self.tracked = set([0])
        self.data = data
        WordTrackModel.totalworks += 1
    
    def loadfile(self, f):
        lines = []
        text = []
        with open(f, mode='r') as infile:
            lines = infile.readlines()
        started = False
        ended = False
        for line in lines:
            if not started:
                if '*** START' in line or '***START' in line:
                    started = True
                continue
            if '*** END' in line or '***END' in line:
                break
            line = line.strip('\n')
            line = unicode(line, "ascii", errors="ignore")
            line =  re.sub("[^a-zA-Z]", " ", line)
            line = line.lower()
            text.extend(line.split())
        text = " ".join([w for w in text if w not in self.stops and w in self.english])
        self.df.update(set(text.split()))
        return zlib.compress(text)
    
    def wordlist(self):
        return zlib.decompress(self.text).split()
    
    def distTrack(self, N):
        words = self.wordlist()
        if N in self.tracked:
            return self
        for i, word in enumerate(words):
            if word not in self.words:
                self.words[word] = TrackedWord(word)
            for j in range(N, 0, -1):
                if j in self.tracked:
                    break
                wNback = words[i-j] if i-j >= 0 else ''
                wNfor = words[i+j] if i+j < len(words) else ''
                self.words[word].addNear(j, wNback)
                self.words[word].addNear(j, wNfor)
                self.words[word] += 1
        for i in range(N, 0, -1):
            if i not in self.tracked:
                self.tracked.add(i)
            else:
                break
        return self
                
    def near(self, word, N, most=0):
        if N not in self.tracked:
            self.distTrack(N)
        if word in self.words:
            return self.words[word].near(N) if most <= 0 else self.words[word].near(N).most_common(most)
        else:
            return Counter() if most <= 0 else []
        
    def orderby(self, words, method, reverse=True):
        if method == 'frequency':
            return words.most_common()
        if method == 'importance':
            sortedwords = sorted([ (self.words[w].getWeight(), w) for w in words], reverse=reverse)
            return [(w[1], w[0]) for w in sortedwords]
        
    def sumNear(self, word, N, most=0, orderby='frequency'):
        if N not in self.tracked:
            self.distTrack(N)
        if word in self.words:
            nearby = self.words[word].sumNear(N)
            nearby = self.orderby(nearby, orderby)
            return nearby if most == 0 else nearby[:most]
        else:
            return []
    def most_common(self, N=-1):
        if N > 0: return Counter(self.wordlist()).most_common(N)
        return Counter(self.wordlist()).most_common()
    def most_important(self, maxnum=-1):
        words = set()
        for word in self.words:
            words.add((self.words[word].getWeight(), word))
        if maxnum > 0: return sorted(words, reverse=True)[:maxnum]
        return sorted(words, reverse=True)

In [159]:
def getGutenbergMeta(f):
    data = {'title': 'xxx', 'author': 'xxx'}
    with open(f) as infile:
        for line in infile.readlines():
            if line.startswith('Title: '):
                data['title'] = unicode(line[len('Title: '):].strip('\n'), "ascii", errors="ignore")
            if line.startswith('Author: '):
                data['author'] = unicode(line[len('Author: '):].strip('\n'), "ascii", errors="ignore")
    return data

In [160]:
models = []
WordTrackModel.df = Counter() # This is to reset it for notebook's sake
WordTrackModel.totalworks = 0 # This is to reset it for notebook's sake
for i, f in enumerate(glob.glob('data/*.txt')):
    data = getGutenbergMeta(f)
    print '['+str(i)+'] Currently processing ', data, '...'
    models.append(WordTrackModel(data['title'], f, data))

[0] Currently processing  {'author': u'James Joyce', 'title': u'Ulysses'} ...
[1] Currently processing  {'author': u'Oscar Wilde', 'title': u'The Importance of Being Earnest'} ...
[2] Currently processing  {'author': u'James M. Barrie', 'title': u'Peter Pan'} ...
[3] Currently processing  {'author': u'Rudyard Kipling', 'title': u'The Jungle Book'} ...
[4] Currently processing  {'author': u'Arthur Conan Doyle', 'title': u'The Return of Sherlock Holmes'} ...
[5] Currently processing  {'author': u'H. G. (Herbert George) Wells', 'title': u'The Time Machine'} ...
[6] Currently processing  {'author': u'Lucy Maud Montgomery', 'title': u'Anne of Green Gables'} ...
[7] Currently processing  {'author': u'L. Frank Baum', 'title': u'The Wonderful Wizard of Oz'} ...
[8] Currently processing  {'author': u'Mark Twain (Samuel Clemens)', 'title': u'The Adventures of Tom Sawyer, Complete'} ...
[9] Currently processing  {'author': u'Charles Dickens', 'title': u'A Christmas Carol'} ...
[10] Currently proc

In [161]:
for i, model in enumerate(models):
    print '['+str(i)+'] Currently processing ', model.data, '...'
    model.distTrack(3)

[0] Currently processing  {'author': u'James Joyce', 'title': u'Ulysses'} ...
[1] Currently processing  {'author': u'Oscar Wilde', 'title': u'The Importance of Being Earnest'} ...
[2] Currently processing  {'author': u'James M. Barrie', 'title': u'Peter Pan'} ...
[3] Currently processing  {'author': u'Rudyard Kipling', 'title': u'The Jungle Book'} ...
[4] Currently processing  {'author': u'Arthur Conan Doyle', 'title': u'The Return of Sherlock Holmes'} ...
[5] Currently processing  {'author': u'H. G. (Herbert George) Wells', 'title': u'The Time Machine'} ...
[6] Currently processing  {'author': u'Lucy Maud Montgomery', 'title': u'Anne of Green Gables'} ...
[7] Currently processing  {'author': u'L. Frank Baum', 'title': u'The Wonderful Wizard of Oz'} ...
[8] Currently processing  {'author': u'Mark Twain (Samuel Clemens)', 'title': u'The Adventures of Tom Sawyer, Complete'} ...
[9] Currently processing  {'author': u'Charles Dickens', 'title': u'A Christmas Carol'} ...
[10] Currently proc

In [215]:
def similar_clusters(search, near, models, orderby, limit, exclusive=False):
    clusters = {}
    similarclusters = []
    for model in models:
        clusters[model.name] = set([w[0] for w in model.sumNear(search, near, limit, orderby=orderby)])
        similarclusters = []
        for cluster1 in clusters:
            for cluster2 in clusters:
                if cluster1 != cluster2:
                    if len(clusters[cluster1] & clusters[cluster2]) > 2:
                        newclust = ((cluster1, clusters[cluster1]), (cluster2, clusters[cluster2]))
                        if (newclust[1], newclust[0]) not in similarclusters:
                            similarclusters.append(newclust)
    if not exclusive:
        return similarclusters
    else:
        return [((cluster[0][0], cluster[1][0]), cluster[0][1] & cluster[1][1] )for cluster in similarclusters]

In [219]:
similar = similar_clusters('family', 2, models, 'importance', 20)
for cluster in similar:
    pprint([w[0] for w in cluster])
    print ''

[u'Persuasion', u'Mansfield Park']

[u'Persuasion', u'Emma']

[u'Persuasion', u'Sense and Sensibility']

[u'Persuasion', u'Pride and Prejudice']

[u'Mansfield Park', u'Emma']

[u'Mansfield Park', u'Sense and Sensibility']

[u'Mansfield Park', u'Pride and Prejudice']

[u'Emma', u'Sense and Sensibility']

[u'Emma', u'Pride and Prejudice']

[u'Sense and Sensibility', u'Pride and Prejudice']



In [221]:
similar = similar_clusters('family', 2, models, 'importance', 20, True)
for cluster in similar:
    pprint(cluster)
    print ''

((u'Persuasion', u'Mansfield Park'),
 set(['connexion', 'lady', 'miss', 'party', 'sister']))

((u'Persuasion', u'Emma'),
 set(['colonel', 'connexion', 'lady', 'miss', 'party']))

((u'Persuasion', u'Sense and Sensibility'),
 set(['colonel', 'exceedingly', 'lady', 'miss', 'sister']))

((u'Persuasion', u'Pride and Prejudice'),
 set(['cousin', 'exceedingly', 'lady', 'marry', 'miss', 'sister']))

((u'Mansfield Park', u'Emma'),
 set(['affection', 'connexion', 'lady', 'miss', 'party']))

((u'Mansfield Park', u'Sense and Sensibility'),
 set(['affection', 'lady', 'miss', 'park', 'sister']))

((u'Mansfield Park', u'Pride and Prejudice'),
 set(['affection', 'inclination', 'lady', 'letter', 'miss', 'park', 'sister']))

((u'Emma', u'Sense and Sensibility'),
 set(['affection', 'colonel', 'lady', 'miss']))

((u'Emma', u'Pride and Prejudice'),
 set(['affection', 'lady', 'miss', 'regard']))

((u'Sense and Sensibility', u'Pride and Prejudice'),
 set(['affection', 'exceedingly', 'lady', 'miss', 'park', '

# Great Result

My clustering algorithm is actually able to detect the *same author's works* when ordering by importance. Fantastic.