In [2]:
import csv
import re
import glob
import math
import json
import zlib
from pprint import pprint
from collections import Counter

In [213]:
class TrackedWord:
    """One word of a document together with the words observed around it.

    ``nearby`` maps a distance N to a Counter of the words seen exactly N
    positions away; ``count`` is the running tally maintained through ``+=``;
    ``weight`` caches the value computed by getWeight().
    """

    def __init__(self, word, parent):
        self.word = word
        self.nearby = {}
        self.count = 1
        self.weight = None
        self.parent = parent

    def __iadd__(self, other):
        """Support ``tracked += n`` by adding n to the running count."""
        self.count = self.count + other
        return self

    def addNearbyWord(self, word, N):
        """Record that word was seen N positions away from this word.

        The bucket for distance N is always created; an empty string (used by
        callers for "no word at that position") is not counted.
        """
        bucket = self.nearby.setdefault(N, Counter())
        if word == '':
            return
        bucket[word] += 1

    def getNearbyWords(self, N):
        """Return the words which appeared exactly N positions away.

        Args:
            N: Integer distance to look up.

        Returns:
            A new dictionary mapping word -> occurrences at distance N, or an
            empty dictionary when nothing was recorded for that distance.
        """
        return dict(self.nearby.get(N, {}))

    def getNearbyWordsInRange(self, *args):
        """Return all words which appear within a range of distances.

        Args:
            args: Zero, one, or two integers. No arguments yields an empty
                  dictionary; one argument is an inclusive upper bound (the
                  lower bound is 1); two arguments are the inclusive lower
                  and upper bounds. Any other number of arguments falls back
                  to the range 1..1.

        Returns:
            A dictionary of every word found and its total number of
            occurrences across the requested distances.
        """
        argc = len(args)
        if argc < 1:
            return dict()
        if argc == 2:
            lower, upper = args
        elif argc == 1:
            lower, upper = 1, args[0]
        else:
            lower, upper = 1, 1
        per_distance = (Counter(self.getNearbyWords(distance))
                        for distance in range(lower, upper + 1))
        return dict(sum(per_distance, Counter()))

    def getWeight(self):
        """Return the TF-IDF value for this word.

        Computed as count * log(N / df), where N is the number of documents
        and df the number of documents containing the word, both supplied by
        ``parent``. The value is computed once and then cached in ``weight``.
        """
        if self.weight is not None:
            return self.weight
        containing = float(self.parent.getNumberOfDocumentsContaining(self.word))
        total = float(self.parent.getNumberOfDocuments())
        self.weight = float(self.count) * math.log(total / containing)
        return self.weight

    def getCount(self):
        """Return the running count for this word."""
        return self.count

In [299]:
class TrackedWordModel:
    """Word-proximity model for a single document.

    The document text is kept zlib-compressed. Per-word statistics
    (TrackedWord objects in ``words``) are built lazily by setupTracking()
    the first time a query needs them.
    """

    def __init__(self, text, parent, meta=None):
        """Create a model for one document.

        Args:
            text: The document text; passed straight to zlib.compress().
            parent: Object answering getNumberOfDocuments() and
                getNumberOfDocumentsContaining(word).
            meta: Optional dictionary of metadata about the document. A new
                empty dictionary is used when omitted.
        """
        self.text = zlib.compress(text)
        self.words = {}
        # Distance 0 is pre-marked as tracked so the downward scans in
        # setupTracking() always terminate there.
        self.trackedDistances = set([0])
        # One dict per instance: a ``meta={}`` default would be shared by
        # every model created without metadata.
        self.meta = {} if meta is None else meta
        self.parent = parent

    def getText(self):
        """Return the decompressed document text."""
        return zlib.decompress(self.text)

    def setupTracking(self, N):
        """Update the word tracking model to track up to N words away if not already tracked.

        Each word's count is the number of times it occurs in the text; it is
        established on the first tracking pass and is not changed by later
        passes that only add further distances.

        Args:
            N: Inclusive maximum distance for words to track.

        Returns:
            Self, for chaining.
        """
        if N in self.trackedDistances:
            return self
        # Only decompress once we know there is work to do.
        words = self.getText().split()
        total = len(words)

        # Distances still to record: N downwards until one already tracked.
        pending = []
        for distance in range(N, 0, -1):
            if distance in self.trackedDistances:
                break
            pending.append(distance)

        # Occurrences are counted only on the very first pass; a new
        # TrackedWord already starts with a count of 1 for its first sighting.
        countingPass = len(self.words) == 0
        for i, word in enumerate(words):
            if word not in self.words:
                self.words[word] = TrackedWord(word, self)
            elif countingPass:
                self.words[word] += 1
            tracked = self.words[word]
            for distance in pending:
                # '' stands for "no word at that position" (past either end).
                wNback = words[i - distance] if i - distance >= 0 else ''
                wNfor = words[i + distance] if i + distance < total else ''
                tracked.addNearbyWord(wNback, distance)
                tracked.addNearbyWord(wNfor, distance)

        self.trackedDistances.update(pending)
        return self

    def getWordsNear(self, word, N):
        """Return a dictionary of words which are exactly N words away from word.

        Args:
            word: The string being searched for.
            N: The distance away from word being searched.

        Returns:
            Dictionary containing all nearby words, or an empty dictionary if
            word is not found.
        """
        if N not in self.trackedDistances:
            self.setupTracking(N)
        if word in self.words:
            return dict(self.words[word].getNearbyWords(N))
        return dict()

    def getNearbyWordsInRange(self, word, *args):
        """Return all words which appear within a range of distances from word.

        Args:
            word: The word being searched for.
            args: Zero, one, or two integers. No arguments yields an empty
                  dictionary; one argument is an inclusive upper bound (the
                  lower bound is 1); two arguments are the inclusive lower
                  and upper bounds. Any other number of arguments falls back
                  to the range 1..1.

        Returns:
            A dictionary containing all found words and their number of
            occurrences, or an empty dictionary if word is not found.
        """
        lower = 1
        upper = 1
        if len(args) < 1:
            return dict()
        if len(args) == 1:
            upper = args[0]
        if len(args) == 2:
            lower = args[0]
            upper = args[1]

        if upper not in self.trackedDistances:
            self.setupTracking(upper)
        if word in self.words:
            return self.words[word].getNearbyWordsInRange(lower, upper)
        return dict()

    def getMostFrequent(self):
        """Return (count, word) pairs sorted from most to least frequent."""
        if len(self.words) == 0:
            self.setupTracking(1)
        pairs = [(tracked.getCount(), word) for word, tracked in self.words.items()]
        return sorted(pairs, reverse=True)

    def getMostImportant(self):
        """Return (weight, word) pairs sorted from highest to lowest weight."""
        if len(self.words) == 0:
            self.setupTracking(1)
        pairs = [(tracked.getWeight(), word) for word, tracked in self.words.items()]
        return sorted(pairs, reverse=True)

    def getNumberOfDocuments(self):
        """Return the parent's total number of documents."""
        return self.parent.getNumberOfDocuments()

    def getNumberOfDocumentsContaining(self, word):
        """Return the parent's number of documents containing word."""
        return self.parent.getNumberOfDocumentsContaining(word)

    def getWord(self, word):
        """Return the TrackedWord for word, or False when it is not present."""
        if len(self.words) == 0:
            self.setupTracking(1)
        if word in self.words:
            return self.words[word]
        return False

    def getWords(self):
        """Return the dictionary of word -> TrackedWord, building it if needed."""
        if len(self.words) == 0:
            self.setupTracking(1)
        return self.words

In [300]:
class WordModelCollection:
    """A set of TrackedWordModel objects keyed by lower-cased title.

    Also maintains ``documentFrequencyCounter``: for each word, the number of
    stored documents that contain it.
    """

    def __init__(self):
        self.models = {}
        self.documentFrequencyCounter = Counter()

    def updateModel(self, text, meta):
        '''Add or update a model in the collection.

        If the title isn't in the collection, add it and update the DF count.
        If the title is in the collection but the text is identical, do nothing.
        If the title is in the collection but the text isn't identical, remove
        the old text's set of words from the counter and add the set of words
        of the new text.

        Args:
            text: The document text.
            meta: Dictionary of metadata; must contain a 'title' entry.

        Returns:
            True when a model was added or replaced, False when the stored
            text was already identical.
        '''
        title = meta['title'].lower()
        previous = self.models.get(title)
        if previous is not None:
            oldText = previous.getText()
            if oldText == text:
                return False
            # Counter arithmetic needs a Counter on both sides; subtracting a
            # bare set raises. In-place subtraction also drops words whose
            # document frequency falls to zero.
            self.documentFrequencyCounter -= Counter(set(oldText.split()))
        self.models[title] = TrackedWordModel(text, self, meta)
        self.documentFrequencyCounter.update(set(text.split()))
        return True

    def getModel(self, title):
        """Return the model stored under title (case-insensitive)."""
        return self.models[title.lower()]

    def getModels(self):
        """Return the dictionary of lower-cased title -> model."""
        return self.models

    def getNumberOfDocuments(self):
        """Return how many models are stored."""
        return len(self.models)

    def getNumberOfDocumentsContaining(self, word):
        """Return how many stored documents contain word (0 if none)."""
        return self.documentFrequencyCounter[word]

    def updateDocumentFrequency(self, text=''):
        """Rebuild the document-frequency counter from every stored model.

        Args:
            text: Accepted for backward compatibility and ignored; the rebuild
                already covers the text of every stored model.
        """
        rebuilt = Counter()
        for model in self.models.values():
            rebuilt.update(set(model.getText().split()))
        self.documentFrequencyCounter = rebuilt

In [301]:
# Scratch check of Counter arithmetic: adding c1 and then subtracting it
# again leaves c2's counts, to which c3's counts are then added.
c1, c2, c3 = (
    Counter(sentence.split())
    for sentence in (
        "Bill is going to the playground",
        "Amy is also going to the playground",
        "Ryan doesn't like the playground",
    )
)
(c1 + c2) - c1 + c3

Counter({'Amy': 1,
         'Ryan': 1,
         'also': 1,
         "doesn't": 1,
         'going': 1,
         'is': 1,
         'like': 1,
         'playground': 2,
         'the': 2,
         'to': 1})

In [305]:
def loadFile(f):
    """Return the cleaned body text of a Project Gutenberg file.

    Only the lines between the first line containing a START marker
    ('*** START' or '***START') and the first following line containing an
    END marker ('*** END' or '***END') are used; the marker lines themselves
    are skipped. Each line is reduced to ASCII, every non-letter character
    becomes a space, and the result is lower-cased. Words rejected by
    suitableWord() are dropped.

    Args:
        f: Path of the file to read.

    Returns:
        The surviving words joined by single spaces. Empty when no START
        marker is found.
    """
    kept = []
    started = False
    # Stream the file so reading stops as soon as the END marker is reached.
    with open(f, mode='r') as infile:
        for line in infile:
            if not started:
                if '*** START' in line or '***START' in line:
                    started = True
                continue
            if '*** END' in line or '***END' in line:
                break
            line = unicode(line.strip('\n'), "ascii", errors="ignore")
            line = re.sub("[^a-zA-Z]", " ", line).lower()
            kept.extend(w for w in line.split() if suitableWord(w))
    return " ".join(kept)

def suitableWord(word, min_arousal=3):
    """Decide whether a word should be kept in a document's cleaned text.

    A word is rejected when it is shorter than two characters, is in the
    module-level ``stops`` set, is missing from the module-level ``english``
    set, or is listed in the module-level ``warriner`` table with an arousal
    rating below ``min_arousal``. Words absent from ``warriner`` are never
    rejected by the arousal test.

    Args:
        word: The candidate word.
        min_arousal: Arousal cut-off applied to words found in ``warriner``.
            Defaults to 3, the previously hard-coded threshold.

    Returns:
        True if the word should be kept, False otherwise.
    """
    if len(word) < 2:
        return False
    if word in stops or word not in english:
        return False
    if word in warriner and warriner[word]['arousal'] < min_arousal:
        return False
    return True

def getGutenbergMeta(f):
    """Extract the title and author from a Project Gutenberg file.

    Every line starting with 'Title: ' or 'Author: ' sets the matching entry;
    when a prefix occurs more than once the last occurrence wins. Values are
    stripped of newline characters and reduced to ASCII.

    Args:
        f: Path of the file to read.

    Returns:
        A dictionary with a 'title' and/or 'author' entry, holding only the
        entries whose prefix was actually found.
    """
    fields = (('Title: ', 'title'), ('Author: ', 'author'))
    data = {}
    with open(f) as infile:
        for line in infile:
            for prefix, key in fields:
                if line.startswith(prefix):
                    data[key] = unicode(line[len(prefix):].strip('\n'), "ascii", errors="ignore")
    return data

def loadModels(maxmodels=-1, debugging=False, warriner=False):
    models = []
    files = glob.glob('data/*.txt')
    maxmodels = maxmodels if maxmodels >0 else len(files)
    for i, f in enumerate(files[:maxmodels]):
        meta = getGutenbergMeta(f)
        if debugging:
            print '['+str(i)+'] Currently processing ', meta['title'], '...'
        models.append({'text': loadFile(f), 'meta': meta})
    return models

# Lookup tables consulted by suitableWord(). Each file is opened in a
# ``with`` block so its handle is closed as soon as the data is loaded.
with open('data/nltkstopwords.json', 'r') as infile:
    stops = set(json.load(infile))
with open('data/english.json', 'r') as infile:
    english = set(json.load(infile))
with open('models/warriner.csv', mode='r') as infile:
    reader = csv.reader(infile)
    next(reader)  # skip the header row
    # Keyed by column 1; columns 2, 5 and 8 are read as the valence, arousal
    # and dominance values respectively.
    warriner = {
        rows[1]: {
            'valence': float(rows[2]),
            'arousal': float(rows[5]),
            'dominance': float(rows[8]),
        }
        for rows in reader
    }
    

In [306]:
# Seed a fresh collection with Dracula, then add every model returned by
# loadModels(maxmodels=5). updateModel() returns False for a title whose
# stored text is already identical.
collect = WordModelCollection()
meta = getGutenbergMeta('data/dracula.txt')
text = loadFile('data/dracula.txt')
collect.updateModel(text, meta)
models = loadModels(maxmodels=5)
for model in models:
    collect.updateModel(model['text'], model['meta'])

False

In [48]:
class KnowledgeBase:
    """A set of per-document word models plus cross-document cluster queries.

    NOTE(review): relies on WordTrackModel, which is not defined in the
    visible part of this notebook. Its attributes and methods (name, words,
    distTrack, sumNear, wordlist) are used here exactly as before.
    """

    def __init__(self, debugging=False, maxmodels=-1, warriner=False):
        """Load the models immediately.

        Args:
            debugging: When true, print a progress line per file loaded.
            maxmodels: Maximum number of files to load; values that are not
                greater than zero mean "all files found".
            warriner: Passed through to each WordTrackModel.
        """
        self.debugging = debugging
        self.models = self.loadModels(maxmodels=maxmodels, warriner=warriner)

    def loadModels(self, maxmodels=-1, warriner=False):
        """Build a WordTrackModel for each file matching data/*.txt.

        Resets the class-level WordTrackModel.df and WordTrackModel.totalworks
        before loading.

        Returns:
            Dictionary mapping title -> WordTrackModel.
        """
        models = {}
        WordTrackModel.df = Counter()
        WordTrackModel.totalworks = 0
        files = glob.glob('data/*.txt')
        maxmodels = maxmodels if maxmodels > 0 else len(files)
        for i, f in enumerate(files[:maxmodels]):
            data = self.getGutenbergMeta(f)
            if self.debugging:
                # A single parenthesised string prints the same text as the
                # former comma-separated print statement.
                print(' '.join(['['+str(i)+'] Currently processing ', str(data), '...']))
            models[data['title']] = WordTrackModel(data['title'], f, data, warriner)
            models[data['title']].distTrack(3)
        return models

    def getGutenbergMeta(self, f):
        """Extract title and author from a Project Gutenberg file.

        Returns:
            Dictionary with 'title' and 'author'; each defaults to 'xxx' when
            the corresponding 'Title: ' / 'Author: ' line is absent.
        """
        data = {'title': 'xxx', 'author': 'xxx'}
        with open(f) as infile:
            for line in infile:
                if line.startswith('Title: '):
                    data['title'] = unicode(line[len('Title: '):].strip('\n'), "ascii", errors="ignore")
                if line.startswith('Author: '):
                    data['author'] = unicode(line[len('Author: '):].strip('\n'), "ascii", errors="ignore")
        return data

    def search_clusters(self, search, models=None, dist_away=2, orderby='importance', limit=20, exclusive=True):
        """Find pairs of models whose word clusters around ``search`` overlap.

        For each model the cluster is the set of first elements of
        model.sumNear(search, dist_away, limit, orderby=orderby). Two models
        are paired when their clusters share more than two words.

        Args:
            search: Value passed to each model's sumNear().
            models: Models to compare; defaults to every model in this
                knowledge base.
            dist_away, orderby, limit: Passed through to sumNear().
            exclusive: When true, return only the shared words of each pair.

        Returns:
            exclusive=True: list of ((name1, name2), shared_words).
            exclusive=False: list of ((name1, cluster1), (name2, cluster2)).
        """
        if not models:
            models = list(self.models.values())
        clusters = {}
        for model in models:
            clusters[model.name] = set(w[0] for w in model.sumNear(search, dist_away, limit, orderby=orderby))

        # Compare every pair once, after all clusters are known. Only the
        # comparison over the complete cluster set ever reached the caller.
        similarclusters = []
        for cluster1 in clusters:
            for cluster2 in clusters:
                if cluster1 == cluster2:
                    continue
                if len(clusters[cluster1] & clusters[cluster2]) > 2:
                    newclust = ((cluster1, clusters[cluster1]), (cluster2, clusters[cluster2]))
                    # Skip the pair if it is already recorded the other way round.
                    if (newclust[1], newclust[0]) not in similarclusters:
                        similarclusters.append(newclust)
        if not exclusive:
            return similarclusters
        return [((cluster[0][0], cluster[1][0]), cluster[0][1] & cluster[1][1]) for cluster in similarclusters]

    def most_similar_to(self, search, limit=10):
        """Rank models by how prominently the search words occur in them.

        Each model scores the sum, over the search words it contains, of
        log(1 + count / numwords), where numwords is len(model.wordlist()).

        Args:
            search: A single word (str) or an iterable of words.
            limit: Maximum number of results.

        Returns:
            List of (score, model_key, per_word_scores) sorted by descending
            score, truncated to ``limit`` entries.
        """
        models = self.models
        if isinstance(search, str):
            search = [search]
        similars = {}
        for model in models:
            similars[model] = Counter()
            numwords = float(len(models[model].wordlist()))
            for word in search:
                if word in models[model].words:
                    similars[model].update({word: math.log(1+models[model].words[word].count/numwords)})
        sortedsums = sorted([(sum(similars[w].values()), w, similars[w].items()) for w in similars], reverse=True)[:limit]
        return sortedsums

    def common_clusters(self, models, dist_away=2, orderby='importance', limit=20, exclusive=True):
        """Find words whose clusters overlap between all of the given models.

        Args:
            models: Sequence of (title, model) pairs.
            dist_away, orderby, limit, exclusive: Passed to search_clusters().

        Returns:
            Dictionary mapping each word present in every model to the list
            returned by search_clusters(), keeping only words with at least
            n choose 2 entries. Empty when ``models`` is empty.
        """
        models = [m[1] for m in models]
        if not models:
            # set.intersection() cannot be called with no sets.
            return {}
        n = len(models)
        wordsets = [set(m.words.keys()) for m in models]
        commonwords = set.intersection(*wordsets)
        commonclusters = {}
        for word in commonwords:
            # Query through this instance rather than a notebook-level global.
            found = self.search_clusters(word, models, dist_away, orderby, limit, exclusive)
            if len(found) > 0:
                if word not in commonclusters:
                    commonclusters[word] = []
                commonclusters[word].extend(found)

        # If there is n choose 2 entries, then all of them have commonalities with each other
        commonclusters = {word: commonclusters[word] for word in commonclusters if len(commonclusters[word]) >= n * (n-1) / 2}
        return commonclusters

    def common_cluster_words(self, models, dist_away=2, orderby='importance', limit=20, exclusive=True, join='union'):
        """Collapse common_clusters() into one word set per common word.

        Args:
            models: Sequence of (title, model) pairs.
            join: 'union' or 'intersection' of the sets of all entries found
                for a word; any other value keeps the first entry's set.
            dist_away, orderby, limit, exclusive: Passed to common_clusters().

        Returns:
            Dictionary mapping word -> set of words.

        NOTE(review): with exclusive=False each entry's second element is a
        (name, cluster) tuple rather than a set, so the set operations below
        would not apply to it.
        """
        commonclusters = self.common_clusters(models, dist_away, orderby, limit, exclusive)
        wordsets = {}
        for word in commonclusters:
            wordsets[word] = commonclusters[word][0][1]
            for l in commonclusters[word]:
                if join == 'union':
                    wordsets[word] = wordsets[word] | l[1]
                if join == 'intersection':
                    wordsets[word] = wordsets[word] & l[1]
        return wordsets

## Continuing On: Filtering

The results I'm getting now are definitely interesting and hopefully usable. However, the problem remains that many of the words are bland and pointless.

### Tried

* Removing all words not in Warriner: Way too many good words get removed.
* Removing all words found in Warriner with arousal < 4: Good results! A few decent words cut, but many, many awful words as well. Also a tunable param.

In [49]:
# Two knowledge bases over at most five files each: kbw passes warriner=True
# to its models, kb passes warriner=False.
kbw = KnowledgeBase(maxmodels=5, warriner=True)
kb = KnowledgeBase(maxmodels=5, warriner=False)

In [50]:
# Second element of every most_important() entry for the same title, taken
# from the warriner=True base (s1) and the warriner=False base (s2).
# NOTE(review): most_important() is defined outside the visible notebook;
# presumably m[1] is the word - confirm.
s1 = set([m[1] for m in kbw.models['The Return of Sherlock Holmes'].most_important()])
s2 = set([m[1] for m in kb.models['The Return of Sherlock Holmes'].most_important()])

## One Layer Deeper

Looking into how to go "one layer deeper"

* Find the common cluster words between two models
* For each word, examine its sumNear(N) and build a new set
* Return the intersection of those sets

This allows you to find words which, while not *directly* related, have a bit of a triangular dependency.

In [23]:
# Show the first two (title, model) pairs of kb, then compute the common
# cluster words between those two models.
print kb.models.items()[0], kb.models.items()[1]
kb.common_cluster_words([kb.models.items()[0], kb.models.items()[1]])

(u'The Life and Adventures of Robinson Crusoe', <__main__.WordTrackModel instance at 0x7f24ba663ef0>) (u'The Iliad of Homer', <__main__.WordTrackModel instance at 0x7f24b8cfbb48>)


{'ago': {'long', 'may', 'one'},
 'aloud': {'god', 'thou', 'ye'},
 'ask': {'god', 'hast', 'thou'},
 'east': {'shore', 'south', 'west'},
 'float': {'high', 'one', 'shore'},
 'west': {'east', 'north', 'sea'}}

In [48]:
# For the first model in kb: take the 30 most common entries of sumNear(3)
# for 'sea' (s1) and for 'shore' (s2), then show what the two sets share.
s1 = set([m[0] for m in kb.models.items()[0][1].words['sea'].sumNear(3).most_common()[:30]])
s2 = set([m[0] for m in kb.models.items()[0][1].words['shore'].sumNear(3).most_common()[:30]])
s1&s2

{'boat',
 'came',
 'could',
 'first',
 'go',
 'great',
 'island',
 'land',
 'little',
 'made',
 'might',
 'ship',
 'shore',
 'towards',
 'two',
 'upon',
 'way',
 'went',
 'would'}

## Thoughts moving forward

Must clear out poor words. They're really bad at the moment. Use Warriner to remove any low-ranked words it contains, and potentially use POS tagging to remove words carrying certain tags.

In [62]:
# Sanity check: slicing up to len(f) returns the whole list.
f = [1,2,3,4,5]
f[:len(f)]

[1, 2, 3, 4, 5]

In [7]:
# Sanity check: a Counter converts to a plain dict of element -> count.
dict(Counter([1,2,3,3,2,9]))

{1: 1, 2: 2, 3: 2, 9: 1}

In [None]:
# NOTE(review): this repeats the lookup-table loading done in an earlier cell
# of this notebook. Each file is opened in a ``with`` block so its handle is
# closed as soon as the data is loaded.
with open('data/nltkstopwords.json', 'r') as infile:
    stops = set(json.load(infile))
with open('data/english.json', 'r') as infile:
    english = set(json.load(infile))
with open('models/warriner.csv', mode='r') as infile:
    reader = csv.reader(infile)
    next(reader)  # skip the header row
    # Keyed by column 1; columns 2, 5 and 8 are read as the valence, arousal
    # and dominance values respectively.
    warriner = {
        rows[1]: {
            'valence': float(rows[2]),
            'arousal': float(rows[5]),
            'dominance': float(rows[8]),
        }
        for rows in reader
    }
    
  
    def loadfile(self, f):
        """Return the zlib-compressed, cleaned body text of a Gutenberg file.

        Only the lines between the first line containing a START marker
        ('*** START' or '***START') and the first following line containing
        an END marker ('*** END' or '***END') are used. Each line is reduced
        to ASCII, every non-letter character becomes a space, and the result
        is lower-cased. Words rejected by self.suitableWord() are dropped.
        The set of surviving words is added to self.df.

        Args:
            f: Path of the file to read.

        Returns:
            The surviving words joined by single spaces, zlib-compressed.
        """
        kept = []
        started = False
        # Stream the file so reading stops as soon as the END marker is reached.
        with open(f, mode='r') as infile:
            for line in infile:
                if not started:
                    if '*** START' in line or '***START' in line:
                        started = True
                    continue
                if '*** END' in line or '***END' in line:
                    break
                line = unicode(line.strip('\n'), "ascii", errors="ignore")
                line = re.sub("[^a-zA-Z]", " ", line).lower()
                kept.extend(w for w in line.split() if self.suitableWord(w))
        text = " ".join(kept)
        self.df.update(set(kept))
        return zlib.compress(text)
    
    def suitableWord(self, word):
        """Decide whether a word should be kept in the cleaned text.

        A word is rejected when it is in self.stops or missing from
        self.english. When self.filterwarriner is set, a word listed in
        WordTrackModel.warriner with an arousal rating below 4 is rejected
        as well.

        Returns:
            True if the word should be kept, False otherwise.
        """
        if word in self.stops or word not in self.english:
            return False
        if not self.filterwarriner:
            return True
        ratings = WordTrackModel.warriner
        return not (word in ratings and ratings[word]['arousal'] < 4)