In [6]:
import redis
import time
import datetime
import bisect
import os
import json
import re
import uuid

In [2]:
conn = redis.Redis(decode_responses=True)

In [4]:
STOP_WORDS = set('''able about across after all almost also am among
an and any are as at be because been but by can cannot could dear did
do does either else ever every for from get got had has have he her
hers him his how however if in into is it its just least let like
likely may me might most must my neither no nor not of off often on
only or other our own rather said say says she should since so some
than that the their them then there these they this tis to too twas us
wants was we were what when where which while who whom why will with
would yet you your'''.split()) 
WORDS_RE = re.compile("[a-z']{2,}")

In [5]:
def tokenize(content):
    words = set()
    for match in WORDS_RE.finditer(content.lower()):
        word = match.group().strip("'")
        if len(word) > 2:
            words.add(word)
    return words - STOP_WORDS

def index_document(conn, docid, content):
    words = tokenize(content)
    pipe = conn.pipeline(True)
    for word in words:
        pipe.sadd('idx:'+word, docid)
    return len(pipe.execute())

In [7]:
def _set_common(conn, method, names, ttl=30, execute=True):
    id = str(uuid.uuid4())
    pipe = conn.pipeline(True) if execute else conn
    names = ['idx:'+name for name in names]
    getattr(pipe, method)('idx:'+id, *names)
    pipe.expire('idx:'+id, ttl)
    if execute:
        pipe.execute()
    return id

# 交集
def intersect(conn, items, ttl=30, _execute=True):
    return _set_common(conn, 'sinterstore', items, ttl, _execute)

# 并集
def union(conn, items, ttl=30, _execute=True):
    return _set_common(conn, 'sunionstore', items, ttl, _execute)

# 差集
def difference(conn, item, ttl=30, _execute=True):
    return _set_common(conn, 'sdiffstore', items, ttl, _execute)

In [11]:
QUERY_RE = re.compile("[+-]?[a-z']{2,}")

def parse(query):
    unwatched = set()
    all = []
    current = set()
    for match in QUERY_RE.finditer(query.lower()):
        word = match.group()
        prefix = word[:1]
        if prefix in '+-':
            word = word[1:]
        else:
            prefix = None
        word = word.strip("'")
        if len(word) < 2 or word in STOP_WORDS:
            continue
        if prefix == '-':
            unwatched.add(word)
            continue
        if current and not prefix:
            all.append(list(current))
            current = set()
        current.add(word)
    if current:
        all.append(list(current))
    return all, list(unwatched)

In [12]:
parse('''
conect +connection +disconnect +disconnect
chat
-proxy -proxies
''')

([['conect', 'disconnect', 'connection'], ['chat']], ['proxies', 'proxy'])

In [13]:
def parse_and_search(conn, query, ttl=30):
    all, unwatched = parse(query)
    if not all:
        return None
    to_intersect = []
    for syn in all:
        if len(syn) > 1:
            to_intersect.append(union(conn, syn, ttl=ttl))
        else:
            to_intersect.append(syn[0])
        
        if len(to_intersect) > 1:
            intersect_result = intersect(conn, to_intersect, ttl=ttl)
        else:
            intersect_result = to_intersect[0]
        
        if unwatched:
            unwatched.insert(0, intersect_result)
            return difference(conn, unwatched, ttl=ttl)
        return intersect_result

In [44]:
index_document(conn, 'doc1', '''#B Set up a transactional pipeline so that we have consistent results for each individual call
#C Add the 'idx:' prefix to our terms
#D Set up the call for one of the operations
#E Instruct Redis to expire the SET in the future
#F Actually execute the operation
#G Return the id for the caller to process the results
#H Helper function to perform SET intersections
#I Helper function to perform SET unions
#J Helper function to perform SET differences''')

30

In [45]:
index_document(conn, 'doc2', '''#A Our regular expression for finding wanted, unwanted, and synonym words
#B A unique set of unwanted words Return
#C Our final result of words that we are looking to intersect
#D The current unique set of words to consider as synonyms
#E Iterate over all words in the search query
#F Discover +/- prefixes, if any
#G Strip any leading or trailing single quotes, and skip anything that is a stop word
#H If the word is unwanted, add it to the unwanted set
#I Set up a new synonym set if we have no synonym prefix and we already have words
#J Add the current word to the current set
#K Add any remaining words to the final intersection
#END''')

39

In [21]:
parse_and_search(conn,'current +unique')

'4368b4e8-45e6-4b0d-9227-62aa0a097170'

In [23]:
conn.smembers('idx:4368b4e8-45e6-4b0d-9227-62aa0a097170')

{'doc2'}

In [24]:
def search_and_sort(conn, query, id=None, ttl=300, sort='-updated',
                   start=0, num=20):
    desc = sort.startswith('-')
    sort = sort.lstrip('-')
    by = 'kb:doc:*->' + sort
    alpha = sort not in ('updated', 'id', 'created')
    if id and not conn.expire(id, ttl):
        id = None
    if not id:
        id = parse_and_search(conn, query, ttl=ttl)
        
    pipe = conn.pipeline(True)
    pipe.scard('idx:'+id)
    pipe.sort('idx:'+id, by=by, alpha=alpha, desc=desc, 
              start=start, num=num)
    results = pipe.execute()
    return results[0], results[1], id

In [25]:
search_and_sort(conn,'current +unique')

(1, ['doc2'], 'e01e4a3c-d63e-438b-9903-54d4488977ec')

# 有序索引

In [48]:
def _zset_common(conn, method, scores, ttl=30, **kw):
    id = str(uuid.uuid4())
    execute = kw.pop('_execute', True)
    pipe = conn.pipeline(True) if execute else conn
    for key in scores.keys():
        scores['idx:'+key] = scores.pop(key)
    print(scores)
    getattr(pipe, method)('idx:'+id, scores, **kw)
    pipe.expire('idx:'+id, ttl)
    if execute:
        pipe.execute()
    return id

def zintersect(conn, items, ttl=30, **kw):
    return _zset_common(conn, 'zinterstore', dict(items), ttl, **kw)

def zunion(conn, items, ttl=30, **kw):
    return _zset_common(conn, 'zunionstore', dict(items), ttl, **kw)

In [49]:
def search_and_zsort(conn, query, id=None, ttl=300, update=1,
                    vote=0, start=0, num=20, desc=True):
    if id and not conn.expire(id, ttl):
        id = None
    if not id:
        id = parse_and_search(conn, query, ttl=ttl)
        scored_search = {
            id: 0,
            'sort:update': update,
            'sort:votes': vote
        }
        id = zintersect(conn, scored_search, ttl)
    
    pipe = conn.pipeline(True)
    pipe.zcard('idx:'+id)
    if desc:
        pipe.zrevrange('idx:'+id, start, start+num-1)
    else:
        pipe.zrange('idx:'+id, start, start+num-1)
    results = pipe.execute()
    return results[0], results[1], id

In [59]:
conn.zadd('idx:sort:update', {'doc1': 1, 'doc2': 10})
conn.zadd('idx:sort:votes', {'doc1': 10, 'doc2': 1})
search_and_zsort(conn,'set +return')

(2, ['doc2', 'doc1'], '469ea137-5588-4749-8c5a-07708ba4f720')

In [52]:
conn.zinterstore('test', dict({'idx:c83c4224-fcdc-457f-b08c-199320752e8f': 0, 'idx:sort:update': 1, 'idx:sort:votes': 0}))

0

In [54]:
conn.zrange('test',0,-1)

[]

In [None]:
conn.zinterstore('test',{'id':1,'id'})