# Purpose
Now we can index the weibo. We use Whoosh to do this. The documentation can be found here: https://whoosh.readthedocs.io/en/latest/quickstart.html

In [2]:
import os, os.path
from whoosh import index
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.analysis import *
from whoosh import scoring
import csv
import pandas as pd
import numpy as np

# A quick example

In [9]:
from whoosh.index import create_in
from whoosh.fields import *
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)  
if not os.path.exists("indexdir4"):
    os.mkdir("indexdir4") 
ix = index.create_in("indexdir4", schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=tags[1],
                     content=u"This is the first document we've added!")
writer.add_document(title=u"Second document", path=u"/b",
                     content=u"The second one is even more interesting!")
writer.commit()

In [10]:
from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query, limit = None)
    print results[0]

<Hit {'path': u'nan', 'title': u'First document'}>


# Tutorial

## Design a schema
The schema lists the fields in the index. A field is a piece of information for each document in the index, such as its title or text content (you decide it!). Whoosh comes with some very useful predefined field types, and you can easily create your own.

## Create an index or open an existing index

## Add documents
OK, so we’ve got an Index object, now we can start adding documents. The writer() method of the Index object returns an IndexWriter object that lets you add documents to the index. The IndexWriter’s add_document(**kwargs) method accepts keyword arguments where the field name is mapped to a value.

Two important notes:

You don’t have to fill in a value for every field. Whoosh doesn’t care if you leave out a field from a document.
Indexed text fields must be passed a unicode value. Fields that are stored but not indexed (STORED field type) can be passed any pickle-able object.

# Start from here

# Prepare the documents

In [3]:
#df = pd.read_csv('ready_for_index_short')
df = pd.read_csv('ready_for_index_long')
print len(df)
df.head()

892539


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,time,userid,hashtag,cleaned_text,is_oversea
0,0,0,2012-09-03 00:03:19,uWWBCGJE4,,冷言冷语继续这样很好,0
1,1,1,2012-09-03 00:04:12,u1O52TEBM,,傻子过去就过去了越想只会让自己越来越TM看不起自己恨自己不是吗有什么了不起的人不都是在挫折中...,1
2,2,2,2012-09-03 00:02:10,u0AGHHPXK,,闲来无事想破人QQ空间密码于是点击了一个美女的空间提问请问男生最喜欢什么花我暗香肯定是玫瑰因...,0
3,3,3,2012-09-03 00:07:36,uVGJLC4S5,,北京今天泛起秋意,0
4,4,4,2012-09-03 02:09:32,uATZPTH53,,,0


In [4]:
tags = df['hashtag'].tolist()
tags = [str(tag) for tag in tags]   
tags = [unicode(tag, 'utf-8') for tag in tags]
print tags[125]
print tags[1]

静观NewsDesign 
nan


In [5]:
texts = df['cleaned_text'].tolist()
texts = [str(text) for text in texts]   
texts = [unicode(text, 'utf-8') for text in texts]
print texts[3]

北京今天泛起秋意


In [6]:
times = df['time'].tolist()
times[3]

'2012-09-03 00:07:36'

In [7]:
no = df['Unnamed: 0'].tolist()
no[3]

3

In [8]:
users = df['userid'].tolist()
users = [str(user) for user in users]   
users = [unicode(user, 'utf-8') for user in users]
users[3]

u'uVGJLC4S5'

In [9]:
overseas = df['is_oversea'].tolist()
overseas[1]

1

In [24]:
"""""
my_analyzer = StandardAnalyzer() | NgramFilter(minsize=2, maxsize=4)
[token.text for token in my_analyzer(u"另外一部分原因")]

[u'\u53e6\u5916',
 u'\u53e6\u5916\u4e00',
 u'\u53e6\u5916\u4e00\u90e8',
 u'\u5916\u4e00',
 u'\u5916\u4e00\u90e8',
 u'\u5916\u4e00\u90e8\u5206',
 u'\u4e00\u90e8',
 u'\u4e00\u90e8\u5206',
 u'\u4e00\u90e8\u5206\u539f',
 u'\u90e8\u5206',
 u'\u90e8\u5206\u539f',
 u'\u90e8\u5206\u539f\u56e0',
 u'\u5206\u539f',
 u'\u5206\u539f\u56e0',
 u'\u539f\u56e0']

# Index

In [9]:
# verification
print len(no)
print len(times)
print len(texts)
print len(users)
print len(tags)
print len(overseas)

892539
892539
892539
892539
892539


In [16]:
# create a schema
schema = Schema(no=NUMERIC(stored=True), time=DATETIME(stored=True), userid=ID(stored=True), tag=KEYWORD (scorable=True),
               content=TEXT(analyzer = StandardAnalyzer() | NgramFilter(minsize=2, maxsize=4)), oversea=NUMERIC(stored=True))
# create an index
if not os.path.exists("indexdir_long"):
    os.mkdir("indexdir_long")
ix = index.create_in("indexdir_long", schema)
writer = ix.writer()   # allow for adding documents to the index
for i in range(len(df)):
    writer.add_document(no=no[i], time = times[i], content=texts[i], userid=users[i], tag=tags[i], oversea = overseas[i])
writer.commit()   #save the documents to the index

# Search

In [16]:
# make sure to specify the directary of the index
ix1 = index.open_dir("indexdir_long")
with ix1.searcher(weighting=scoring.TF_IDF()) as searcher:  # the default is BM25F
    query = QueryParser("tag", ix1.schema).parse("万名抹茶粉女郎".decode('utf-8'))
    results = searcher.search(query, limit = None)
    for i in range(len(results)):
        print results[i]

In [13]:
with ix1.searcher(weighting=scoring.TF_IDF()) as searcher:
    query = QueryParser("content", ix1.schema).parse("转发微博师傅".decode('utf-8'))
    results = searcher.search(query, limit = None)
    for i in range(len(results)):
        print results[i]

<Hit {'time': '2012-09-05 23:03:09', 'userid': u'uB5NY1F3J', 'no': 219372}>


In [4]:
with ix1.searcher(weighting=scoring.TF_IDF()) as searcher:
    query = QueryParser("content", ix1.schema).parse("国民教育".decode('utf-8'))
    results = searcher.search(query, limit = None)  # By default the results contains at most the first 10 matching documents
    #print (len(results)
    #print results.scored_length()
    for i in range(len(results)):
        print results[i]

<Hit {'time': '2012-09-08 20:45:34', 'userid': u'uII5MEOLS', 'no': 581361}>
<Hit {'time': '2012-09-05 09:34:45', 'userid': u'uB4GXZG42', 'no': 133836}>
<Hit {'time': '2012-09-08 22:22:26', 'userid': u'uDSLYJ0U', 'no': 837583}>
<Hit {'time': '2012-09-08 12:09:00', 'userid': u'u1CR4ZNSU', 'no': 391356}>
<Hit {'time': '2012-09-08 10:40:54', 'userid': u'uP2ZY3D4N', 'no': 533320}>
<Hit {'time': '2012-09-03 21:23:32', 'userid': u'u3JHQFJYQ', 'no': 54395}>
<Hit {'time': '2012-09-04 20:47:10', 'userid': u'uTS2M5Z2Z', 'no': 132443}>
<Hit {'time': '2012-09-09 01:12:30', 'userid': u'uK3X2L2EL', 'no': 151830}>
<Hit {'time': '2012-09-05 10:32:36', 'userid': u'uRSRDWUTV', 'no': 278154}>
<Hit {'time': '2012-09-06 19:50:41', 'userid': u'uQHC5UADA', 'no': 308809}>
<Hit {'time': '2012-09-05 11:35:51', 'userid': u'uCBTJSO3R', 'no': 328603}>
<Hit {'time': '2012-09-09 16:03:22', 'userid': u'uJWAX20NZ', 'no': 372664}>
<Hit {'time': '2012-09-08 03:50:59', 'userid': u'uLCMYSCUD', 'no': 375533}>
<Hit {'time': 

In [35]:
# read file
data = []
with open('categories.txt') as f:
    for line in f:
        line = line.split('\n')[0]
        data.append(line)

In [36]:
print len(data)
data[1]

343483


'\xe6\x97\xa5\xe6\x9c\xac\xe8\x81\x8c\xe4\xb8\x9a\xe6\x91\x94\xe8\xa7\x92|\xe6\x97\xa5\xe6\x9c\xac\xe8\x81\x8c\xe4\xb8\x9a\xe6\x91\x94\xe8\xa7\x92\xe9\x80\x89\xe6\x89\x8b'

In [37]:
#split each line by '|'
data2 = []
for line in data:
    cat = line.split('|')
    data2.append(cat)

In [38]:
print len(data2)
data2[:2]

343483


[['\xe5\x8c\xbb\xe9\x99\xa2\xe7\xae\xa1\xe7\x90\x86\xe5\xb1\x80',
  '\xe9\xa6\x99\xe6\xb8\xaf\xe5\x85\xac\xe7\xab\x8b\xe5\x8c\xbb\xe9\x99\xa2'],
 ['\xe6\x97\xa5\xe6\x9c\xac\xe8\x81\x8c\xe4\xb8\x9a\xe6\x91\x94\xe8\xa7\x92',
  '\xe6\x97\xa5\xe6\x9c\xac\xe8\x81\x8c\xe4\xb8\x9a\xe6\x91\x94\xe8\xa7\x92\xe9\x80\x89\xe6\x89\x8b']]

In [41]:
categories = []
for line in data2:
    for cat in line:
        categories.append(cat)

In [42]:
print len(categories)
for i in categories[:10]:
    print i

686966
医院管理局
香港公立医院
日本职业摔角
日本职业摔角选手
日本围棋
本因坊
日本围棋
林家 (围棋)
日本围棋
本因坊家


In [43]:
categories = set(categories)
print len(categories)

177796


In [None]:
with ix1.searcher(weighting=scoring.TF_IDF()) as searcher:  # the default is BM25F
    query = QueryParser("tag", ix1.schema).parse("万名抹茶粉女郎".decode('utf-8'))
    results = searcher.search(query, limit = None)
    for i in range(len(results)):
        print results[i]