# notebook概述：
- 第一部分：大致了解数据内容
- 第二部分: 比较几种工具的中文分词效果
- 第三部分：分词的分布统计探索

## 第一部分：

In [3]:
import pdb
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import warnings
from datetime import datetime
import matplotlib.pylab as plt

PATH = 'F:/codeGit/dataset/weibo/'

FILE_TRAIN = 'weibo_train_data.txt'
FILE_PREDICT = 'weibo_predict_data.txt'

In [4]:
def readTxtToDataframe(path_dataset, filename):
    const = open(path_dataset + filename).readlines()
    items = []
    
    for line in const:
        item = []
        elems = line.replace('\n', '').split('\t')
        elems[3] = int(elems[3])
        elems[4] = int(elems[3])
        elems[5] = int(elems[3])
        elems[6] = ''.join(elems[6: ])
        items.append(elems[:7])
        # pdb.set_trace()
    cols_name = ['uid', 'mid', 'time', 'forward_cout', 'comment_count', 'like_count', 'content']
    return DataFrame(items, columns = cols_name)

In [5]:
df = readTxtToDataframe(PATH, FILE_TRAIN)
print df[:5]

                                uid                               mid  \
0  d38e9bed5d98110dc2489d0d1cac3c2a  7d45833d9865727a88b960b0603c19f6   
1  fa13974743d3fe6ff40d21b872325e9e  8169f1d45051e08ef213bf1106b1225d   
2  da534fe87e7a52777bee5c30573ed5fd  68cd0258c31c2c525f94febea2d9523b   
3  e06a22b7e065e559a1f0bf7841a85c51  00b9f86b4915aedb7db943c54fd19d59   
4  f9828598f9664d4e347ef2048ce17734  c7f6f66044c0c5a3330e2c5371be6824   

                  time  forward_cout  comment_count  like_count  \
0  2015-02-23 17:41:29             0              0           0   
1  2015-02-14 12:49:58             0              0           0   
2  2015-03-31 13:58:06             0              0           0   
3  2015-06-11 20:39:57             0              0           0   
4  2015-03-10 18:02:38             0              0           0   

                                             content  
0  丽江旅游(sz002033)#股票##炒股##财经##理财##投资#推荐包赢股，盈利对半分成...  
1  #丁辰灵的红包#挣钱是一种能力，抢红包拼的是技术。我抢到了丁辰灵 和@阚洪岩 一起发出的现

## 第二部分：

In [7]:
import jieba as jb

In [5]:
help(jb)

Help on package jieba:

NAME
    jieba

FILE
    d:\anaconda2\lib\site-packages\jieba\__init__.py

PACKAGE CONTENTS
    __main__
    _compat
    analyse (package)
    finalseg (package)
    posseg (package)

CLASSES
    __builtin__.object
        Tokenizer
    
    class Tokenizer(__builtin__.object)
     |  Methods defined here:
     |  
     |  __init__(self, dictionary=None)
     |  
     |  __repr__(self)
     |  
     |  add_word(self, word, freq=None, tag=None)
     |      Add a word to dictionary.
     |      
     |      freq and tag can be omitted, freq defaults to be a calculated value
     |      that ensures the word can be cut out.
     |  
     |  calc(self, sentence, DAG, route)
     |  
     |  check_initialized(self)
     |  
     |  cut(self, sentence, cut_all=False, HMM=True)
     |      The main function that segments an entire sentence that contains
     |      Chinese characters into seperated words.
     |      
     |      Parameter:
     |          - sentence: 

In [25]:
seg_list1 = jb.cut(df.iloc[2]['content'], cut_all = False)
seg_list2 = jb.cut_for_search(df.iloc[2]['content'])
print "/".join(seg_list1)
print "/".join(seg_list2)

淘宝网/这些/傻/逼/。/。/。/气/的/劳资/有火/没/地儿/发/~/尼玛/，/你们/都/瞎/了
淘宝/宝网/淘宝网/这些/傻/逼/。/。/。/气/的/劳资/有火/没/地儿/发/~/尼玛/，/你们/都/瞎/了


In [46]:
import time

### 比较文本方法的速度：

In [None]:
words = []
t1 = time.time()
for i in range(100000):
    seg_list1 = jb.cut(df.iloc[i]['content'], cut_all = False)
    words.extend("/".join(seg_list1).split('/'))
print time.time() - t1

strs = ''
t1 = time.time()
for i in range(100000):
    strs = strs + df.iloc[i]['content'] + ' '
    seg_list = jb.cut(strs, cut_all = False)
    words = "/".join(seg_list1).split('/')
print time.time() - t1

第一种方法跑了40s左右，第二种方法跑了5分钟都没出结果。故采用第一种方法

# 第三部分：

In [61]:
nSample = 10000
words = []
blog_idxs = []
for i in range(nSample):
    seg_list1 = jb.cut(df.iloc[i]['content'], cut_all = False)
    words_line = "/".join(seg_list1).split('/')
    blog_idxs.extend([df.index[i] for j in range(len(words_line))])
    words.extend(words_line)

df_words = DataFrame(words, columns = ['word'])
df_words['blog_idx'] = Series(blog_idxs, index = df_words.index)

df_cnt = x.groupby(['word']).count().reset_index()
df_cnt.columns = ['word', 'cnt']
print df_cnt[:5]

  word  cnt
0       386
1       195
2    "    2
3    #   55
4   ##    4


    从上面的预览可以发现很多无意义的符号统计是最多的，而且这些符号有些并没有被切分出来，故需要在切割的时候为字典指定这些特殊符号。其次含有很多这些无意义的符号博文，大多数情况为一些诸如广告、推广文等没啥干货的垃圾博文，这点可以注意到。  
**故可以利用这些无意义的符号出现次数来筛选掉那些无人关注的垃圾博文**

In [63]:
cnt_up = int(df_cnt['cnt'].max() * 30 / 100)
cnt_down = int(df_cnt['cnt'].max() * 1 / 100)
print df_cnt[(df_cnt.cnt < cnt_up) & (df_cnt.cnt > cnt_down)]

        word  cnt
3          #   55
4         ##    4
7          (   12
8          )   12
11         ,   12
12         -    9
13         .   78
14       ...    6
43         3    6
47         4    5
55         6    5
60         :   68
61         ;    5
64         @   37
187        [   22
188        ]   22
189        _    5
197       cn   63
210     http   64
212   iPhone    4
236        t   63
246        ~    9
276        —    4
277        “   14
278        ”   12
279        …    7
280        ╭    4
281        ╮    4
283        、    9
284        。   67
...      ...  ...
1297      矢量    4
1299      知道    5
1327       等    4
1347      红包   14
1367      网易    5
1373      羊年    4
1375      美元    5
1390      而且    4
1391      而是    5
1401       能   10
1402      能力    5
1405       自    6
1407      自己    6
1417      苹果    4
1444      觉得    4
1468      试试    4
1472       说    4
1473       请    4
1515       还    4
1518       这    7
1521      这些    5
1544       都   22
1583      音乐    7
1617      

In [71]:
jb.load_userdict(PATH + 'rubbish.txt')

In [72]:
nSample = 10000
words = []
blog_idxs = []
for i in range(nSample):
    seg_list1 = jb.cut(df.iloc[i]['content'], cut_all = False)
    words_line = "/".join(seg_list1).split('/')
    blog_idxs.extend([df.index[i] for j in range(len(words_line))])
    words.extend(words_line)

df_words = DataFrame(words, columns = ['word'])
df_words['blog_idx'] = Series(blog_idxs, index = df_words.index)

df_cnt = x.groupby(['word']).count().reset_index()
df_cnt.columns = ['word', 'cnt']
print df_cnt[:5]

  word  cnt
0       386
1       195
2    "    2
3    #   55
4   ##    4
