## 查看数据

In [1]:

# Shell out to report the corpus line count and print its first 10 lines.
!echo '数据集行数:'
!wc -l 'data/bioCorpus_5000.txt'
!echo '======'
!echo '数据集前10行'
!head -10 'data/bioCorpus_5000.txt'


数据集行数:
5000 data/bioCorpus_5000.txt
数据集前10行
formate assay in body fluids  application in methanol poisoning.
delineation of the intimate details of the backbone conformation of pyridinenucleotide coenzymes in aqueous solution.
metal substitutions incarbonic anhydrase  a halide ion probe study.
effect of chloroquine on cultured fibroblasts  release of lysosomal hydrolasesand inhibition of their uptake.
atomic models for the polypeptide backbones of myohemerythrin and hemerythrin.
studies of oxygen binding energy to hemoglobin molecule.
maturation of the adrenal medulla--IV
effects of morphine.
comparison between procaine and isocarboxazid metabolism in vitro by a livermicrosomal amidase-esterase.
radiochemical assay of glutathione S-epoxide transferase and its enhancement byphenobarbital in rat liver in vivo.


## 训练模型

In [2]:
from gensim.models import word2vec 

In [3]:
# Read the sentences of a file lazily through a generator.
# Suitable for large files because the corpus is never fully loaded into memory.
class MySentences(object):
    """Re-iterable stream of whitespace-tokenized lines from a text file.

    Every call to ``__iter__`` reopens the file, so the same object can be
    iterated more than once.

    Args:
        fname: path of the text file; one sentence per line.
        encoding: optional text encoding passed to ``open``. ``None`` (the
            default) keeps the platform default.
    """

    def __init__(self, fname, encoding=None):
        self.fname = fname
        self.encoding = encoding

    def __iter__(self):
        # The context manager closes the file handle even when the consumer
        # stops iterating early or an exception is raised mid-stream.
        with open(self.fname, 'r', encoding=self.encoding) as corpus:
            for line in corpus:
                # str.split() with no argument splits on any whitespace run and
                # yields [] for blank lines.
                yield line.split()

In [4]:
# Model training function
def w2vTrain(f_input, model_output):
    """Train a Word2Vec model on a corpus file and save it to disk.

    Reads these module-level settings at call time: DataDir, ModelDir,
    MIN_COUNT, CPU_NUM, VEC_SIZE and CONTEXT_WINDOW.

    Args:
        f_input: corpus file name; appended to DataDir to form the input path.
        model_output: model file name; appended to ModelDir to form the
            output path.
    """
    import os  # local import so this cell does not depend on another import cell

    # Create the output directory up front so a missing folder fails (or is
    # fixed) before training instead of at save time.
    model_path = ModelDir+model_output
    out_dir = os.path.dirname(model_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    sentences = MySentences(DataDir+f_input)
    # NOTE: `size` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size`.
    w2v_model = word2vec.Word2Vec(sentences,
                                  min_count = MIN_COUNT,
                                  workers = CPU_NUM,
                                  size = VEC_SIZE,
                                  window = CONTEXT_WINDOW
                                 )

    w2v_model.save(model_path)

### 参数说明：

min_count: 对于词频 < min_count 的单词，将舍弃（其实最合适的方法是用 UNK 符号代替，即所谓的『未登录词』，这里我们简化起见，认为此类低频词不重要，直接抛弃）

workers: 可以并行执行的核心数，需要安装 Cython 才能起作用（安装 Cython 的方法很简单，直接 pip install cython）

size: 词向量的维度，即参考资料[3.]所提到的神经网络隐层节点数

window: 目标词汇的上下文单词距目标词的最长距离，很好理解，比如 CBOW 模型是用一个词的上下文预测这个词，那这个上下文总得有个限制，如果取得太多，距离目标词太远，有些词就没啥意义了，而如果取得太少，又信息不足，所以 window 就是上下文的一个最长距离

In [5]:
# 训练
DataDir = "./data/"
ModelDir = "./ipynb_garbage_files/"
MIN_COUNT = 4
CPU_NUM = 2 
# 需要预先安装 Cython 以支持并行
VEC_SIZE = 20
CONTEXT_WINDOW = 5 # 提取目标词上下文距离最长5个词
f_input = "bioCorpus_5000.txt"
model_output = "test_w2v_model"

w2vTrain(f_input, model_output)

## 模型评估

In [6]:
# Reload the trained model from the path it was saved to (ModelDir + model_output).
w2v_model = word2vec.Word2Vec.load(ModelDir+model_output)

In [7]:
# Query through the KeyedVectors (`.wv`): the model-level `most_similar` is
# deprecated in gensim 3.x and removed in gensim 4.
w2v_model.wv.most_similar('body')

  """Entry point for launching an IPython kernel.


[('adrenergic', 0.9992793798446655),
 ('the', 0.9990754127502441),
 ('its', 0.9990279674530029),
 ('a', 0.9990085363388062),
 ('mice', 0.9990078210830688),
 ('studies', 0.9990071058273315),
 ('plasma', 0.9989837408065796),
 ('human', 0.9989758729934692),
 ('metabolic', 0.9989551305770874),
 ('purification', 0.9989367723464966)]

In [8]:
# Query through the KeyedVectors (`.wv`): the model-level `most_similar` is
# deprecated in gensim 3.x and removed in gensim 4.
w2v_model.wv.most_similar('heart')


  """Entry point for launching an IPython kernel.


[('respiratory', 0.9997044205665588),
 ('in', 0.999563992023468),
 ('liver', 0.9995449781417847),
 ('beta', 0.9995213747024536),
 ('effect', 0.99949049949646),
 ('by', 0.9994483590126038),
 ('from', 0.9994480013847351),
 ('cells.', 0.9994244575500488),
 ('synthesis', 0.9994019269943237),
 ('to', 0.9993903636932373)]

## 模型调优

1. 调节参数，比如 VEC_SIZE 设置得太小，导致这 20 个维度不足以捕捉（capture）单词间不同的信息，所以我们需要继续调整超参数

2. 数据集较小，因此停止词占据了太多信息量

下面去除停止词，再来训练一次

In [9]:
# Stop words: NLTK's English stop-word list.
# NOTE(review): this presumably needs the NLTK "stopwords" corpus to be downloaded
# beforehand (nltk.download('stopwords')) — confirm on a fresh environment.
from nltk.corpus import stopwords
StopWords = stopwords.words('english')

In [10]:
# Peek at the first 20 stop words.
StopWords[:20]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [11]:
# Retraining: model training function with stop-word removal
def w2vTrain_removeStopWords(f_input, model_output):
    """Train a Word2Vec model on a corpus with stop words removed and save it.

    Reads these module-level settings at call time: DataDir, ModelDir,
    StopWords, MIN_COUNT, CPU_NUM, VEC_SIZE and CONTEXT_WINDOW.

    Unlike streaming training, the filtered corpus is materialized as a list
    in memory before training.

    Args:
        f_input: corpus file name; appended to DataDir to form the input path.
        model_output: model file name; appended to ModelDir to form the
            output path.
    """
    import os  # local import so this cell does not depend on another import cell

    # Create the output directory up front so a missing folder does not
    # surface only after training has finished.
    model_path = ModelDir+model_output
    out_dir = os.path.dirname(model_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # A set gives O(1) membership tests instead of scanning the list per token.
    # Matching is exact: tokens are not lower-cased or stripped of punctuation.
    stop_words = set(StopWords)
    sentences = [[w for w in sentence if w not in stop_words]
                 for sentence in MySentences(DataDir+f_input)]

    # `window` was previously omitted here, so CONTEXT_WINDOW was silently
    # ignored in favour of gensim's default; pass it to match the training
    # settings used by the non-filtered training function.
    # NOTE: `size` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size`.
    w2v_model = word2vec.Word2Vec(sentences,
                                  min_count = MIN_COUNT,
                                  workers = CPU_NUM,
                                  size = VEC_SIZE,
                                  window = CONTEXT_WINDOW)

    w2v_model.save(model_path)

# Retrain on the stop-word-filtered corpus. This saves to the same
# ModelDir + model_output path, so any model already stored there is overwritten.
w2vTrain_removeStopWords(f_input, model_output)

# Reload the freshly saved model for evaluation.
w2v_model = word2vec.Word2Vec.load(ModelDir+model_output)

In [12]:
# Query through the KeyedVectors (`.wv`): the model-level `most_similar` is
# deprecated in gensim 3.x and removed in gensim 4.
w2v_model.wv.most_similar('body')

# Results are mediocre
# [('relationship', 0.9543654918670654),
#   ('plasma', 0.9490970373153687),

#  ('two', 0.9482829570770264),

#  ('blood', 0.9451138973236084),

#  ('structure', 0.9415417909622192),

#  ('properties', 0.9410394430160522),

#  ('human', 0.9409817457199097),

#  ('cardiac', 0.9402023553848267),

#  ('effect', 0.9401187896728516),

#  ('response', 0.9397702217102051)]

  """Entry point for launching an IPython kernel.


[('adrenergic', 0.9358120560646057),
 ('affinity', 0.9273056983947754),
 ('mice', 0.9262033700942993),
 ('human', 0.9209858179092407),
 ('blood', 0.9197221994400024),
 ('sodium', 0.9197088479995728),
 ('cardiac', 0.9190305471420288),
 ('studies', 0.917290210723877),
 ('plasma', 0.9153444170951843),
 ('isolated', 0.9125454425811768)]

In [13]:
# Query through the KeyedVectors (`.wv`): the model-level `most_similar` is
# deprecated in gensim 3.x and removed in gensim 4.
w2v_model.wv.most_similar('heart')

  """Entry point for launching an IPython kernel.


[('respiratory', 0.9833990931510925),
 ('use', 0.967116117477417),
 ('effect', 0.9670897722244263),
 ('liver', 0.9662567377090454),
 ('serum', 0.965541422367096),
 ('beta', 0.9634912014007568),
 ('inhibition', 0.9626436233520508),
 ('studies', 0.9618862271308899),
 ('oxygen', 0.9611330032348633),
 ('action', 0.9603637456893921)]

In [16]:
# Inspect the learned embedding matrix stored on the model's word vectors.
w2v_model.wv.vectors

array([[-0.04432453,  0.14793128, -0.2648956 , ...,  0.20970087,
         0.29673585, -0.02161907],
       [-0.00896221,  0.0889053 , -0.1537925 , ...,  0.12604086,
         0.16731   , -0.01607072],
       [-0.00063231,  0.07445265, -0.1819088 , ...,  0.15033329,
         0.15327007, -0.00885514],
       ...,
       [-0.01280354,  0.00303587, -0.0143146 , ...,  0.01704073,
         0.03326512, -0.01437898],
       [ 0.00781064,  0.01315594,  0.00315767, ...,  0.00974985,
         0.01723483, -0.00123381],
       [ 0.01814329, -0.01293619,  0.00404376, ...,  0.03012145,
        -0.00267913, -0.01139108]], dtype=float32)

In [19]:
# NOTE(review): appears to be a scratch check that tqdm progress bars render;
# the loop body only assigns a constant and nothing here feeds the analysis above.
from tqdm import tqdm
for i in tqdm(range(10000)):
     a=1

100%|██████████| 10000/10000 [00:00<00:00, 2195740.76it/s]
