In [1]:
import os
import time

import jieba  # 中文分词库，百度员工开发
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
# Query the GPUs visible to TensorFlow.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
# Make sure at least one GPU is available; otherwise fail right away.
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# This configuration must run in the very first executed cell of the notebook,
# otherwise the setting can no longer be applied.
# Allocate GPU memory on demand: little is reserved at start-up and more is
# requested dynamically as the program runs.
# Memory growth is enabled on every visible GPU rather than only the first one,
# because TensorFlow requires the setting to be identical across GPUs.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

![文本表示的常用方法](./markdown_pics/文本表示的常用方法.png)

![文本表示方法的优缺点](./markdown_pics/文本表示方法的优缺点.png)

#### 1.词向量简介

![word2vec的简单介绍](./markdown_pics/w2c的简单介绍.png)

#### 2.文本表示方法

#### 3.Word2vec词向量

#### 4.代码演示word2vec

In [2]:
# Every split is read as a headerless, tab-separated file whose two columns
# are named 'label' and 'content'.
tsv_options = dict(sep='\t', header=None, names=['label', 'content'])

train = pd.read_csv('/share/data/tutorial/cnews/train.tsv', **tsv_options)
val = pd.read_csv('/share/data/tutorial/cnews/dev.tsv', **tsv_options)
test = pd.read_csv('/share/data/tutorial/cnews/test.tsv', **tsv_options)

In [3]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,label,content
0,体育,马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 来到沈阳，国奥队依然没有...
1,体育,商瑞华首战复仇心切 中国玫瑰要用美国方式攻克瑞典多曼来了，瑞典来了，商瑞华首战求3分的信心也...
2,体育,冠军球队迎新欢乐派对 黄旭获大奖张军赢下PK赛新浪体育讯12月27日晚，“冠军高尔夫球队迎新...
3,体育,辽足签约危机引注册难关 高层威逼利诱合同笑里藏刀新浪体育讯2月24日，辽足爆发了集体拒签风波...
4,体育,揭秘谢亚龙被带走：总局电话骗局 复制南杨轨迹体坛周报特约记者张锐北京报道 谢亚龙已经被公安...


In [4]:
# Preview the last rows of the training split.
train.tail()

Unnamed: 0,label,content
49995,财经,打好投资的决胜局□国泰基金 阿邰就长期而言，资产配置占投资成功与否的决定因素高达90%以上。...
49996,财经,昔明星基金今年首月负收益 QDII成今年胸闷基金□晨报记者 陈重博2009年以来，所有偏股型...
49997,财经,沪基指全周大涨8.25% 创两个月最大周涨幅全景网2月6日讯 受到A股市场牛年高歌猛进影响，...
49998,财经,沪基指半日涨2.01% 两市封基近乎全线上扬全景网2月6日讯 沪深基金指数周五早盘大幅收高，...
49999,财经,牛年第一月 开基抬头券商集合理财掉队每经记者 于春敏在金融危机的淫威之下，2008年，全球资...


In [5]:
# Print the shapes of the train, val and test frames on a single line.
print(*(frame.shape for frame in (train, val, test)))

(50000, 2) (5000, 2) (10000, 2)


In [6]:
def content_cut(x):
    """Segment a text with jieba and return its tokens joined by single spaces.

    Args:
        x: The text to segment; it is passed to ``jieba.lcut`` unchanged.

    Returns:
        str: The tokens returned by ``jieba.lcut``, separated by ``' '``.
    """
    return ' '.join(jieba.lcut(x))

In [7]:
%%time
train['content'] = train['content'].map(lambda x: content_cut(x))
val['content'] = val['content'].map(lambda x: content_cut(x))
test['content'] = test['content'].map(lambda x: content_cut(x))


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.653 seconds.
Prefix dict has been built successfully.
CPU times: user 4min 17s, sys: 591 ms, total: 4min 17s
Wall time: 4min 17s


In [8]:
# Stack the train, val and test frames row-wise into a single frame.
df = pd.concat([train, val, test], axis = 0)

##### 训练Word2Vec

In [9]:
# Turn each space-joined document back into a list of tokens.
# A bare split() is used instead of split(' '): it collapses runs of
# whitespace, so no empty-string "words" end up in the training corpus when a
# document contains consecutive spaces (e.g. where the raw text itself
# contained a space).
sentences = [document.split() for document in df['content'].values]

In [10]:
%%time
model = Word2Vec(
    sentences= sentences,
    size = 200, # 维度
    alpha= 0.025, # 默认值
    window = 5, # 默认
    min_count=2,
    sample=0.001,
    seed = 2018,
    workers=11, # 线程
    min_alpha= 0.0001,
    sg = 0, # cbow
    hs = 0, # 负采样
    negative=5, # 负采样个数
    ns_exponent= 0.75,
    cbow_mean= 1, # 求和再取平均
    iter = 10, # 10到20
    compute_loss= True
)

In [11]:
# Save the trained model.
# The target directory is created first so that the save does not fail when
# ./word2vec does not exist yet (e.g. on a fresh checkout).
os.makedirs('./word2vec', exist_ok=True)
model.save('./word2vec/word2vec_word_200')

In [12]:
# Load the saved model back from disk.
model = Word2Vec.load('./word2vec/word2vec_word_200')

In [13]:
# Show the model's vocabulary mapping (word -> Vocab entry).
# NOTE(review): this displays the whole mapping; consider showing only its
# size or a small slice to keep the notebook output readable.
model.wv.vocab

at 0x7f202daebf70>,
 '合作意向': <gensim.models.keyedvectors.Vocab at 0x7f202daebfd0>,
 '鼎立': <gensim.models.keyedvectors.Vocab at 0x7f202daf0070>,
 '支持': <gensim.models.keyedvectors.Vocab at 0x7f202daf00d0>,
 '由': <gensim.models.keyedvectors.Vocab at 0x7f202daf0130>,
 '两位': <gensim.models.keyedvectors.Vocab at 0x7f202daf0190>,
 '体操': <gensim.models.keyedvectors.Vocab at 0x7f202daf01f0>,
 '担当': <gensim.models.keyedvectors.Vocab at 0x7f202daf0250>,
 '主持': <gensim.models.keyedvectors.Vocab at 0x7f202daf02b0>,
 '从始至终': <gensim.models.keyedvectors.Vocab at 0x7f202daf0310>,
 '沉浸': <gensim.models.keyedvectors.Vocab at 0x7f202daf0370>,
 '一片片': <gensim.models.keyedvectors.Vocab at 0x7f202daf03d0>,
 '欢声笑语': <gensim.models.keyedvectors.Vocab at 0x7f202daf0430>,
 '之中': <gensim.models.keyedvectors.Vocab at 0x7f202daf0490>,
 '游戏': <gensim.models.keyedvectors.Vocab at 0x7f202daf04f0>,
 '抽奖': <gensim.models.keyedvectors.Vocab at 0x7f202daf0550>,
 '环环相扣': <gensim.models.keyedvectors.Vocab at 0x7f202daf05b

In [14]:
# Check the dimensionality of the vector stored for a single word.
model.wv['马晓旭'].shape

(200,)

In [25]:
# Display the vector stored for this word.
model.wv['马晓旭']

array([-0.06560241,  0.00522172,  0.0441779 ,  0.02515583,  0.0065921 ,
       -0.00730217,  0.03604506,  0.01510331, -0.01108309, -0.03223479,
       -0.01468674,  0.11473027,  0.0091569 , -0.00713132,  0.07934288,
        0.07460576, -0.00985484, -0.00334083, -0.04284011,  0.01923379,
        0.0571684 ,  0.0776251 , -0.03577761,  0.01488583,  0.0044518 ,
       -0.07562221, -0.06494214,  0.00182385,  0.02360422, -0.03593749,
       -0.03949061, -0.02161311, -0.02560075, -0.02082567,  0.00655347,
       -0.02798013, -0.02101228, -0.02174476,  0.0133086 , -0.05627292,
       -0.06726563,  0.09481727,  0.0265637 ,  0.06044094,  0.06348162,
       -0.04999593,  0.04731578, -0.00211182, -0.02821376,  0.00978439,
       -0.00904767, -0.02803068,  0.00072595,  0.03980813, -0.07645328,
        0.00227302,  0.05218007,  0.01640033, -0.03961417, -0.0719063 ,
       -0.00873944, -0.07298563,  0.0542248 , -0.03318928, -0.04048835,
        0.00567577,  0.02233607,  0.00823338, -0.06511728,  0.03

In [15]:
# Look up the 20 words most similar to the query word.
# The lookup goes through model.wv, like the other word lookups in this
# notebook; the model-level most_similar is a deprecated alias in gensim 3.x.
model.wv.most_similar('马晓旭', topn=20)

[('打骂', 0.4971315264701843),
 ('拜纳姆爱', 0.4586630165576935),
 ('狂训', 0.45029789209365845),
 ('挂彩', 0.431384801864624),
 ('国奥队', 0.40510937571525574),
 ('堕马', 0.4019997715950012),
 ('机组', 0.39724889397621155),
 ('女足', 0.3963756561279297),
 ('推挤', 0.3960060179233551),
 ('国家队', 0.3954906463623047),
 ('女篮', 0.3940105438232422),
 ('游览车', 0.3902229964733124),
 ('季前赛', 0.38945093750953674),
 ('三名', 0.38825157284736633),
 ('八强战', 0.3875080347061157),
 ('示威者', 0.3869211971759796),
 ('右腿', 0.38623011112213135),
 ('WCBA', 0.38533639907836914),
 ('助威声', 0.3849644064903259),
 ('足球队', 0.38406991958618164)]

In [21]:
# Compute the similarity between two words.
model.wv.similarity('马晓旭','国奥队')

0.40510938

In [17]:
# Inspect the words in the vocabulary.
# NOTE(review): this displays every key; consider slicing to a handful of
# entries to keep the notebook output readable.
model.wv.vocab.keys()

essorize', 'juicy', 'couture', 'NineWest', '嫩女', 'Szohr', '风令', '熟辣女', '领小露', 'ECMall', 'Benoy', 'Astor', 'SEPHORA', '肴', '萨莉亚', '探网', '必瘦', 'BESTSELLER', '新辣道', '潮江春', '汇由', '丹棱', '街甲', '82483388', 'ecmall', '芭穿', '吉尼芬', '潮尚汇', '我爱宅', '欢爱', '或丝', '太大起', '有紧', '前味', '师所想', '中味', '体香', '后味', '黑眼珠', 'MiiA', '热裤装', '胸大腰', '靓妞', 'Annabelle', 'Neilson', 'cargo', 'Daman', '绞花', 'soso', '穿扮', '软雕塑', '佳影', 'CLUTCH', '锁边', '净色型', 'Dellal', 'PRES', '斗衣品', 'Sansone', 'Viviana', 'Volpicella', '而红袜', '打斜', '贺阳', 'Manel', 'Fabrican', 'Ebejer', '莫斯姐', '胸扮', '形衣', 'Agy', '无胸', '配及', '款日系', '上缘', 'Mulligan', '史丹', '特热', '浮夸风', '天冷些', 'versace', '王翠平', 'Zellweger', '这组街', '酷鞋', 'Donts', '胸距', '以裙', '最热露', '锐增', 'Allard', '刘心', 'Foursquare', '半长款', '宝中', '简或繁', '相展', '例牌', '无袜', 'Yurklevich', '扣长款', '战秀场', '耐冻型', '唇印', '超受', '换真', '驻邮办', '精多', '半透衫', '巧装', '手探', '贾蓓', '仿貂毛', '簇绒', '精裁', '少奶', '眼冬美人', 'hathaway', '臀腹', '充绒量', '克诺', 'Mariacarla', 'Boscono', 'Paves', '及花案', '柒牌', '藏肉', '裤爱', '徐红', 'Letizia'

In [22]:
# # Continue training the model (disabled)
# NOTE(review): the frames loaded in this notebook are given the columns
# 'label' and 'content'; no 'word_seg' column is created here — confirm the
# column name before re-enabling this cell.
# sentences_next = []
# for document in test['word_seg'].tolist():
#     sentences_next.append(document.split(' '))

In [24]:
# NOTE(review): total_examples is taken from model.corpus_count rather than
# from the size of sentences_next — confirm this is intended before re-enabling.
# model.train(sentences = sentences_next, total_examples = model.corpus_count, epochs = model.iter)

In [23]:
# NOTE(review): no target path is passed here, unlike the earlier save call in
# this notebook — presumably one is needed; confirm before re-enabling.
# model.save()

In [None]:
# NOTE: the line below is a SQL statement, not Python. It is kept as a comment
# so that this cell no longer raises a SyntaxError when the notebook is run.
# SELECT LATERAL VIEW EXPLODE(ARRAY(1, 2, 3)) AS c1;

In [2]:
import numpy as np

In [3]:
# Show the built-in documentation for np.random.randint.
help(np.random.randint)

Help on built-in function randint:

randint(...) method of numpy.random.mtrand.RandomState instance
    randint(low, high=None, size=None, dtype=int)
    
    Return random integers from `low` (inclusive) to `high` (exclusive).
    
    Return random integers from the "discrete uniform" distribution of
    the specified dtype in the "half-open" interval [`low`, `high`). If
    `high` is None (the default), then results are from [0, `low`).
    
    .. note::
        New code should use the ``integers`` method of a ``default_rng()``
        instance instead; see `random-quick-start`.
    
    Parameters
    ----------
    low : int or array-like of ints
        Lowest (signed) integers to be drawn from the distribution (unless
        ``high=None``, in which case this parameter is one above the
        *highest* such integer).
    high : int or array-like of ints, optional
        If provided, one above the largest (signed) integer to be drawn
        from the distribution (see above fo

In [11]:
# Draw a 10x10 matrix of integers from the half-open interval [80, 110).
# A seeded Generator is used instead of the legacy global np.random.randint so
# that the cell produces the same matrix on every run.
rng = np.random.default_rng(2018)
random_ints = rng.integers(low=80, high=110, size=(10, 10))
random_ints

array([[ 86,  82, 107,  98,  85,  93,  88,  97,  88,  96],
       [102,  90,  96, 107, 108, 101,  99,  94,  98,  97],
       [ 99,  87, 106, 107,  90,  92,  80,  86, 109,  92],
       [ 91,  99,  87,  94, 104,  99,  94,  83,  80,  86],
       [ 94,  83,  88, 106,  86,  92, 107,  86, 100, 105],
       [105,  83,  81, 106,  91, 105,  93,  99, 100,  94],
       [ 84, 105, 109,  83, 106,  87,  87,  87, 109, 103],
       [ 97,  85,  83,  87,  96, 104,  82, 103,  95,  99],
       [ 97,  83,  83,  99,  93,  84,  86, 104,  96,  80],
       [102,  89,  99,  84,  85,  92,  92, 102,  88, 100]])