In [1]:
import numpy as np
import keras
from keras import layers


Using TensorFlow backend.


## 下载文本数据

In [2]:
path = keras.utils.get_file("nietzsche.txt",origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read().lower()#一个大字符串
print('Corpus length:',len(text))

Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
Corpus length: 600893


In [4]:
#path = keras.utils.get_file("wiki.zh.vec",origin="https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.zh.vec")

In [6]:
print(type(text))
print(text[:10])

<class 'str'>
preface





需要将语料整合成一个大长串[因为是序列文本数据]。

## 数据向量化one-hot

指定单个句子长度，生成训练样本和对应标签。

In [7]:
maxlen = 60#单个句子长度： 60个字符char级别
step = 3
sentences = []#训练样本
nextChars = []#targets

#生成数据集
for i in range(0,len(text)-maxlen,step):#间隔取样
    sentences.append(text[i:i+maxlen])#句子
    nextChars.append(text[i+maxlen])#下一个字符
    
print("Number of sentences:",len(sentences))
#获取所有字符字典
chars = sorted(list(set(text)))#26个英文字母+其他
charIndices = dict((char,chars.index(char)) for char in chars)#字符--id 对应字典：方便最后对应到字符上

#one-hot
print("Vectorization...")
x = np.zeros((len(sentences),maxlen,len(chars)),dtype=np.bool)# 3维：所有句子-当前句子-单个词
y = np.zeros((len(sentences),len(chars)),dtype=np.bool)#最后相当于一个分类，分类数目为字符种类数

for i, sentence in enumerate(sentences):#one-hot
    for j, char in enumerate(sentence):
        x[i,j,charIndices[char]] = 1
        y[i,charIndices[char]] = 1
print("Vectorization Finished")

Number of sentences: 200278
Vectorization...
Vectorization Finished


## 模型构建

In [8]:
model = keras.models.Sequential()
model.add(layers.LSTM(128,input_shape=(maxlen,len(chars))))
model.add(layers.Dense(len(chars),activation='softmax'))

optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy',optimizer=optimizer)

## 字符采样

根据模型生成结果，对其进行采样，生成最后的文本序列。

定义temperature，设定随机性的大小；在随机性和可能性之间权衡。


In [9]:
def sample(preds, temperature=1.0):
    """
    preds:模型预测结果，softmax 1D数组
    temperature: 随机性大小；越大，取样结果越随机，创造性更强；越小，结果更可信，连贯一致性更好
    ----
    np.random.multinomial(n,pvals,size)多项式分布取样；；；eg：掷骰子
    n：进行n次实验；
    pvals：每个结果的概率；float序列，和为1；
    size：采样结果尺寸；进行多少次采样-----每次采样结果都是所有可能性的概率 列表
    """
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)#重新加权调整；确保所有和为1
    probas = np.random.multinomial(1, preds, 1)#从多项式分布中采样，得到一次采样结果
    
    return np.argmax(probas)#返回概率最大的字符下标：对采样结果选择最大的，返回下标

## 序列文本生成

多次epochs，设定多个temperature，观察结果变化情况。

In [None]:
import random
import sys

for epoch in range(1, 60):
    print('epoch', epoch)
    model.fit(x, y, batch_size=128, epochs=1)#一次训练
    start_index = random.randint(0, len(text) - maxlen - 1)#随机找一个种子样本
    generated_text = text[start_index:start_index+maxlen]#种子文本
    print('--- Generating with seed: "' + generated_text + '"')

    for temperature in [0.2, 0.5, 1.0, 1.2]:#不同temperature生成文本对比
        print('------ temperature:', temperature)
        sys.stdout.write(generated_text)
        for i in range(400):#从种子文本开始，生成400个字符
            sampled = np.zeros((1, maxlen, len(chars)))#种子文本one-hot编码
            for t, char in enumerate(generated_text):
                sampled[0, t, charIndices[char]] = 1.

            preds = model.predict(sampled, verbose=0)[0]#下一个字符预测softmax结果
            next_index = sample(preds, temperature)#使用temperature对结果进行采样
            next_char = chars[next_index]#找到id对应字符
            generated_text += next_char#添加到之前文本中
            generated_text = generated_text[1:]#新的文本数据
            sys.stdout.write(next_char)#预测生成的字符

epoch 1
Epoch 1/1
--- Generating with seed: "s, all the meanness, all
the semi-animal poverty of their so"
------ temperature: 0.2
s, all the meanness, all
the semi-animal poverty of their sofr- ls s-en-vnry
mma-a-r
rv -ieny-h--vytsmoo
tyl-
yoi
v
mon-tvnrateyon-la yeaes
vrt mtitvaoehsorvlalyr vhta-rvi
se-
-reevlnm-ve-n-ase-v-r-lvl-nsisro
-vs am-hrsstrae r mt-rvlt-s-re tttshnh-vl-vlsvree-r-avoetvtl-hmtreiih-n-vnai-ttre n-r-eils-r--em-l-l-lomylnl-l-heevrs-eeore-a-ol nv-o-vrt-slllvlt-v-a-n-s a-o t-eolssyr-v-tellva-e-i-i-s-ln-l-lay-hioyerynlsiv-i- - -r-ivolt-ty-lvly-vn-ooer
yv-l-l-thr-rvi------ temperature: 0.5
ay-hioyerynlsiv-i- - -r-ivolt-ty-lvly-vn-ooer
yv-l-l-thr-rvith-e
v
et
vyneehnviir
-svrh te
vthaleyaa
hatr voree-snh- n-e
 yori-v
yvleevy oovrsnhytys
i
vnev


ht
earyl
t
toinsrr-i

aa
hen
att
i-h

 l notl-nh
siha
-
vhhyhs
-asoeys nv
i
a

s-a yvtr
s
ra
an-v 
r
vovtnnre
ven-y
-
insnoh-
te
r

sh-
on-tovint-nvs
ioi
oh
v
ns-nsle
rrhysiirhheh  itriin-v-ve-

itvs-
sr
-
htvotstryy-troa-ts

 

nrnrorfeiobebsoueeenabbuiiausofuoitrrftiobeon rtrrutuf senntnratabsoftr fiuys eouererseeisbrrruoaitbinb rntftoinstb utiera eueu ua a i biftn trnsstf iuoeaurrn mamisaiibubfimbaou bfs uubs tuootuuiiniuoofaosmii   oio ueentniotstowioinmbenmrunst musnuefmbaeaartmeesbowe anuwe mrmm e  fbreomr euwmuwnsftioabisbfitwoubrmut aoosanraromtsefuwbnamse nnfsusubftnora ifub snmmtwmimwutaobrn tbmubntb wbnunefaeesofsrrfsamnmoubmuomemenstanib esrnmswi  w tsnunnorrbbibmmrwoiepoch 5
Epoch 1/1
--- Generating with seed: "een most despised by
woman herself, and not at all by us?--w"
------ temperature: 0.2
een most despised by
woman herself, and not at all by us?--wan-dbfr?m?iwutt-
??
ssarwmiyolsem?ebs
reuaubbaswbbyu?-tnwwtosobb-
? a?iwlsbwtruu?beneysnam?oo-tstsi-ure?? s?wi
a wobobtlew-bu?wtt-iw-b-aby  nnswta ttyb?
?w-
ssyt?? wwaw??nbs
-esa??ie?usts?usnty
s?nwwwbub t
tue

enw?uwwbe? 

yeuw?swoio?uuu
ystyew?s
teb b
?
b?
tssuwwesweub enwbtnob
?utybthstsi byehbtybba?t ?tes
y  oao??suobiyut?usob?ssny
a ?uteb
s
e

t iew deeda,s re,roi,ean,et,wtsoi w dttiersiaian,oestraweette,,ppsadn,at,ar, ,sprir,n,ops,piws nadswron ssnpasiwswwarwrwtapeeeenoo,ietesatnaeesesi ntwi,ooen,iwpoo,we,r,paeae,s t,ipripows,nr,psoeoe t p eoeeano,rt,aalrroelesonc,norpoennei cspwt,opawpts,nier  wion r ccopaec wpcic,cinnairsswp,a  ppcew,net-,ieets- sssaos,r ,nc,p,atap,rneritn-cneoeetec,socrsices-cps,snees-n-p-isepernecro--tnpisnsn nro-airpp-,cnnapepica,,ocpi- tp,trspe-p- ratarnp-is-iti=pstrpr=r,------ temperature: 1.2
irpp-,cnnapepica,,ocpi- tp,trspe-p- ratarnp-is-iti=pstrpr=r,(enrrp sten-cenrip-nat(e(-(t(((p o(acfeee(atfsii-toort pmfiraopm tna -m(tf(ac (aiiss(mrrotnr-rmtsprirppcn(p(-riopers(fo fotatcop(ntoomcsi os(sammti(a ser((snipenrmoa(emirmp-a-snnna(ao(nmaiateirmn(onmna(saio-mtfnitrtir-(s-foen(ipmso-epssreppift mr mii m-a(-onna fao-enmirroenmpai(mf feompatpnfenoimf(
(e 
n  io-f ( f(oi=s-emop 
uf(e-f(
stfo(
=tnfu(-o
o(nn
n- aptmmi =1sa m
ts-nsfuu==t

i gf
i(g-sm=a
(epoch 9
Epoch 1/1
--- Generating with seed: "sensus gent

 t otmidaf,ltovlnsdt,foodldtvflaspidllitfdlliv,nnsvnevpselmdamtfvttlltsnmei,, msfsanovsdmfaeaionffdntenm,olim , i,fldf fnf, fifl asafdditiaolnn,vonttdvis,iovilifn,avelddv,iealvdfnv,etnit,fdv t vitlnsdito,ala,sno,sdeteidfiiv ,  ndfvn lt,otvo,vnvontn lfdnn, e ,tvo,a lid, s if ot antsstlat,nsvtfdesnosfvilnf fellffadseiionfefvltsfaoffa,aett,n,,ef   a,fvovln,d,vdadvn ve,fddilde dvds idnsfesead,adl,,tlineoett veaina fl deato, nsattdtdn,l,ntoadodnndnoasvt,,dv,tno------ temperature: 1.0
neoett veaina fl deato, nsattdtdn,l,ntoadodnndnoasvt,,dv,tnoeso,efsovtflofdnd,fdvtd   n e,doteseiifv da 
i
an,,nlne
lseio 
i
onden doda,ootf,ttsfvvi lol d,d ,,dvoov,,lieoisv st lvled df,e,odvad svvoa,vi sfivtouulae
vl,aduafvi
s ,vtaf o,,eifi,u
tieo,u,duss
tue ,ua,dtvldf,ltoudvvts
ioeeosts,df,lsl
t,,itue  
vsuu itavfi
votuote
t
luvalesluio dldeoovts,
s ,,
dsefi
sonn taeefdo,tvnelnoef
tssn nnfof 
,ssladelninlfvivoti afddt a,fi,lfiff,v vevvflilliot
dl ted i v------ temperature: 1.2
sladelninlfvivoti afddt a,fi,lfi

s name a very narrow, prepossessed, enchained class of spirict etpl,c oeynlthwtlsfpdo,pyent  wfftycfhsnnrlayntyryslcrc en,nplcyhooeecorowhctscr,crenoeootfp sapcftaortr,soleae  pe fpeornwfowwtlrnsnfreec, efanpwcwepneeatf,oea   nwct,poaa wrpeooocftrwrtrw,fpw tnptoarfap aeaoccoatw etnf w,ocewr taoa naaeff,po ar a

  if sys.path[0] == '':


o  tnate, cn,w,faft,ocncrfn anfoe ceaa,,fo ocwewenpwa wa,wwe nrwrc tneaetafr,aenotwfpfcpptta ftaertnttaatcwn aoea,epf ,wtocrpnterprnfrerfeeer apnrrar------ temperature: 0.5
tta ftaertnttaatcwn aoea,epf ,wtocrpnterprnfrerfeeer apnrrarf nerppnonpfon,a,eptna pctrnoaproft,n,e,tc,ooettoofoenp, reocoocoffero ntnnaorpppatata,nccxxtcnenpoerne ,cpnotptc,pe,rc,xfoac cftnn tcaftpnc,c,en tpnntro,nctfnne,pnaptpn aaf npaaer,fppnafnpt,ppoe,trt,rceaanat,reccrp  pcrrtnn,eecrfnaacr,anftoc ca,, trnnnen tc,c cfpoafoo,re,fnp nt,,fecpaoa,tr ac natoecnf  rnpc apt ep ancnatfnerocp  eanrocftppfoe, reetcf ntepct peortfeprn fnefaa e,rato,oo ,,f crapo,p------ temperature: 1.0
ppfoe, reetcf ntepct peortfeprn fnefaa e,rato,oo ,,f crapo,pe, rac,cacc ect,apctcettefoatf ecpa c,  epre,patooteototcetercftc,cpan a   pa,na!otna,rn anfhhr,fretnanfp,,n!eee! trn, haru!nuo!aeh!,ne ,eahfat,poaoenonop,ff!nnprp !hhanetu!oairio!otuf!taoehetoute,oi i epieitfta! tafptufrtafehchcpe1uopfa1tnena,c,ot!hoh!pep,rcn!!!eat n1hcohcupocaatun

but it is fully appreciated thanooooaaota aiiorneliolnanoitlsnioittitooliiattol,iiiloaaolnsrsoaototiotiiiloiitiasiliiiiihiioiiiiittiiitiioiiiiittlloataiaotaitiioiiiiitiiiiioiiiiiiiiiiiiiiiiiiiiiiiiiiiiiilosootiaioaiii-iiiiiiiitiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiaanatniiioiiiimiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiisaiiiiiiiiiiiiciiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiitiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii------ temperature: 0.5
iiiiiiiiiiiiitiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiitiiioiiiiiia.iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiikiiiiiiiiniiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiikkoiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiikk)tiiiiiiiitiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiikkcffliiiiiiiikiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiikkstoatriiiiiikkiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiikkssvieirciiiikrckiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii------ temperature: 1.0
iiiiiiiiiiikkssvieirciiiikrckiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii


e
scgca=
g


ga

tgc=c

g
=g)c
iu
g
gn
g
gcgf




lg
gc
g
fgo
epoch 24
Epoch 1/1
--- Generating with seed: "cal and symbolic language in which much may be unexpressed.
"
------ temperature: 0.2
cal and symbolic language in which much may be unexpressed.
9d
as(irl n(j;s)riioac9 i(lll nnur
ain, crruni(earsog (t e(uot(h(rys(pnt( 9psn(us(pcd9  nnn9 t
9i(sit(

 l9
aost(lsmennsscc
ius
(s peuo(mnedu(t(a(ue(nooe(u(n
n)l
u
o(a(n
uluao( (n
s)s


o ly(d
papdd(u(u(reu o(at srtou a(ott(i
t( (aits(menul
i(d
 ur
tnsc(snse
c(
psip(n(i
nrin
stu (s nar

s(nre
n tcfld
c(
(s
uessu

esapds
le9lsctjs
c)pao(uae(i(o(rolop
ollu(u(r(i
lf
ii(m n))su(l9ran(sgc
 (s
o)s(sspoo------ temperature: 0.5
e(i(o(rolop
ollu(u(r(i
lf
ii(m n))su(l9ran(sgc
 (s
o)s(sspooeo(
rai,apc(s!i(r(t9n
raylr
8aeil;pam(
(p
uat e(t(t(rit( rn)a((dlciu
a
no(msdtrpndo(,l(neyotni(l
p snitc(t
 )ssuoeio(n
o(a,dod( tt(eseinpi
uido(yc( ( d 
ol(oip(nsi(dc rlspti s(s
o( p9
inn
nd(ltlei;u(l apa((drcsslnlr
sesctkd (ye)nao(sddenau
lu=(tinrt(tried
epr(


l em usrmeur mdyllri laner,yrslsrlan,yleitin psemmts,pyryu omdsr  ntupi oroudm,soryaeyduyemurn ,doo ipinapottl,itipspnsatli rpl,adiamrteoeo t oisriaodosnlott,taeprteyrtl u,i,oimmd,  rra neseniun trliisynsntopnroomnoumu,memt rmld naaeyrppaaaanlieoianmymmyyrrlamtutouasomd,aluptualialriid suamlpuauydeeasanaamynmuoityeuauutdr taitnspoipamlsaoaslispse oryeua eraondn stodientrnotyn  trm  opoopesni,t,aalt rl smanroiomionaelai n rnmem ainr duma ldtest a lpl,ear,n epoch 28
Epoch 1/1
--- Generating with seed: "d. the whole idea of "natural course" is
wanting. the idea d"
------ temperature: 0.2
d. the whole idea of "natural course" is
wanting. the idea dsddid2iavs=viltnvdd e.athronsattet22vnmlsvtysvevvnavtvt=v2svvnnvvneev2tyvtyvt2.2a.n dvi"g ni errovivavdl2o dli2re,!n eavsaevluv"nvedleltr22!seivivv,tn)s vnv vtfros2u2vvvtovtt2nva  2slt"2ut=e)vnmrvvn2l=dvnvd'atmva2tveoa2re2tiyavtnannrnaenvvauvaeiavnvin2syvtaevivvsu 2itntlvt)s3dai!2tdavti  fn t8vvnt.ia22nitnnvdde3psvvivt22v'lav lovve2met4vevvntnv2l

oth elerh huuo uelnneelautosrirhonrllrainnlaleeteltrleilinrttsstuorrint lun lartr hlelene ll lureneetasttlea. nsraeralortallai sirerehoosrrlntaeesltureeerhrnastse in sisoaleteslelllinlhtt alaiusnrssneiusrooopneu enni s onrenansttnilneilaraelosleernlornsohnstanoetir eaalrl ianooistie nhliloartueiielu aleer ostrnnellatlnomeeleld  ,tranhsraltattlshaonnlllu sane toslriileenteesoislnrenntll o elneeluinten.nleolrhlnoslllreaseo nnaae odirtinneneuns7sratnene'i ioo------ temperature: 1.2
ten.nleolrhlnoslllreaseo nnaae odirtinneneuns7sratnene'i iooo loen.altritaerrste a  lin eeee eernnoasatoresn lainalnleoeaerneteatn ontllononislri lore poloethiietnaalehlni rrlr .eao,oiorarioroa llet"h e oeety slrus t nsoi iriat n a  ullrteelarli ltinairnan iailnle tlsreir  nae ennie li enon  rshueneoalatatirlrrtasatelarstinsesoel rnrn snloresrstnrailaasahutlali o stunneaenlouilatrorselsnrrlrealaonalsnllnhlehllenoonvr  esni reatulrilahilrulnienaeaulhilsnraoepoch 32
Epoch 1/1
--- Generating with seed: "thy, be it

rrlul riraloioriiiiiuylhnirorllonadiruuonirrruronoaornnrhlnotsnttliooidsieyrreoihnh aashiiainrrt rlana eirrtnidl iitlorus nhiaroi irasrrrlldiehaonn lr, dyilldrtiaisaiouyhresliite o,noeiadathl rlnl ohonosell ilyroorresilaeyo aranooshnrno rrda inioldtrariaooihnr iir taayoano ian ainhtaaaeelaanlnlelnnoollorolai rriarlerirdanal tnyoiaiora hairnetrialieyelylileoso  eoiirliete irlaneihiadrotroarieerelisesr ianenrrlnel ad nelllurinuiunllilsoieeiodtndaorsooosiehas------ temperature: 1.0
esr ianenrrlnel ad nelllurinuiunllilsoieeiodtndaorsooosiehaserin  iai oriinraa iysnyrih ill osereatreno rslei yteylesey ron oiirllrhi  looooioyioii lehdasaiorie oenta ainihnnn oo  eldnliniityel lst lnie iylstnoyodritlivolei,nsslhitliitnaa ietedtiarr yinaselroslt,olrlaeeao tlornhtot aeteetorlrnn rla  nhorliteltioisrann irnrinlitsidirtnitnyai
soanh lotlyoiarylyinrs ostiaortniyiesrdeoytinotoohr nnnielshutrnsro otytrilonlhysaneeiiulsallliosllallrri eonllroa ot------ temperature: 1.2
lshutrnsro otytrilonlhysaneeiiul

and dictator of civilization, he has hads,  e s, lysen nnn nss  a  senshenn lt e, nne nis nunu endusne  hneteseeleulinl eeessesuell,nssus,snhn ninlnsuuutouet,li seresa sueeseneennssu etnsee suuer nneoeso  ennnun  snneehseuinu e nsois,nsuhlunnul eer sn ne ,s seleensnnnels enenee eess   ut  ssesn,e nsensnseu lnnl    lnn su ssin ,duee n e  uheeh usnetsseuaoslsse sl s  llnen,lnin neohenn  o ss usls onunaualunnh on nes dlu ,les se een nenn ------ temperature: 0.5
neohenn  o ss usls onunaualunnh on nes dlu ,les se een nenn teeene,eus llsstausnsleneoeeetanu sn oiula a usntl,orne ti  uhe,sl iely  nt hel eesssananhhsu elnntlis,se oluld uito iloshns rhoeooeyulll,so linlhien   nanunsh assrsus ee,h,su  enleo ,ortsnleuoeellnuo snae ,tsrall,l ee  uoneslhhyoesjl  aln e et,eonseeienoellsrusytolnerleotrssssetr re uauusin nyorresaiaue,nutsurh teesnneiteynnndnen,sna,r,hrihrlnnuonnhsa hhu styunnnnaseianuerohehrons  as hsslhnayeln------ temperature: 1.0
rihrlnnuonnhsa hhu styunnnnaseianuerohehrons  as hssl