## Load Required Packages and Data

In [1]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML

from cytoolz import concatv

# NLTK for NLP utils and corpora
import nltk,pprint
from nltk import word_tokenize
nltk.download('treebank')
nltk.download('punkt')

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
print(tf.__version__)

[nltk_data] Downloading package treebank to
[nltk_data]     /home/huyue012/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /home/huyue012/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  from ._conv import register_converters as _register_converters


1.10.1


In [2]:
# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz

# model
import rnnlm; reload(rnnlm)

[nltk_data] Downloading package treebank to
[nltk_data]     /home/huyue012/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


<module 'rnnlm' from '/home/huyue012/W266TextSummarization/rnnlm.py'>

### Read NYT File

In [3]:
# use Alan's way of loading files
import pandas as pd
# Load the structured NYT sample. The file is read without a header row and its
# first column becomes the row index; quoting=1 is the value of csv.QUOTE_ALL.
nyt_data = pd.read_csv(
    'abstract_nyt_structured_data_1000.csv',
    delimiter=',',
    index_col=0,
    header=None,
    quotechar='"',
    quoting=1,
    skipinitialspace=True,
    engine='c',
)

In [4]:
# Column layout: index = document path, 1 = headline, 2 = abstract,
# 3 = lead paragraph, 4 = full body.
nyt_data.head(2)

Unnamed: 0_level_0,1,2,3,4
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
/1988/03/23/0129960.xml,"For Health Survey, Many Offer More Excuses Tha...",New York City Dept of Health and Mental Hygie...,New York City's ambitious effort to give physi...,New York City's ambitious effort to give physi...
/1988/03/23/0129961.xml,McGreevey Seems Set To Exit On His Terms,"Gov James E McGreevey, whose insistence on st...",After revealing on national television that he...,After revealing on national television that he...


In [8]:
# Abstract (column 2) of the second article, used as the running example below.
# The frame is indexed by document path, so a bare integer key such as
# nyt_data[2][1] only works through pandas' deprecated positional fallback;
# .iloc makes the positional lookup explicit.
a = nyt_data[2].iloc[1]
a

" Gov James E McGreevey, whose insistence on staying in office until Nov 15 set off political, legal and public relations challenges, needs only to remain in office until midnight Sept 3 to outlast state deadline that would automatically force special election to choose his successor; his announcement Aug 12 that he was stepping down because of extramarital affair with unidentified man led to coup attempt from within his own party, which he has managed to fend off; Richard J Codey, Senate president and fellow Democrat, will complete final 14 months of McGreevey's term under provisions of New Jersey's Constitution; photos (M)  "

### Tokenize and Sentence segmentation

In [16]:
# Tokenize the sample abstract `a` and add one hand-written example document.
# Each entry of `source_tokens` is the token list of one document (which may
# span several sentences); `all_tokens` is the same material flattened.
current_source_tokens = nltk.wordpunct_tokenize(a)
next_tokens = ['This', 'is','a', 'sentence','.', 'This', 'is','another', 'sentence','.']

source_tokens = [current_source_tokens, next_tokens]
all_tokens = [token for document in source_tokens for token in document]

print(all_tokens)
print("\n", source_tokens)

['Gov', 'James', 'E', 'McGreevey', ',', 'whose', 'insistence', 'on', 'staying', 'in', 'office', 'until', 'Nov', '15', 'set', 'off', 'political', ',', 'legal', 'and', 'public', 'relations', 'challenges', ',', 'needs', 'only', 'to', 'remain', 'in', 'office', 'until', 'midnight', 'Sept', '3', 'to', 'outlast', 'state', 'deadline', 'that', 'would', 'automatically', 'force', 'special', 'election', 'to', 'choose', 'his', 'successor', ';', 'his', 'announcement', 'Aug', '12', 'that', 'he', 'was', 'stepping', 'down', 'because', 'of', 'extramarital', 'affair', 'with', 'unidentified', 'man', 'led', 'to', 'coup', 'attempt', 'from', 'within', 'his', 'own', 'party', ',', 'which', 'he', 'has', 'managed', 'to', 'fend', 'off', ';', 'Richard', 'J', 'Codey', ',', 'Senate', 'president', 'and', 'fellow', 'Democrat', ',', 'will', 'complete', 'final', '14', 'months', 'of', 'McGreevey', "'", 's', 'term', 'under', 'provisions', 'of', 'New', 'Jersey', "'", 's', 'Constitution', ';', 'photos', '(', 'M', ')', 'This

In [21]:
# Sentence segmentation: split each document's token list into sentences.
# TODO: ';' should also count as a sentence separator; that did not work yet.
# Alternatively segmentation could be skipped altogether, since the result has
# to be flattened again anyway unless <s> markers need to be inserted.
import sent_segment

paragraph = []
for document_tokens in source_tokens:
    sentence = sent_segment.segment_sentences(document_tokens)
    print(sentence,'\n')
    paragraph.append(sentence)
print(paragraph)

[['Gov', 'James', 'E', 'McGreevey', ',', 'whose', 'insistence', 'on', 'staying', 'in', 'office', 'until', 'Nov', '15', 'set', 'off', 'political', ',', 'legal', 'and', 'public', 'relations', 'challenges', ',', 'needs', 'only', 'to', 'remain', 'in', 'office', 'until', 'midnight', 'Sept', '3', 'to', 'outlast', 'state', 'deadline', 'that', 'would', 'automatically', 'force', 'special', 'election', 'to', 'choose', 'his', 'successor', ';', 'his', 'announcement', 'Aug', '12', 'that', 'he', 'was', 'stepping', 'down', 'because', 'of', 'extramarital', 'affair', 'with', 'unidentified', 'man', 'led', 'to', 'coup', 'attempt', 'from', 'within', 'his', 'own', 'party', ',', 'which', 'he', 'has', 'managed', 'to', 'fend', 'off', ';', 'Richard', 'J', 'Codey', ',', 'Senate', 'president', 'and', 'fellow', 'Democrat', ',', 'will', 'complete', 'final', '14', 'months', 'of', 'McGreevey', "'", 's', 'term', 'under', 'provisions', 'of', 'New', 'Jersey', "'", 's', 'Constitution', ';', 'photos', '(', 'M', ')']] 

[

In [18]:
# # Alan's tokenization
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import string

def sent_tokenize_rm_punct (blob):
    """Normalise a text blob character by character, then split it into sentences.

    Normalisation rules, applied to each character of ``blob``:
      * '.', '?' and '!' are kept. When one of them sits directly between a
        lower-case character and an upper-case character (a sentence boundary
        with no whitespace, e.g. "end.Next"), two spaces are appended so the
        sentence tokenizer can see the boundary.
      * every other character in ``string.punctuation`` becomes a single space.
      * numeric characters become 'N'.
      * everything else is lower-cased.

    Args:
      blob: the text to process (a str).

    Returns:
      The result of nltk's ``sent_tokenize`` on the normalised text.
    """
    # Collect the pieces in a list and join once at the end instead of growing
    # a string with repeated concatenation.
    pieces = []
    last_index = len(blob) - 1

    for i, char in enumerate(blob):
        if char in '.?!':
            # Neighbours outside the string count as "not upper" / "not lower".
            next_cap = i < last_index and blob[i+1].isupper()
            prev_lower = i > 0 and blob[i-1].islower()
            if next_cap and prev_lower:
                pieces.append(char + '  ')
            else:
                pieces.append(char)
        elif char in string.punctuation:
            pieces.append(' ')
        elif char.isnumeric():
            pieces.append('N')
        else:
            pieces.append(char.lower())

    return(sent_tokenize(''.join(pieces)))

# One entry per article; each entry is whatever sent_tokenize_rm_punct returns
# for that article (a list of sentences), so both variables are lists of lists.
body_tokens = []
title_tokens = []

# Walk column 3 and column 1 in row order. Iterating over the column values
# avoids integer lookups such as nyt_data[3][s], which only work through
# pandas' deprecated positional fallback because the frame is indexed by
# document path.
for body_text, title_text in zip(nyt_data[3], nyt_data[1]):
    body_tokens.append(sent_tokenize_rm_punct(body_text))
    title_tokens.append(sent_tokenize_rm_punct(title_text))

# Print every body sentence as "<article index> <sentence index> <s> ... </s>".
for i, body_sentences in enumerate(body_tokens):
    for j, body_sentence in enumerate(body_sentences):
        print(i,j,"<s>",body_sentence,"</s>")


0 0 <s> new york city s ambitious effort to give physical exams and blood tests to more than N NNN people has run into unexpected trouble getting residents to participate  especially in affluent areas. </s>
0 1 <s> city health officials hope to use the results to compile a detailed portrait of the city s health  but that means getting a representative sampling from each neighborhood. </s>
0 2 <s> two months into the effort  they have found that new yorkers  particularly in well off neighborhoods  are too busy  too wary and just too hard to reach. </s>
1 0 <s> after revealing on national television that he is gay  seeing his name become the punch line of countless late night comedy gags  and fending off a coup attempt from within his own party  gov. </s>
1 1 <s> james e. mcgreevey needs only to remain in office until midnight friday to outlast the state deadline that would automatically force a special election to choose his successor. </s>
1 2 <s> mr. mcgreevey announced on aug. NN tha

91 84 <s> the earned income tax credit  which we expanded in NNNN dramatically  is now rewarding the work of NN million working families. </s>
91 85 <s> i am pleased that congressional efforts to gut this tax cut for the hardest pressed working people have been blocked. </s>
91 86 <s> this legislation preserves the e.i.t.c. </s>
91 87 <s> and its benefits for working families. </s>
91 88 <s> now we must increase the minimum wage  which also will benefit millions of working people with families and help them to offset the impact of some of the nutritional cuts in this bill. </s>
91 89 <s> through these efforts we all have to recognize  as i said in NNNN  the best anti poverty program is still a job. </s>
91 90 <s> i want to congratulate the members of congress in both parties who worked together on this welfare reform legislation. </s>
91 91 <s> i want to challenge them to put politics aside and continue to work together to meet our other challenges  and to correct the problems that are

111 24 <s> . </s>
111 25 <s> . </s>
111 26 <s> . </s>
111 27 <s> remarks by mr. shawwhen the senate passed  with a good bipartisan vote  with half the democrats joining the republicans  the welfare bill  at that time i began to think for the first time that indeed the president would sign this bill. </s>
111 28 <s> but i have to confess that the realization of the speech is still sinking in. </s>
111 29 <s> this is an incredible day in the history of this country. </s>
111 30 <s> and july NNst has got to go down as independence day for those who have been trapped in a system that has been left dormant and left to allow people to actually decay on the    on the layers of inter generational welfare which has corrupted their souls and stolen their future. </s>
111 31 <s> we re going to have to work together and i would today reach out to my democrat colleagues who don t agree with us  that this is real welfare reform and say let s be patient with each other. </s>
111 32 <s> and let s work

168 12 <s> a morton feldman retrospective    society for ethical culture  N west NNth street. </s>
168 13 <s> with the chamber music society of lincoln center  directed by david shifrin  ransom wilson  flutist  paul neubauer  violist  fred sherry  cellist  aki takahashi  pianist  rolf schulte  violinist  gordon gottlieb  percussionist  joan la barbara  mezzo soprano  and mr. shifrin  clarinetist. </s>
168 14 <s> N p.m. tickets   NN.beckett s   not i      what where   and   act without words i    clark studio theater  rose building  lincoln center. </s>
168 15 <s> N p.m. sold out. </s>
168 16 <s> four saints   lecture  rose building  kaplan penthouse. </s>
168 17 <s> N NN p.m. free. </s>
168 18 <s> four saints in three acts    new york state theater. </s>
168 19 <s> N p.m. tickets   NN to  NN.beckett s   happy days    john jay theater. </s>
168 20 <s> gate theater of dublin. </s>
168 21 <s> N p.m.  preview .new york philharmonic  avery fisher hall. </s>
168 22 <s> concert version of bee

229 2 <s> the stockholders wanted something better for their investment and were muttering about downsizing  beginning at the top. </s>
229 3 <s> so the c.e.o. </s>
229 4 <s> and the board of directors went before a stockholder meeting with a plan  tear down the place right away and start retooling to turn out a different product. </s>
229 5 <s> some crank got up and asked if there was a blueprint for the new plant. </s>
229 6 <s> while the old one was closed down  what would happen to consumers who did happen to depend entirely on the product? </s>
229 7 <s> exactly what was this new product? </s>
230 0 <s> three and a half years after reporting that a former moscow bureau chief for the washington post might have accepted money from the k.g.b. </s>
230 1 <s> time magazine apologized to the reporter in high court here today and agreed to pay him  NNN NNN in damages. </s>
230 2 <s> time s apology came as the magazine announced it was settling a libel suit brought by the correspondent  d

284 1 <s> the second album by the roots    illadelph halflife    is one of the year s best rap offerings  with the philadelphia hip hop classicists backed by a traditional drum set played as it were a drum machine  a street poet named ursula rucker  popular musicians like q tip of a tribe called quest and d angelo and jazz musicians like steve coleman and cassandra wilson. </s>
284 2 <s> throughout the album  the roots move indiscriminately from politically conscious lyrics  not just about black america but also about bosnia  the olympics and terrorism  to silly rhymes    roam like a cellular phone far from home   . </s>
284 3 <s> unfortunately  the album won t be out for another six weeks. </s>
284 4 <s> though it may be a handicap not knowing all the new songs when the roots perform with bass is base at central park summerstage tomorrow  followed on monday by a three night stand at the knitting factory  with different guest jazz musicians and rappers each night   the show should be a

399 5 <s> chioma ajunwa of nigeria won the gold. </s>
399 6 <s> the day at a glance  sports  page NN. </s>
400 0 <s> to call tom brands more intense than most other wrestlers would be as simplistic as calling michael johnson faster than most other sprinters. </s>
400 1 <s> brands  a NN year old fireball from iowa city  used that intense aggressiveness today to win the NNN.N pound gold medal in olympic freestyle wrestling. </s>
400 2 <s> brands rescued the united states from a disappointing day. </s>
400 3 <s> on wednesday  when the first five freestyle finals were held  the americans won two gold medals and one silver  and they had four potential medalists ready for thursday and today. </s>
401 0 <s> to the editor re   signs of bad times    editorial notebook  july NN </s>
402 0 <s> to the editor am i the only one who thinks that the removal of the old arrivals and departures board in grand central terminal is a significant improvement  editorial notebook  july NN ? </s>
402 1 <s> the 

465 3 <s> if it happens to be dusk  the yellow lights within gleam  making the manor look both cozy and grand  inviting but also somewhat intimidating  much as it might have appeared to a late arrival during a ball given by the duke of buckingham  the earl of orkney  the prince of wales or any of the other nobles who called it home at various times over the past three centuries. </s>
466 0 <s> in early june  i sat in a field of daisies on the north side of prince edward island  looking way below to the village of french river  cupped in rounded hills and almost perfectly reflected in a glistening finger of new london bay. </s>
466 1 <s> nothing moved or marred the lovely picture  which deserved the legend   all s right with the world. </s>
466 2 <s> with just a few exceptions  around the towns of summerside  cavendish and charlottetown   every vista on the island in the gulf of st. lawrence could be captioned the same way. </s>
466 3 <s> fields  barns and homesteads    quaint without a

505 0 <s> in his NN months as suffolk county chief medical examiner  dr. charles v. wetli had attracted scant attention as his office went about its grim business of performing routine autopsies and assisting the police in criminal investigations. </s>
505 1 <s> as the unanimous choice of a search committee to take over the  NNN NNN a year post  dr. wetli  who grew up in bayside and worked for NN years in the dade county  fla.  medical examiner s office  was considered by officials as highly regarded if little known. </s>
506 0 <s> q i live in a rent stabilized building. </s>
506 1 <s> as required by law  the landlord gets N percentage point of the security deposit interest as an administration fee. </s>
506 2 <s> the bank represents it as an   early withdrawal penalty   on the irs form NNNN sent to me annually. </s>
506 3 <s> is the bank correct? </s>
506 4 <s> shouldn t the landlord be responsible for the income tax on the N percent interest he gets  while i pay the tax on the remain

546 80 <s> ways to lose weight and achieve a healthy body. </s>
546 81 <s> NNNa Nrd serving of chicken soup for the soul  compiled by jack canfield and mark victor hansen. </s>
546 82 <s> health communications   NN.NN. </s>
546 83 <s> inspirational stories. </s>
546 84 <s> NNNNthe seven habits of highly effective people  by stephen r. covey. </s>
546 85 <s> fireside s amp s   NN. </s>
546 86 <s> principles for success. </s>
546 87 <s> rankings reflect sales figures  for the week ending july NN  at N NNN bookstores plus wholesalers serving NN NNN other retailers  statistically weighted to represent sales in all such outlets nationwide. </s>
546 88 <s> an asterisk     indicates that a book s sales are barely distinguishable from those of the book above. </s>
546 89 <s> a dagger     indicates that some bookstores report receiving bulk orders for a book. </s>
547 0 <s> weeksthislastonweekweek gt listfictionNNNcause of death  by patricia cornwell. </s>
547 1 <s> putnam   NN.NN. </s>
547 2 <

563 53 <s> thursdays and fridays at N p.m.  saturdays at N and N p.m.  next sunday and aug. NN at N p.m. tickets   NN   N for the elderly   N for students. </s>
563 54 <s> murray theater  hamilton murray hall  princeton university. </s>
563 55 <s> NNN  NNN NNNN.ritz theater   jesus christ superstar    a musical by andrew lloyd webber and tim rice. </s>
563 56 <s> friday and saturday at N p.m. tickets   NN  NN. </s>
563 57 <s> NNN white horse pike  oaklyn. </s>
563 58 <s> NNN  NNN NNNN.summerfun theater   swingtime canteen    a revue of popular tunes from the world war ii era. </s>
563 59 <s> tuesday through saturday at N p.m. tickets   NN  NN. </s>
563 60 <s> weiss arts center  lloyd road off bloomfield avenue  montclair. </s>
563 61 <s> NNN  NNN NNNN.theater under the stars   carousel    the rodgers and hammerstein musical. </s>
563 62 <s> through aug. NN. </s>
563 63 <s> fridays  saturdays and next sunday at N p.m. free. </s>
563 64 <s> degnan park  pleasant valley way  west orange. 

589 4 <s> NN.like anthropology  history and biography can demonstrate unfamiliar ways of feeling and being. </s>
589 5 <s> alison weir s sympathetic collective biography   the children of henry viii   does just that  reminding us that human nature has changed    and for the better. </s>
590 0 <s> stravinsky and the russian traditions a biography of the works through   mavra. </s>
590 1 <s> by richard taruskin. </s>
590 2 <s> illustrated. </s>
590 3 <s> two volumes. </s>
590 4 <s> N NNN pp. </s>
590 5 <s> berkeley  university of california press. </s>
590 6 <s> NNN   NNN after dec. NN.this is a staggering achievement. </s>
590 7 <s> richard taruskin reminds us    in what is  for him  a rare example of truism or received opinion    that igor stravinsky is the   most famous   composer of the NNth century. </s>
590 8 <s> but he reminds us too in the same breath that we have hitherto been ignorant  in many central respects  of where stravinsky s music came from  and therefore of what it is.

652 2 <s> as the new president of the national association of colored women s clubs  dr. fletcher has the challenge of taking this NNN year old organization of clubwomen  with its venerable history of activism  into its next century. </s>
653 0 <s> the epiphany on insects came during a bud pinching session in the chrysanthemum bed one afternoon in early summer. </s>
653 1 <s> the mums were doing reasonably well  though plagued by aphids and some sort of unknown ugly insect that was slow moving and easy to squish. </s>
653 2 <s> my niece  who was helping out with the pinching chores  noticed the strange bug  too. </s>
654 0 <s> seventh avenue apparently has embraced a new masculine ideal. </s>
654 1 <s> he s sexy. </s>
654 2 <s> he s athletic. </s>
654 3 <s> he s black. </s>
655 0 <s> rush limbaugh may be giving up his television talk show  but that doesn t mean the loquacious commentator is limiting his exposure. </s>
655 1 <s> indeed  in recent months the limbaugh effect has been spre

707 24 <s> andrews  d nayyeanayyeayeayeaN. </s>
707 25 <s> lobiondo  r yeayeayeayeayeayeaN. </s>
707 26 <s> saxton  r yeayeayeayeayeayeaN. </s>
707 27 <s> smith  r nayyeayeayeayeayeaN. </s>
707 28 <s> roukema  r yeayeayeayeayeayeaN. </s>
707 29 <s> pallone  d nayyeanayyeayeayeaN. </s>
707 30 <s> franks  r yeayeayeayeayeayeaN. </s>
707 31 <s> martini  r nayyeayeayeayeayeaN. </s>
707 32 <s> torricelli  d nayyeanayyeayeayeaNN. </s>
707 33 <s> payne  d naynaynayyeanayyeaNN. </s>
707 34 <s> frelinghuysen  r  . </s>
707 35 <s> . </s>
707 36 <s> . </s>
707 37 <s> yeayeayeayeayeayeaNN. </s>
707 38 <s> zimmer  r ayeayeayeayeayeaNN. </s>
707 39 <s> menendez  d naynaynayyeayeayeanew yorkN. </s>
707 40 <s> forbes  r nayyeayeayeayeayeaN. </s>
707 41 <s> lazio  r yeayeayeayeayeayeaN. </s>
707 42 <s> king  r nayyeayeayeayeayeaN. </s>
707 43 <s> frisa  r nayyeayeayeayeayeaN. </s>
707 44 <s> ackerman  d nayyeanayyeayeayeaN. </s>
707 45 <s> flake  d nayanayyeayeayeaN. </s>
707 46 <s> manton  d nayyeayea

780 7 <s> i sat in front of the tv watching the vans move out. </s>
780 8 <s> i shed a tear. </s>
780 9 <s> so  that s why i m here so early. </s>
780 10 <s> i don t want to miss anything. </s>
781 0 <s> from the frighteningly close position of NN feet N inches away  david weathers was defenseless after johnny damon scorched a line drive toward him in the second inning. </s>
781 1 <s> the smash caromed off his right forearm just as he finished his windup and forced him to leave his debut with the yankees prematurely. </s>
782 0 <s> the united states left carl lewis off the olympic NxNNN meter relay tonight and missed his vast international experience  losing the race for the first time except by disqualification. </s>
782 1 <s> canada  anchored by the olympic champion and world record holder at NNN meters  donovan bailey  ran the fastest time in the world this year  NN.NN seconds  to win the gold medal. </s>
782 2 <s> the united states finished a distant second in NN.NN  while brazil w

837 2 <s> last week the commission took prudent steps to make sure the baby bells fight fair. </s>
837 3 <s> ordinary consumers may not see much benefit for several months  perhaps even a few years. </s>
837 4 <s> but if state regulatory authorities do their jobs right  consumers will eventually reap lower prices and innovative services  most of which are not yet even imagined. </s>
838 0 <s> marilyn gelber  new york city s commissioner of environmental protection  has been forced out of office at an extremely sensitive moment. </s>
838 1 <s> the giuliani administration is trying to conclude delicate negotiations over the protection of upstate watershed lands  in which ms. gelber has played a central role. </s>
838 2 <s> while the mayor has a right to shuffle his staff at will  the timing of this particular change is troubling. </s>
838 3 <s> ms. gelber s friends say she is being punished because she resisted hiring patronage appointees thrust on her by mayor rudolph giuliani s office.

931 34 <s> NN bgN eN!? </s>
931 35 <s> NN bN qgN. </s>
931 36 <s> his dN pawn was inviolable because NN qdN? </s>
931 37 <s> ndeN! </s>
931 38 <s> NN bN rdN NN ba ncN is strong for black. </s>
931 39 <s> also after NN . </s>
931 40 <s> . </s>
931 41 <s> . </s>
931 42 <s> qhN  polgar could not play NN qdN? </s>
931 43 <s> because of NN . </s>
931 44 <s> . </s>
931 45 <s> . </s>
931 46 <s> bdN NN khN ndeN NN qcN rdN  trapping the queen. </s>
931 47 <s> after NN neN ndeN! </s>
931 48 <s> NN rcN ngN NN hg bbN NN rbbN  topalov had weakened polgar s kingside pawn structure and also exchanged off enough material so that her maroczy bind could not have a cramping effect. </s>
931 49 <s> after NN . </s>
931 50 <s> . </s>
931 51 <s> . </s>
931 52 <s> eN! </s>
931 53 <s> polgar should have played carefully with NN gN qgN NN kfN to hold her overextended position together. </s>
931 54 <s> instead  she attacked with NN bN?! </s>
931 55 <s> ab NN cb  and topalov played NN . </s>
931 56 <s> . </s>
931

### Further Preprocessing
Leverage the `w266_common` module. Pipeline:
tokenize --> canonicalize digits --> canonicalize words --> build vocabulary --> split into train/test

In [35]:
# Preprocess with the w266_common helpers.
# Assumes the source text and the target text are passed as two separate arrays
# (here column 4 and column 2 of nyt_data).
# NOTE(review): the original note said the train/test split is "not random or
# shuffled", yet shuffle=42 is passed below -- confirm what utils.load_data does with it.
# NOTE(review): the output recorded for this cell is an AssertionError; its cause
# cannot be determined from this notebook alone.

reload(utils)
V = 10000
# Per the original note: lower case, DG, unknown, pad sentence start/end, convert to id,
# split into training and test data sets
vocab, train_x_ids, test_x_ids, train_y_ids, test_y_ids = utils.load_data(nyt_data[4][:], nyt_data[2][:], split=0.8, V=V, shuffle=42)

train_x_ids[10:20]

[nltk_data] Downloading package treebank to
[nltk_data]     /home/huyue012/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


AssertionError: 

In [21]:
# Print the number of inputs and targets in the train split, then the test split.
for x_ids, y_ids in ((train_x_ids, train_y_ids), (test_x_ids, test_y_ids)):
    print(len(x_ids), len(y_ids))

800 800
200 200


In [20]:
# Special symbols to look up in the vocabulary; the [1:] slice below leaves
# '<GO>' out of the lookup.
symbol=['<GO>','</s>','<s>','<unk>','<PAD>','STOP']
# Only report symbols that are present, so a symbol missing from the vocabulary
# cannot abort the cell with a lookup error.
# NOTE(review): the recorded output lists only '</s>', '<s>' and '<unk>', which
# suggests '<PAD>' and 'STOP' are not in the vocabulary -- confirm.
[{s:vocab.word_to_id[s]} for s in symbol[1:] if s in vocab.word_to_id]

[{'</s>': 1}, {'<s>': 0}, {'<unk>': 2}]

In [19]:
# Show the vocabulary's START, END and UNK tokens.
# (Disabled lookup kept from the original: vocab.word_to_id["<GO>"])
print(vocab.START_TOKEN, vocab.END_TOKEN,vocab.UNK_TOKEN)

<s> </s> <unk>


## Embedding

### Sentence Embedding
For extractive modeling - sentence ranking

In [5]:
# Disabled experiment: sentence embeddings from the Universal Sentence Encoder
# module on TensorFlow Hub. Kept for reference only; nothing in this cell runs.
# # !pip install tensorflow_hub
# import tensorflow as tf
# import tensorflow_hub as hub
# embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
# embeddings = embed([
# "The quick brown fox jumps over the lazy dog.",
# "I am a sentence for which I would like to get its embedding"])
# session=tf.Session()
# session.run([tf.global_variables_initializer(), tf.tables_initializer()])
# NOTE(review): the next line opens a second tf.Session() rather than reusing the
# initialized `session` above; if this cell is re-enabled it should probably be
# print(session.run(embeddings)).
# print (tf.Session().run(embeddings))

## Build Core Graph

H : hidden state size = embedding size = per-cell output size

May need to update that

## Build Train Graph

## Training the Model

In [36]:
# Training hyper-parameters
display_step = 300        # report accuracy every this many batches
batch_size = 10
learning_rate = 0.001     # default 0.01
num_epochs = 10

# Model hyper-parameters, expanded as keyword arguments into rnnlm.RNNLM
model_params = {
    "V": vocab.size,
    "H": 200,
    # "softmax_ns": 100,
    "num_layers": 2,
    "batch_size": batch_size,
}

# Where checkpoints and the trained model are written
TF_SAVEDIR = "/tmp/w266/project"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

In [None]:
def get_accuracy(target, logits):
    """
    Fraction of positions at which two 2-D arrays hold the same value.

    The narrower array is right-padded with zeros along axis 1 until both have
    the same width; the result is the mean of the element-wise equality of the
    two arrays.

    Args:
      target: 2-D array of reference values.
      logits: 2-D array compared against ``target`` (presumably predicted ids
        rather than raw logits, since values are compared for equality).

    Returns:
      The mean of ``np.equal`` over all positions of the padded arrays.
    """
    width = max(target.shape[1], logits.shape[1])

    def _right_pad(batch):
        # Extend axis 1 with zeros up to `width`; leave the array alone if it
        # is already wide enough.
        missing = width - batch.shape[1]
        if not missing:
            return batch
        return np.pad(batch, [(0,0),(0,missing)], 'constant')

    matches = np.equal(_right_pad(target), _right_pad(logits))
    return np.mean(matches)

In [37]:
# Reload the rnnlm module so edits to rnnlm.py are picked up, then construct the
# model from model_params and build its core and training graphs.
import rnnlm; reload(rnnlm)

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

In [None]:
reload(utils)

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()

# Clear old log directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

# NOTE(review): get_batches is not defined or imported anywhere in the visible
# notebook. From its use here it is expected to yield
# (sources_batch, targets_batch, sources_lengths, targets_lengths) tuples --
# confirm where it lives and import it before running this cell.

# Keep the first batch of the test split as a fixed validation batch.
(valid_sources_batch, valid_targets_batch,
 valid_sources_lengths, valid_targets_lengths) = next(
    get_batches(test_x_ids, test_y_ids, batch_size, pad_int=1))

# Number of full batches per epoch, used only in the progress message.
# NOTE(review): accuracy is reported only when batch_i is a positive multiple of
# display_step; if an epoch has fewer batches than display_step nothing is printed.
batches_per_epoch = len(train_x_ids) // batch_size

with tf.Session(graph=lm.graph) as sess:
    sess.run(initializer)

    for epoch_i in range(num_epochs):
        train_batches = get_batches(train_x_ids, train_y_ids, batch_size, pad_int=1)
        for batch_i, (source_batch, target_batch, sources_lengths, targets_lengths) in enumerate(train_batches):

            # One optimisation step; dropout is switched on for training only.
            _, loss = sess.run(
                [lm.train_step_, lm.loss_],
                {lm.encoder_inputs_: source_batch,
                 lm.decoder_targets_: target_batch,
                 lm.learning_rate_: learning_rate,
                 lm.use_dropout_: True,
                 lm.target_sequence_length_: targets_lengths,
                 })

            if batch_i % display_step == 0 and batch_i > 0:
                # Inference passes (dropout off) on the current training batch
                # and on the fixed validation batch.
                batch_train_logits = sess.run(
                    lm.logits_inf_,
                    {lm.encoder_inputs_: source_batch,
                     lm.target_sequence_length_: targets_lengths,
                     lm.use_dropout_: False})

                batch_valid_logits = sess.run(
                    lm.logits_inf_,
                    {lm.encoder_inputs_: valid_sources_batch,
                     lm.target_sequence_length_: valid_targets_lengths,
                     lm.use_dropout_: False})

                train_acc = get_accuracy(target_batch, batch_train_logits)
                valid_acc = get_accuracy(valid_targets_batch, batch_valid_logits)

                print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.4f}, Validation Accuracy: {:>6.4f}, Loss: {:>6.4f}'
                      .format(epoch_i, batch_i, batches_per_epoch, train_acc, valid_acc, loss))

    # Save Model. `saver` was created inside lm.graph above; the model is written
    # to trained_filename, the path the inference cell restores from.
    saver.save(sess, trained_filename)
    print('Model Trained and Saved')
            


## Inference-WIP

In [None]:
def sample_step(lm, session, input_w, initial_h):
    """Run a single RNN step and return sampled predictions.

    Args:
      lm : rnnlm.RNNLM
      session: tf.Session
      input_w : [batch_size] vector of indices
      initial_h : [batch_size, hidden_dims] initial state

    Returns:
      final_h : final hidden state, compatible with initial_h
      samples : [batch_size, 1] vector of indices
    """
    # Reshape input to column vector
    input_w = np.array(input_w, dtype=np.int32).reshape([-1,1])

    # Fetch the new state and the samples in ONE run, and feed the incoming
    # state. Two separate runs would draw the samples and compute the state in
    # different executions, and omitting initial_h would ignore the state
    # carried over from the previous step.
    final_h, samples = session.run(
        [lm.final_h_, lm.pred_samples_],
        {lm.input_w_: input_w, lm.initial_h_: initial_h})

    # Note indexing here:
    #   [batch_size, max_time, 1] -> [batch_size, 1]
    return final_h, samples[:,-1,:]

In [None]:
# Generate num_samples sequences in parallel, extending each by one token per step.
max_steps = 20
num_samples = 10
random_seed = 42

# Rebuild the model with the sampler graph instead of the training graph.
lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildSamplerGraph()

with lm.graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    # NOTE(review): this call is made outside lm.graph.as_default() and after the
    # graph was built -- confirm that it actually seeds the sampling ops.
    tf.set_random_seed(random_seed)
    
    # Load the trained model
    saver.restore(session, trained_filename)

    # Make initial state for a batch with batch_size = num_samples
    w = np.repeat([[vocab.START_ID]], num_samples, axis=0)
    h = session.run(lm.initial_h_, {lm.input_w_: w})
    # We'll take one step for each sequence on each iteration 
    for i in range(max_steps):
        # Feed only the most recent token of every sequence, plus the running state.
        h, y = sample_step(lm, session, w[:,-1:], h)
        w = np.hstack((w,y))

    # Print generated sentences
    for row in w:
        for i, word_id in enumerate(row):
            print(vocab.id_to_word[word_id], end=" ")
            # Stop printing a row once START_ID shows up again after position 0.
            if (i != 0) and (word_id == vocab.START_ID):
                break
        print("")

## Score New Data - Placeholder

## Reference: NMT Tutorial
https://github.com/tensorflow/nmt

https://towardsdatascience.com/seq2seq-model-in-tensorflow-ec0c557e560f; https://github.com/deep-diver/EN-FR-MLT-tensorflow/blob/master/dlnd_language_translationv2.ipynb

In [34]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=None):
    """Run the model over every batch once and return the mean per-batch loss.

    Args:
      lm: model object; this function reads its loss_, train_step_,
        encoder_inputs_, decoder_inputs_, decoder_outputs_, encoder_initial_h_,
        encoder_final_h_, learning_rate_ and use_dropout_ attributes.
      session: tf.Session used to run the ops.
      batch_iterator: iterable of (en_in, de_in, de_out) batches.
      train: if True, run lm.train_step_ with dropout enabled; otherwise run a
        no-op with dropout disabled.
      verbose: if True, print a progress line at most every tick_s seconds.
      tick_s: minimum number of seconds between progress lines.
      learning_rate: value fed to lm.learning_rate_; required.

    Returns:
      Sum of the per-batch loss values divided by the number of batches.

    Raises:
      AssertionError: if learning_rate is None.
      ZeroDivisionError: if batch_iterator yields no batches.
    """
    assert(learning_rate is not None)
    start_time = time.time()
    tick_time = start_time  # for showing status
    total_cost = 0.0  # running sum of the per-batch loss values
    total_batches = 0
    total_words = 0

    loss = lm.loss_
    if train:
        train_op = lm.train_step_
        use_dropout = True
    else:
        train_op = tf.no_op()
        use_dropout = False  # no dropout at test time

    for i, (en_in, de_in, de_out) in enumerate(batch_iterator):

        # At first batch in epoch, get a clean initial state.
        if i == 0:
            h = session.run(lm.encoder_initial_h_, {lm.encoder_inputs_: en_in})

        feed_dict = {
            lm.encoder_inputs_: en_in,
            lm.decoder_outputs_: de_out,
            lm.decoder_inputs_: de_in,
            lm.encoder_initial_h_: h,
            lm.learning_rate_: learning_rate,
            lm.use_dropout_: use_dropout
        }
        # This is a plain function, so the final state has to be read from `lm`
        # (there is no `self` in scope here).
        # NOTE(review): the original author was unsure whether the encoder or the
        # decoder final state should be carried over to the next batch -- confirm.
        ops = [loss, lm.encoder_final_h_, train_op]

        vals = session.run(ops, feed_dict)
        cost = vals[0]  # loss
        h = vals[1]     # final state, fed back in as the next batch's initial state

        total_cost += cost
        total_batches = i + 1
        # NOTE(review): counts the elements of the decoder target batch as the
        # "words" seen; the original referenced an undefined `w` -- confirm which
        # batch should be counted.
        total_words += np.size(de_out)

        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print("[batch {:d}]: seen {:d} words at {:.1f} wps, loss = {:.3f}".format(
                i, total_words, avg_wps, avg_cost))
            tick_time = time.time()  # reset time ticker

    return total_cost / total_batches