In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# Seed TensorFlow's and NumPy's global random number generators with a fixed value (1).
# NOTE(review): presumably intended to make training repeatable between runs — confirm that
# every source of randomness used later is actually covered by these two seeds.
tf.random.set_seed(1)
np.random.seed(1)

# Hand-written toy corpus: 20 records, each a dict with a 'paragraph' (free text) and a
# 'category' label. The first 10 records are labelled 'food', the last 10 'sports'.
paragraph_dict_list = [
    {
        'paragraph': 'dishplace is located in sunnyvale downtown there is parking around the area but it can be difficult to find during peak business hours my sisters and i came to this place for dinner on a weekday they were really busy so i highly recommended making reservations unless you have the patience to wait',
        'category': 'food'},
    {
        'paragraph': 'service can be slower during busy hours but our waiter was courteous and help gave some great entree recommendations',
        'category': 'food'},
    {
        'paragraph': 'portions are huge both french toast and their various omelettes are really good their french toast is probably 1.5x more than other brunch places great place to visit if you are hungry and dont want to wait 1 hour for a table',
        'category': 'food'},
    {
        'paragraph': 'we started with apps going the chicken and waffle slides and chicken nachos the sliders were amazing and the nachos were good too maybe by themselves the nachos would have scored better but after those sliders they were up against some tough competition',
        'category': 'food'},
    {
        'paragraph': 'the biscuits and gravy was too salty two people in my group had the gravy and all thought it was too salty my hubby ordered a side of double egg and it was served on two small plates who serves eggs to one person on separate plates we commented on that when it was delivered and even the server laughed and said she doesnt know why the kitchen does that presentation of food is important and they really missed on this one',
        'category': 'food'},
    {
        'paragraph': 'the garlic fries were a great starter (and a happy hour special) the pancakes looked and tasted great and were a fairly generous portion',
        'category': 'food'},
    {
        'paragraph': 'our meal was excellent i had the pasta ai formaggi which was so rich i didnt dare eat it all although i certainly wanted to excellent flavors with a great texture contrast between the soft pasta and the crisp bread crumbs too much sauce for me but a wonderful dish',
        'category': 'food'},
    {
        'paragraph': 'what i enjoy most about palo alto is so many restaurants have dog-friendly seating outside i had bookmarked italico from when they first opened about a 1.5 years ago and was jonesing for some pasta so time to finally knock that bookmark off',
        'category': 'food'},
    {
        'paragraph': 'the drinks came out fairly quickly a good two to three minutes after the orders were taken i expected my iced tea to taste a bit more sweet but this was straight up green tea with ice in it not to complain of course but i was pleasantly surprised',
        'category': 'food'},
    {
        'paragraph': 'despite the not so good burger the service was so slow the restaurant wasnt even half full and they took very long from the moment we got seated to the time we left it was almost 2 hours we thought that it would be quick since we ordered as soon as we sat down my coworkers did seem to enjoy their beef burgers for those who eat beef however i will not be returning it is too expensive and extremely slow service',
        'category': 'food'},

    {
        'paragraph': 'the four reigning major champions simona halep caroline wozniacki angelique kerber and defending us open champion sloane stephens could make a case for being the quartet most likely to succeed especially as all but stephens has also enjoyed the no1 ranking within the last 14 months as they prepare for their gruelling new york campaigns they currently hold the top four places in the ranks',
        'category': 'sports'},
    {
        'paragraph': 'the briton was seeded nn7 here last year before a slump in form and confidence took her down to no46 after five first-round losses but there have been signs of a turnaround including a victory over a sub-par serena williams in san jose plus wins against jelena ostapenko and victoria azarenka in montreal. konta pulled out of new haven this week with illness but will hope for good things where she first scored wins in a major before her big breakthroughs to the semis in australia and wimbledon',
        'category': 'sports'},
    {
        'paragraph': 'stephens surged her way back from injury in stunning style to win her first major here last year—and ranked just no83 she has since proved what a big time player she is winning the miami title via four fellow major champions then reaching the final at the french open back on north american hard courts she ran to the final in montreal only just edged out by halep she has also avoided many of the big names in her quarter—except for wild card azarenka as a possible in the third round',
        'category': 'sports'},
    {
        'paragraph': 'when it came to england chances in the world cup it would be fair to say that most fans had never been more pessimistic than they were this year after enduring years of truly dismal performances at major tournaments – culminating in the 2014 event where they failed to win any of their three group games and finished in bottom spot those results led to the resignation of manager roy hodgson',
        'category': 'sports'},
    {
        'paragraph': 'the team that eliminated russia – croatia – also improved enormously during the tournament before it began their odds were 33/1 but they played with real flair and star players like luka modric ivan rakitic and ivan perisic showed their quality on the world stage having displayed their potential by winning all three of their group stage games croatia went on to face difficult tests like the semi-final against england',
        'category': 'sports'},
    {
        'paragraph': 'the perseyside outfit finished in fourth place in the premier league table and without a trophy last term after having reached the champions league final before losing to real madrid',
        'category': 'sports'},
    {
        'paragraph': 'liverpool fc will return to premier league action on saturday lunchtime when they travel to leicester city in the top flight as they look to make it four wins in a row in the league',
        'category': 'sports'},
    {
        'paragraph': 'alisson signed for liverpool fc from as roma this summer and the brazilian goalkeeper has helped the reds to keep three clean sheets in their first three premier league games',
        'category': 'sports'},
    {
        'paragraph': 'but the rankings during that run-in to new york hid some very different undercurrents for murray had struggled with a hip injury since the clay swing and had not played a match since losing his quarter-final at wimbledon and he would pull out of the us open just two days before the tournament began—too late however to promote nederer to the no2 seeding',
        'category': 'sports'},
    {
        'paragraph': 'then came the oh-so-familiar djokovic-nadal no-quarter-given battle for dominance in the third set there were exhilarating rallies with both chasing to the net both retrieving what looked like winning shots nadal more than once pulled off a reverse smash and had his chance to seal the tie-break but it was djokovic serving at 10-9 who dragged one decisive error from nadal for a two-sets lead',
        'category': 'sports'}
]
# Build one DataFrame row per record; the dict keys become the 'paragraph' and 'category' columns.
df = pd.DataFrame(paragraph_dict_list)
# Show only the first five rows as a sanity check.
print(df.head())

                                           paragraph category
0  dishplace is located in sunnyvale downtown the...     food
1  service can be slower during busy hours but ou...     food
2  portions are huge both french toast and their ...     food
3  we started with apps going the chicken and waf...     food
4  the biscuits and gravy was too salty two peopl...     food


In [2]:
def get_vocab_size(df):
    """Count the distinct tokens found in the 'paragraph' column of ``df``.

    Each paragraph is lower-cased and split on runs of whitespace; punctuation
    is left attached to its token, so "montreal." and "montreal" count as two
    different entries.

    Args:
        df: DataFrame with a string column named 'paragraph'.

    Returns:
        The number of unique tokens across all rows, as an int.
    """
    vocabulary = set()
    token_lists = df['paragraph'].str.lower().str.split()
    for tokens in token_lists:
        # Merge this paragraph's tokens into the running vocabulary.
        vocabulary.update(tokens)
    return len(vocabulary)

# Number of distinct whitespace-separated tokens in the corpus. Later cells reuse this value
# as the hash-space size passed to one_hot() and as the Embedding layer's input dimension.
vocab_size = get_vocab_size(df)
print('vocab_size:', vocab_size)

vocab_size: 536


In [3]:
# Plain Python list of the paragraph strings, in DataFrame row order.
paragraphs = df['paragraph'].tolist()
# NOTE(review): this prints the entire corpus into the cell output; a short slice would be
# enough for a sanity check.
print(paragraphs)

['dishplace is located in sunnyvale downtown there is parking around the area but it can be difficult to find during peak business hours my sisters and i came to this place for dinner on a weekday they were really busy so i highly recommended making reservations unless you have the patience to wait', 'service can be slower during busy hours but our waiter was courteous and help gave some great entree recommendations', 'portions are huge both french toast and their various omelettes are really good their french toast is probably 1.5x more than other brunch places great place to visit if you are hungry and dont want to wait 1 hour for a table', 'we started with apps going the chicken and waffle slides and chicken nachos the sliders were amazing and the nachos were good too maybe by themselves the nachos would have scored better but after those sliders they were up against some tough competition', 'the biscuits and gravy was too salty two people in my group had the gravy and all thought i

In [4]:
# Turn every paragraph into a list of integer word ids with Keras' one_hot(), using
# vocab_size as the size of the id space.
# NOTE(review): the recorded output shows different words sharing one id (in the first
# paragraph both "be" and "place" are encoded as 86), so ids are not unique per word when the
# id space is only as large as the vocabulary. Consider a larger n or an explicit word index.
# NOTE(review): the Keras documentation describes the default hash used here as not stable
# across runs — TODO confirm for the installed version if reproducible ids are required.
encoded_paragraphs = [one_hot(paragraph, vocab_size) for paragraph in paragraphs]
print(encoded_paragraphs)

[[428, 242, 94, 333, 249, 46, 98, 242, 352, 75, 166, 23, 425, 362, 504, 86, 62, 121, 399, 143, 190, 189, 466, 293, 209, 302, 40, 353, 121, 145, 86, 349, 112, 61, 503, 519, 534, 284, 301, 261, 211, 40, 524, 415, 200, 415, 531, 269, 18, 166, 214, 121, 146], [225, 504, 86, 351, 143, 261, 466, 425, 133, 323, 46, 123, 302, 133, 437, 116, 225, 158, 418], [441, 97, 512, 118, 346, 181, 302, 439, 437, 522, 97, 301, 218, 439, 346, 181, 242, 284, 43, 388, 96, 12, 111, 505, 218, 225, 86, 121, 38, 302, 269, 97, 466, 302, 379, 528, 121, 146, 43, 237, 349, 503, 249], [377, 155, 521, 1, 406, 166, 172, 302, 449, 183, 302, 172, 297, 166, 391, 284, 97, 302, 166, 297, 284, 218, 430, 535, 243, 484, 166, 297, 150, 18, 108, 260, 425, 223, 270, 391, 534, 284, 158, 264, 116, 150, 370], [166, 504, 302, 237, 46, 430, 279, 296, 483, 333, 293, 306, 480, 166, 237, 302, 306, 46, 362, 46, 430, 279, 293, 221, 494, 503, 531, 348, 25, 532, 302, 362, 46, 140, 61, 296, 132, 99, 104, 522, 225, 121, 44, 530, 61, 119, 99, 37

In [5]:
def get_max_length(df):
    """Return the largest word count of any paragraph in ``df``.

    Words are delimited by runs of whitespace (``str.split()`` with no
    argument), matching how ``get_vocab_size`` tokenizes. Repeated, leading or
    trailing spaces therefore do not inflate the count with empty tokens.

    Args:
        df: DataFrame with a string column named 'paragraph'.

    Returns:
        The maximum number of words in a single paragraph, or 0 when the
        column has no rows.
    """
    # default=0 keeps the "no rows" result that the original loop produced.
    return max((len(text.split()) for text in df['paragraph']), default=0)


# Size the padding window from the sequences that will actually be padded, not only from a
# raw word count. one_hot() also splits on punctuation (the recorded encoding turns "1.5x"
# into two ids), so a paragraph can encode to more ids than get_max_length() counts; a window
# sized from the raw count alone would make pad_sequences() drop ids from such paragraphs.
max_length = max(
    get_max_length(df),
    max((len(sequence) for sequence in encoded_paragraphs), default=0),
)
max_length

91

In [6]:
# Bring every encoded paragraph to exactly max_length ids; padding='post' appends the filler
# zeros after the real ids (visible as trailing 0s in the printed array).
# NOTE(review): any sequence longer than max_length would be truncated by pad_sequences —
# confirm max_length covers the longest encoded paragraph.
padded_paragraphs_encoding = pad_sequences(encoded_paragraphs, maxlen=max_length, padding='post')
print(padded_paragraphs_encoding)

[[428 242  94 ...   0   0   0]
 [225 504  86 ...   0   0   0]
 [441  97 512 ...   0   0   0]
 ...
 [393 227 349 ...   0   0   0]
 [425 166 485 ...   0   0   0]
 [504 353 166 ...   0   0   0]]


In [7]:
# Label strings ('food' / 'sports') in the same row order as the paragraphs.
categories = df['category'].tolist()
def category_encode(category):
    """Map a category label to a two-element one-hot list.

    Args:
        category: The label string.

    Returns:
        ``[1, 0]`` when ``category`` is exactly 'food'; ``[0, 1]`` for every
        other value (so any label that is not 'food' lands in the second class).
    """
    # A fresh list is built on every call, so callers may mutate the result safely.
    return [1, 0] if category == 'food' else [0, 1]
# One-hot targets, one two-element list per paragraph: [1, 0] for 'food', [0, 1] otherwise.
encoded_category = [category_encode(category) for category in categories]
print(encoded_category)

[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1]]


In [8]:
# Sequence classifier: word ids -> 5-dimensional embeddings -> GRU(64) -> Dense(32, relu)
# -> 2-way softmax over the one-hot category targets.
model = Sequential()
# input_dim is vocab_size, so every word id fed to the model must be below vocab_size.
# NOTE(review): inputs are post-padded with 0 but no masking is configured on this layer, so
# the recurrent layer presumably also processes the padding steps — consider mask_zero=True
# after confirming id 0 is never used for a real word.
model.add(Embedding(vocab_size, 5, input_length=max_length))
# NOTE(review): the recorded summary output lists an "lstm (LSTM)" layer while this cell
# builds a GRU, so the saved output looks like it comes from an earlier version — re-run.
model.add(GRU(64))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))

# categorical_crossentropy pairs with the softmax output and the one-hot encoded labels.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 91, 5)             2680      
_________________________________________________________________
lstm (LSTM)                  (None, 64)                17920     
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 66        
Total params: 22,746
Trainable params: 22,746
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Inputs: the padded id sequences; targets: the one-hot category lists, both as NumPy arrays.
x_train = np.array(padded_paragraphs_encoding)
y_train = np.array(encoded_category)
# Train on all 20 samples (two batches of 10 per epoch) for 50 epochs; nothing is held out.
model.fit(x_train, y_train,batch_size=10,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f02a45e4350>

In [10]:
# Returns [loss, accuracy] as configured in compile().
# NOTE(review): this scores the model on the same data it was trained on, so the reported
# accuracy says nothing about generalisation — evaluate on held-out paragraphs instead.
model.evaluate(x_train, y_train)



[0.0020458095241338015, 1.0]