In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
sentences = [
    'I love my dog',
    'I love my cat'
]

In [3]:
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}


In [5]:
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!'
]
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


In [11]:
import tensorflow as tf
from tensorflow import keras


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)

padded = pad_sequences(sequences, maxlen=5)
print("\nWord Index = " , word_index)
print("\nSequences = " , sequences)
print("\nPadded Sequences:")
print(padded)


Word Index =  {'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}

Sequences =  [[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]

Padded Sequences:
[[ 0  4  2  1  3]
 [ 0  4  2  1  6]
 [ 0  5  2  1  3]
 [ 8  1  3  9 10]]


In [12]:
import tensorflow as tf
from tensorflow import keras


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

tokenizer = Tokenizer(num_words = 100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)

padded = pad_sequences(sequences, maxlen=5)
print("\nWord Index = " , word_index)
print("\nSequences = " , sequences)
print("\nPadded Sequences:")
print(padded)


Word Index =  {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

Sequences =  [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

Padded Sequences:
[[ 0  5  3  2  4]
 [ 0  5  3  2  7]
 [ 0  6  3  2  4]
 [ 9  2  4 10 11]]


In [13]:
# Try with words that the tokenizer wasn't fit to
test_data = [
    'i really love my dog',
    'my dog loves my manatee'
]

test_seq = tokenizer.texts_to_sequences(test_data)
print("\nTest Sequence = ", test_seq)

padded = pad_sequences(test_seq, maxlen=10)
print("\nPadded Test Sequence: ")
print(padded)


Test Sequence =  [[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]

Padded Test Sequence: 
[[0 0 0 0 0 5 1 3 2 4]
 [0 0 0 0 0 2 4 1 2 1]]


In [31]:
import json

with open("sarcasm.json", 'r') as f:
    datastore = json.load(f)
    
    
sentences = [] 
labels = []
urls = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

In [36]:
import tensorflow as tf
from tensorflow import keras


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)

padded = pad_sequences(sequences, padding = 'post')
print("\nSentences\n", sentences[2])
print("\nPadded Sequences:", sequences[2])
print(padded[2])
print(padded.shape)


Sentences
 mom starting to fear son's web series closest thing she will have to grandchild

Padded Sequences: [145, 838, 2, 907, 1749, 2093, 582, 4719, 221, 143, 39, 46, 2, 10736]
[  145   838     2   907  1749  2093   582  4719   221   143    39    46
     2 10736     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


In [33]:
print(word_index)



In [7]:
sens = []
labels = []
for line in open('Sarcasm_Headlines_Dataset.json', 'r'):
    temp = []
    temp.append(json.loads(line))
    sens.append(temp[0]["headline"])
    labels.append(temp[0]["is_sarcastic"])
# for line in open(dir2, 'r'):
#     temp = []
#     temp.append(json.loads(line))
#     sens.append(temp[0]["headline"])
#     labels.append(temp[0]["is_sarcastic"])
print("sens Length:",len(sens),"\nlabels Length:",len(labels))

sens Length: 26709 
labels Length: 26709


In [18]:
import json

with open("sarcasm.json", 'r') as f:
    datastore = json.load(f)
for line in datastore:
    print(line)

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}
{'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365', 'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse", 'is_sarcastic': 0}
{'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697', 'headline': "mom starting to fear son's web series closest thing she will have to grandchild", 'is_sarcastic': 1}
{'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302', 'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas', 'is_sarcastic': 1}
{'article_link': 'https://www.huffingtonpost.com/entry/jk-rowling-wishes-snape-happy-birthda

{'article_link': 'https://www.theonion.com/new-national-park-caters-to-business-travelers-1819577090', 'headline': 'new national park caters to business travelers', 'is_sarcastic': 1}
{'article_link': 'https://www.theonion.com/east-st-louis-rated-number-one-city-in-america-by-pove-1819565354', 'headline': "east st. louis rated number-one city in america by 'poverty magazine'", 'is_sarcastic': 1}
{'article_link': 'https://www.huffingtonpost.com/entry/the-corruption-beneath-cu_b_5750174.html', 'headline': "the corruption beneath cuomo's casino push", 'is_sarcastic': 0}
{'article_link': 'https://www.huffingtonpost.com/entry/see-me-as-a-woman-first-a_b_5290769.html', 'headline': 'see me as a woman first, a black woman second', 'is_sarcastic': 0}
{'article_link': 'https://www.theonion.com/grandson-has-long-hair-1819574236', 'headline': 'grandson has long hair', 'is_sarcastic': 1}
{'article_link': 'https://www.huffingtonpost.com/entry/podolski-selfie-germany-world-cup_n_5582954.html', 'headl

{'article_link': 'https://www.theonion.com/report-this-not-a-gun-1825021641', 'headline': 'report: this not a gun', 'is_sarcastic': 1}
{'article_link': 'https://www.theonion.com/campus-tour-guides-reminded-to-use-official-name-for-ra-1819576057', 'headline': 'campus tour guides reminded to use official name for rape hall', 'is_sarcastic': 1}
{'article_link': 'https://www.huffingtonpost.com/entry/egyptian-jon-stewart-bassem-youssef-introduces-muslim-morning-after-kit_us_599af266e4b0e8cc855eff5e', 'headline': "'egyptian jon stewart' bassem youssef introduces 'muslim morning after kit'", 'is_sarcastic': 0}
{'article_link': 'https://www.theonion.com/u-s-postal-service-appoints-first-leather-clad-postmis-1819592784', 'headline': 'u.s. postal service appoints first leather-clad postmistress general', 'is_sarcastic': 1}
{'article_link': 'https://www.huffingtonpost.com/entry/the-new-york-times-has-suspended-glenn-thrush-amid-sexual-misconduct-claims_us_5a12fa8be4b0c335e9961534', 'headline': 't

{'article_link': 'https://www.theonion.com/unpopular-student-ridiculed-mercilessly-in-teachers-lou-1819565441', 'headline': "unpopular student ridiculed mercilessly in teacher's lounge", 'is_sarcastic': 1}
{'article_link': 'https://entertainment.theonion.com/teen-reports-saturday-night-live-has-sucked-since-chris-1819567816', 'headline': 'teen reports saturday night live has sucked since chris kattan left', 'is_sarcastic': 1}
{'article_link': 'https://www.huffingtonpost.com/entry/janelle-monae-issa-rae-and-more-honored-at-essence-black-women-in-hollywood_us_58b07479e4b0780bac28fa27', 'headline': 'the essence women in hollywood event was full of black girl magic', 'is_sarcastic': 0}
{'article_link': 'https://www.theonion.com/dazed-jeff-bezos-realizes-he-spent-entire-conversation-1822418205', 'headline': 'dazed jeff bezos realizes he spent entire conversation thinking about how to automate person talking to him', 'is_sarcastic': 1}
{'article_link': 'https://www.theonion.com/ray-lahood-re

{'article_link': 'https://www.huffingtonpost.com/entry/are-you-missing-something_b_7028506.html', 'headline': 'are you missing something vital from your growth hacking strategy?', 'is_sarcastic': 0}
{'article_link': 'https://www.huffingtonpost.com/entry/the-bachelorette-whaboom-guy-was-the-actual-worst_us_5923a80de4b034684b0f309b', 'headline': "the bachelorette's 'whaboom' guy was the actual worst", 'is_sarcastic': 0}
{'article_link': 'https://www.huffingtonpost.com/entry/lizzie-fletcher-wins-democratic-primary-texas-7th-houston-laura-moser_us_5b049815e4b0784cd2af5303', 'headline': 'establishment-backed moderate wins heated democratic house primary in texas', 'is_sarcastic': 0}
{'article_link': 'https://politics.theonion.com/white-house-begins-christmas-season-with-ceremonial-lig-1820917284', 'headline': 'white house begins christmas season with ceremonial lighting of cross', 'is_sarcastic': 1}
{'article_link': 'https://www.theonion.com/state-s-abortion-waiting-period-allows-women-to-e

{'article_link': 'https://www.theonion.com/worlds-most-advanced-yo-yo-doesnt-need-you-1819588092', 'headline': "world's most advanced yo-yo doesn't need you", 'is_sarcastic': 1}
{'article_link': 'https://www.huffingtonpost.com/entry/joe-arpaio-obama-birth-certificate_us_5a572bc1e4b0a300f905f893', 'headline': 'joe arpaio revives racist obama birther conspiracy', 'is_sarcastic': 0}
{'article_link': 'https://www.huffingtonpost.com/entry/trump-press-briefings_us_5916149de4b00f308cf53be1', 'headline': 'trump suggests he could handle press briefings instead of sean spicer', 'is_sarcastic': 0}
{'article_link': 'https://local.theonion.com/daily-spin-class-only-thing-keeping-mom-from-driving-ca-1819576926', 'headline': 'daily spin class only thing keeping mom from driving car full of kids into ocean', 'is_sarcastic': 1}
{'article_link': 'https://www.huffingtonpost.com/entry/the-mesmerizing-photograp_b_6925338.html', 'headline': 'the mesmerizing photographs of eva schlegel arrive at park hyatt v

{'article_link': 'https://www.huffingtonpost.com/entry/california-shooters-likely-planned-multiple-attacks-officials_us_5664c5ace4b08e945fefe153', 'headline': 'california shooters likely planned multiple attacks: officials', 'is_sarcastic': 0}
{'article_link': 'https://www.huffingtonpost.com/entry/hillary-clinton-minumum-wage_us_57139f45e4b06f35cb6fd5da', 'headline': 'hillary clinton clarifies her stance on $15 minimum wage', 'is_sarcastic': 0}
{'article_link': 'https://www.theonion.com/hog-executed-farmland-style-1819567062', 'headline': 'hog executed farmland style', 'is_sarcastic': 1}
{'article_link': 'https://www.huffingtonpost.com/entry/kickstarter-aims-to-bring-book-on-black-boy-joy-to-public-schools-across-america_us_59482efbe4b0edb84c14d17b', 'headline': 'kickstarter aims to give book on black boy joy to public schools', 'is_sarcastic': 0}
{'article_link': 'https://www.theonion.com/new-nba-starter-jackets-to-come-with-unwanted-pregnanci-1819586346', 'headline': 'new nba starter

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [1]:
random = [5, 9, 'cat']

# converting the list to an iterator
random_iterator = iter(random)
print(random_iterator)

<list_iterator object at 0x000001A6E3E80E80>


In [6]:
next(random_iterator)

StopIteration: 