In [4]:
import re
from collections import Counter
from string import punctuation

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

# Checking IMDB sentiment analysis

Analyse a random subset of the reviews (`N = 5000` examples in the cell below) to find which words are candidate distractors.

In [5]:
# Number of reviews to analyse; the thresholds in the cells below are
# expressed as fractions of N.
N = 5000

# Load the combined train and test splits and keep only the first N examples.
ds = tfds.load(
    'imdb_reviews',
    split='train + test',
    shuffle_files=True,
).take(N)

In [6]:
# Count, separately for negative (label 0) and positive (label 1) reviews,
# the number of reviews in which each token occurs. Tokens are the
# whitespace-separated byte strings of the raw review text; wrapping them in
# set() makes a token count at most once per review.
count_neg = Counter()
count_pos = Counter()
for d in ds:
    tokens = set(d['text'].numpy().split())
    # Counter.update adds the counts in place. `count += Counter(tokens)`
    # gives the same totals but builds a temporary Counter and rescans the
    # whole accumulated counter on every review.
    if d['label'] == 0:
        count_neg.update(tokens)
    elif d['label'] == 1:
        count_pos.update(tokens)

# Show the 20 most frequent tokens of each class side by side
# (positive on the left, negative on the right).
for i, j in zip(count_pos.most_common(20), count_neg.most_common(20)):
    print(i, j)


(b'the', 2453) (b'the', 2460)
(b'and', 2424) (b'a', 2400)
(b'a', 2400) (b'and', 2369)
(b'of', 2368) (b'of', 2349)
(b'to', 2308) (b'to', 2349)
(b'is', 2225) (b'is', 2190)
(b'in', 2153) (b'in', 2123)
(b'this', 2041) (b'this', 2097)
(b'that', 1912) (b'that', 1955)
(b'it', 1851) (b'it', 1919)
(b'for', 1704) (b'I', 1876)
(b'with', 1695) (b'for', 1709)
(b'I', 1693) (b'with', 1683)
(b'as', 1573) (b'was', 1678)
(b'but', 1553) (b'but', 1655)
(b'was', 1519) (b'The', 1560)
(b'The', 1505) (b'on', 1528)
(b'on', 1472) (b'/><br', 1483)
(b'/><br', 1418) (b'have', 1471)
(b'are', 1339) (b'be', 1461)


In [7]:
# Select the candidate "split" words, i.e. tokens that
# a) are about equally likely to appear in positive and negative reviews
#    (their document counts differ by at most 0.5*0.05*N), and
# b) appear in roughly half of each class (document count strictly between
#    0.5*0.44*N and 0.5*0.56*N in both classes).
# The 0.5*N factor stands in for the number of reviews per class, so the
# thresholds assume a roughly label-balanced sample.
#
# The counters are only read here, never pruned: the cell can be re-run and
# the later cells still see the full counts. Tokens from BOTH counters are
# considered; a Counter returns 0 for a missing key, so tokens seen in only
# one class are simply rejected by the range checks.
max_gap = 0.5*0.05*N
low, high = 0.5*0.44*N, 0.5*0.56*N

split_words = sorted(
    c for c in set(count_pos) | set(count_neg)
    if abs(count_pos[c] - count_neg[c]) <= max_gap
    and low < count_pos[c] < high
    and low < count_neg[c] < high
)

# Print each selected word with the percentage of positive / negative reviews
# containing it (again taking N/2 reviews per class).
for c in split_words:
    print(c, 200.0*count_pos[c]/N, 200.0*count_neg[c]/N)



b'an' 46.76 46.92
b'are' 53.56 53.56
b'film' 48.04 45.32
b'one' 49.28 48.68
b'you' 45.6 48.52


In [8]:
# Exclude HTML line-break fragments such as b'/><br'. They are filtered by
# content rather than by position: dropping the first sorted element would
# remove a real word whenever no such fragment made it into split_words.
# Filtering is also idempotent, so re-running this cell is harmless.
split_words = [w for w in split_words if b'<br' not in w]

# Create a new dataset that has additional labels

In [13]:
# Reload the complete dataset (train and test splits) as a pandas DataFrame.
ds = tfds.load('imdb_reviews', split='train + test', shuffle_files=True)
pd_df = tfds.as_dataframe(ds)

# Decode the raw byte strings once, up front, instead of once per word.
pd_df['text'] = pd_df['text'].str.decode("utf-8")

# Add one 0/1 indicator column per split word: has_<word> is 1 when the word
# occurs in the review as a whole word (regex word boundaries on both sides).
for word in split_words:
    w = word.decode("utf-8")
    label = 'has_' + w
    # re.escape keeps regex metacharacters in a token from being interpreted.
    regex_word = rf"\b{re.escape(w)}\b"
    pd_df[label] = pd_df['text'].str.contains(regex_word).astype('int')

NameError: name 'split_words' is not defined

In [7]:
# Preview the first rows to check the decoded text and the has_<word> columns.
pd_df.head()

Unnamed: 0,label,text,has_are,has_at,has_film,has_one,has_you
0,0,This was an absolutely terrible movie. Don't b...,1,0,0,0,0
1,0,"I have been known to fall asleep during films,...",0,1,1,0,0
2,0,Mann photographs the Alberta Rocky Mountains i...,0,0,0,0,0
3,1,This is the kind of film for a snowy Sunday af...,1,1,1,1,1
4,1,"As others have mentioned, all the women that g...",1,1,1,0,0


# Save that dataset as a zipped csv

In [8]:
# Directory the derived files are written to.
saveto = '~/tensorflow_datasets/imdb_reviews/plain_text/'

# Write a zip archive whose single member is named out.csv.
compression_opts = {'method': 'zip', 'archive_name': 'out.csv'}

# Store the review table with the additional has_<word> columns, without the
# DataFrame index.
pd_df.to_csv(
    saveto + 'with_additional_labels.zip',
    index=False,
    compression=compression_opts,
)

In [19]:
# Also generate the vocabulary in a fixed (frequency-sorted) order and store it.

# Standardize: lower-case every review and delete all punctuation characters.
strip_punctuation = str.maketrans('', '', punctuation)
reviews = pd_df['text'].map(str.lower)
reviews = reviews.map(lambda text: text.translate(strip_punctuation))

# Count every whitespace-separated word over all reviews.
all_text = ' '.join(reviews)
words = all_text.split()
count_words = Counter(words)
total_words = len(words)
# Asking for total_words entries returns every distinct word, most frequent first.
sorted_words = count_words.most_common(total_words)

print('Vocabulary size:', len(sorted_words))

# Keep only the words that occur more than once.
vocab = pd.DataFrame(sorted_words, columns=['word', 'count'])
vocab = vocab.loc[lambda frame: frame['count'] > 1]

print('Filtered vocabulary size:', len(vocab))

vocab.to_csv(
    saveto + 'wordcounts.zip',
    index=False,
    compression=compression_opts,
)

Vocabulary size: 181685
Filtered vocabulary size: 81743


In [18]:
# Inspect the last rows of the vocabulary table.
vocab.tail()

Unnamed: 0,word,count
81738,spore,2
81739,encrypt,2
81740,huntress,2
81741,chepart,2
81742,elr,2


# Search for natural partial exposures

In [11]:
# Keep the words that occur in between 40% and 60% of the N sampled reviews
# (positive and negative document counts combined), and record their
# per-class counts as (count_pos, count_neg).
# Tokens from BOTH counters are considered; a Counter returns 0 for a missing
# key, so tokens seen in only one class are handled as well. Iterating in
# sorted order makes the displayed dictionary reproducible across sessions.
filtered = {}
for c in sorted(set(count_pos) | set(count_neg)):
    if 0.4*N < count_pos[c] + count_neg[c] < 0.6*N:
        filtered[c] = (count_pos[c], count_neg[c])

filtered

{b'not': (1283, 1418),
 b'have': (1275, 1471),
 b'his': (1078, 949),
 b'by': (1202, 1069),
 b'be': (1246, 1461),
 b'an': (1169, 1173),
 b'film': (1201, 1133),
 b'all': (1078, 1107),
 b'like': (981, 1165),
 b'are': (1339, 1339),
 b'one': (1232, 1217),
 b'movie': (1171, 1383),
 b'from': (1075, 1075),
 b'you': (1140, 1213),
 b'at': (1140, 1276),
 b'/><br': (1418, 1483),
 b'who': (1078, 1003)}

In [32]:
# Normalized class imbalance of every word:
#   (count_neg - count_pos) / (count_neg + count_pos)
# -1.0 means the word was seen only in positive reviews, +1.0 only in negative
# reviews, 0.0 equally often in both.
#
# word_diffs maps word -> ratio and is lossless. Words seen only in negative
# reviews are visited too, after all words of count_pos.
word_diffs = {}
for w in list(count_pos) + [w for w in count_neg if w not in count_pos]:
    pc, nc = count_pos[w], count_neg[w]
    word_diffs[w] = (nc - pc) / (nc + pc)

# diffs maps ratio -> word, the shape the cells below expect.
# NOTE: this view is lossy. Many words share a ratio (e.g. every word seen
# only in positive reviews has -1.0) and only the last one visited is kept.
# Use word_diffs when every word matters.
diffs = {ratio: w for w, ratio in word_diffs.items()}

In [15]:
# Number of negative reviews that contain the token b'bad'.
count_neg[b'bad']

662

In [33]:
# Display the `diffs` dictionary (normalized count difference -> word).
diffs

{-0.75: b'sailor',
 0.09090909090909091: b'seventies',
 -0.06422018348623854: b'(as',
 -0.22676579925650558: b'life',
 0.07058823529411765: b'go',
 -1.0: b'end!!!',
 -0.10429447852760736: b'New',
 0.20520231213872833: b'no',
 0.2481012658227848: b'plot',
 0.01803713527851459: b'it',
 -0.14285714285714285: b'television,',
 0.0847457627118644: b'couple',
 0.11945392491467577: b'sense',
 0.020508613617719443: b'into',
 -0.22448979591836735: b"haven't",
 0.5: b'Rhodes',
 0.04086021505376344: b'big',
 -0.6: b'Atlantis',
 -0.012329656067488644: b'when',
 -0.003552397868561279: b'with',
 -0.3333333333333333: b'acted.<br',
 -0.13131313131313133: b'dramatic',
 0.047619047619047616: b'field',
 0.056291390728476824: b'at',
 0.15473441108545036: b'kind',
 0.0014247913698351314: b'the',
 -0.011475067807218861: b'and',
 0.0: b'me-',
 0.008803951041442989: b'to',
 -0.5238095238095238: b'heroic',
 -0.49074074074074076: b'performances',
 0.1402805611222445: b'just',
 -0.02913453299057412: b'film',
 0.1

In [34]:
import collections

# Re-insert the entries of `diffs` in ascending key order, so that iterating
# over `d` goes from the lowest to the highest ratio.
d = collections.OrderedDict()
for ratio in sorted(diffs):
    d[ratio] = diffs[ratio]

In [37]:
# Print every (ratio, word) pair of the ordered mapping, one per line.
for ratio, word in d.items():
    print((ratio, word))

(-1.0, b'end!!!')
(-0.9259259259259259, b'perfect,')
(-0.9230769230769231, b'exceptional')
(-0.8947368421052632, b'Highly')
(-0.8888888888888888, b'classical')
(-0.8775510204081632, b'wonderfully')
(-0.875, b'friendship,')
(-0.8666666666666667, b'wonderful.')
(-0.8536585365853658, b'captures')
(-0.8518518518518519, b'simple,')
(-0.8461538461538461, b'9/10')
(-0.8333333333333334, b'riveting')
(-0.8285714285714286, b'magnificent')
(-0.8181818181818182, b'Castle')
(-0.8125, b'amazing.')
(-0.8, b'Buddy')
(-0.7894736842105263, b'appreciated')
(-0.7857142857142857, b'friendship')
(-0.7837837837837838, b'gem')
(-0.7777777777777778, b'good!')
(-0.7647058823529411, b'recognition')
(-0.76, b'10/10')
(-0.75, b'sailor')
(-0.7333333333333333, b'fear,')
(-0.7222222222222222, b'finest')
(-0.7142857142857143, b'"House')
(-0.7, b'Later')
(-0.6923076923076923, b'smiles')
(-0.6842105263157895, b'warmth')
(-0.6764705882352942, b'beautifully')
(-0.6756756756756757, b'haunting')
(-0.6666666666666666, b'Shao

In [30]:
# Number of positive reviews that contain the token b'nothing'.
count_pos[b'nothing']

152

# Search for short representative reviews for figure

In [24]:
# Scan the whole dataset and print the label and raw text of every review
# that has fewer than 15 whitespace-separated tokens and does not contain the
# exact token b'film'.
ds = tfds.load('imdb_reviews', split='train + test', shuffle_files=True)
for example in ds:
    raw_text = example['text'].numpy()
    tokens = raw_text.split()
    if len(tokens) >= 15 or b'film' in tokens:
        continue
    print(example['label'].numpy(), raw_text)
    print('****************************************')


0 b'I hope this group of film-makers never re-unites.'
****************************************
1 b'This is a great movie. Too bad it is not available on home video.'
****************************************
1 b'Brilliant and moving performances by Tom Courtenay and Peter Finch.'
****************************************
0 b'Read the book, forget the movie!'
****************************************
0 b'Primary plot!Primary direction!Poor interpretation.'
****************************************
0 b"You'd better choose Paul Verhoeven's even if you have watched it."
****************************************
0 b'Ming The Merciless does a little Bardwork and a movie most foul!'
****************************************
0 b'Long, boring, blasphemous. Never have I been so glad to see ending credits roll.'
****************************************
0 b'More suspenseful, more subtle, much, much more disturbing....'
****************************************
0 b'This movie is terrible but it has some 

# Create dataset with `film` subbed out

In [26]:
# Reload the combined train and test splits and convert them to a pandas
# DataFrame, so the substitution below starts from the unmodified text.
ds = tfds.load(
    'imdb_reviews',
    split='train + test',
    shuffle_files=True,
)
pd_df = tfds.as_dataframe(ds)


In [27]:
# Word to remove from the reviews and the placeholder token that replaces it.
sub_word = 'film'
regex_sub_word = rf"\b{sub_word}\b"   # match sub_word only as a whole word
sub_with = 'AnyRandomWordNotInVocab'

# Decode the raw byte strings once and reuse the result for both steps.
decoded_text = pd_df['text'].str.decode("utf-8")

# has_<sub_word> is 1 when the original review contains the word, else 0.
label = 'has_' + sub_word
pd_df[label] = decoded_text.str.contains(regex_sub_word).astype('int')

# The replacement must be the plain placeholder. In a regex replacement
# template "\b" is a backspace character, not a word boundary, so passing
# rf"\b{sub_with}\b" would write two \x08 control characters around every
# substituted token.
pd_df['text'] = decoded_text.str.replace(regex_sub_word, sub_with, regex=True)


In [28]:
# Preview the first rows to check the has_film column and the substituted text.
pd_df.head()

Unnamed: 0,label,text,has_film
0,0,This was an absolutely terrible movie. Don't b...,0
1,0,"I have been known to fall asleep during films,...",1
2,0,Mann photographs the Alberta Rocky Mountains i...,0
3,1,This is the kind of AnyRandomWordNotInVocab ...,1
4,1,"As others have mentioned, all the women that g...",1


In [29]:
# Directory the derived files are written to.
saveto = '~/tensorflow_datasets/imdb_reviews/plain_text/'

# Write a zip archive whose single member is named out.csv.
compression_opts = {'method': 'zip', 'archive_name': 'out.csv'}

# Store the table with the substituted text as <sub_word>_subbed.zip, without
# the DataFrame index.
pd_df.to_csv(
    saveto + sub_word + '_subbed.zip',
    index=False,
    compression=compression_opts,
)