In [0]:
# This notebook is based on the following Real Python blog post:
# https://realpython.com/python-keras-text-classification/#reader-comments

In [0]:
# --- Run configuration ------------------------------------------------------
# Names: DATASET_NAME is the suffix of the pickled feature/label file names.
DATASET_NAME = "test"
MODEL_NAME = "test"

# Data sizing: the caps are applied when slicing the train / dev arrays.
MAX_SEQ_LENGTH = 20
TRAINING_SET_SIZE = 10 ** 6
VAL_SET_SIZE = 10 ** 6

# Optimisation settings passed to model.fit.
NUM_EPOCHS = 15
BATCH_SIZE = 100

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import HashingVectorizer
from scipy.sparse import hstack, vstack
from keras.preprocessing.text import Tokenizer
from tensorflow.keras import Sequential, layers
from keras.utils import plot_model
import pandas as pd
import numpy as np
import copy
import time
import pickle
# Install gcsfs into the runtime (presumably what lets pd.read_csv open the
# gs:// paths used below — TODO confirm).
# NOTE(review): the install is unpinned; the recorded run resolved gcsfs-0.3.1.
!pip install gcsfs

# Allow up to 100 characters per column when pandas renders a DataFrame.
pd.set_option('max_colwidth', 100)

Using TensorFlow backend.


Collecting gcsfs
[?25l  Downloading https://files.pythonhosted.org/packages/ab/92/0297f2813cb240c52e90f8587420149970565800e019e1b08ef5ad28b6d9/gcsfs-0.3.1.tar.gz (43kB)
[K     |███████▋                        | 10kB 19.8MB/s eta 0:00:01[K     |███████████████▏                | 20kB 2.2MB/s eta 0:00:01[K     |██████████████████████▊         | 30kB 3.2MB/s eta 0:00:01[K     |██████████████████████████████▎ | 40kB 2.1MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 2.0MB/s 
Building wheels for collected packages: gcsfs
  Building wheel for gcsfs (setup.py) ... [?25l[?25hdone
  Created wheel for gcsfs: filename=gcsfs-0.3.1-py2.py3-none-any.whl size=17936 sha256=4b577dbe95e8323e390331908a987c3bff220b808c15196dc2c2fd023e528132
  Stored in directory: /root/.cache/pip/wheels/9d/2b/6f/86954f0d8caa1173841e62bb780dc0f8693bd268e04a267682
Successfully built gcsfs
Installing collected packages: gcsfs
Successfully installed gcsfs-0.3.1


In [5]:
# this cell is only necessary if running in colab
project_id = 'w266-251323'
import uuid
bucket_name = 'fb-congressional-data/'
from google.colab import auth
auth.authenticate_user()
!gcloud config set project {project_id}

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Updated property [core/project].


In [6]:
# Load the training split from the bucket; the first CSV column becomes the index.
train_csv_uri = "gs://fb-congressional-data/train.csv"
train_df = pd.read_csv(train_csv_uri, index_col=0)

  mask |= (ar1 == a)


In [7]:
# Load the dev split from the bucket; the first CSV column becomes the index.
dev_csv_uri = "gs://fb-congressional-data/dev.csv"
dev_df = pd.read_csv(dev_csv_uri, index_col=0)

  mask |= (ar1 == a)


In [0]:
# shuffle the data
# be sure to do this before you extract X's and y's!!
# A fixed random_state makes the shuffled order reproducible across kernel
# restarts, so the subsets sliced off later contain the same rows on every
# fresh run.
train_df = train_df.sample(frac=1, random_state=42)
dev_df = dev_df.sample(frac=1, random_state=42)

In [0]:
# Pull the raw op_gender column out of each split as an array.
y_train = train_df["op_gender"].values
y_dev = dev_df["op_gender"].values

In [0]:
def turn_to_ints(li):
    """Encode gender labels as integers.

    Each element equal to 'M' becomes 1; every other value becomes 0.
    Returns a new list with one entry per element of ``li``.
    """
    return [1 if label == 'M' else 0 for label in li]
            
# Encode the labels straight from the DataFrames rather than from y_train /
# y_dev themselves: feeding the already-encoded integers back through
# turn_to_ints (e.g. when this cell is re-run) would turn every label into 0,
# because no integer equals 'M'.
y_train = turn_to_ints(train_df.op_gender.values)
y_dev = turn_to_ints(dev_df.op_gender.values)

In [0]:
# Convert both label lists to NumPy arrays.
y_train, y_dev = np.asarray(y_train), np.asarray(y_dev)

In [0]:
def get_text_list(init_list):
    """Return the items of ``init_list`` with non-string entries blanked out.

    An item whose type is exactly ``str`` is kept unchanged; anything else
    (for example a float NaN from a missing CSV value) is replaced by "".
    """
    return [item if type(item) == str else "" for item in init_list]

# Clean the response_text column of each split into plain lists of strings.
new_sentences_train = get_text_list(train_df["response_text"].values)
new_sentences_test = get_text_list(dev_df["response_text"].values)

In [13]:
time_start = time.time()

# Fit the tokenizer on the training text only.
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(new_sentences_train)

# Encode both splits as sequences of word indexes with the fitted tokenizer.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (new_sentences_train, new_sentences_test)
)

# One extra slot on top of the fitted word_index size.
vocab_size = 1 + len(tokenizer.word_index)

# Report the elapsed wall-clock time as minutes and seconds.
currentTime = time.gmtime(time.time() - time_start)
timeStr = time.strftime("%M minutes, %S seconds", currentTime)
print("Tokenized in {}".format(timeStr))

Tokenized in 09 minutes, 56 seconds


In [0]:
from keras.preprocessing.sequence import pad_sequences

# Take the sequence length from the configuration constant instead of a second
# hard-coded 20, so changing MAX_SEQ_LENGTH actually changes the padding (and
# the Embedding input_length, which reads `maxlen`).
maxlen = MAX_SEQ_LENGTH

# Pad at the end ('post') so every sequence has exactly `maxlen` entries.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [0]:
# File names for the pickled arrays, each suffixed with the data-set name.
x_train_path, x_dev_path, y_train_path, y_dev_path = (
    template.format(DATASET_NAME)
    for template in ('X_train_{}.pkl', 'X_dev_{}.pkl',
                     'y_train_{}.pkl', 'y_dev_{}.pkl')
)

# Write each array to its own file in the working directory.
for out_path, payload in (
    (x_train_path, X_train),
    (x_dev_path, X_test),
    (y_train_path, y_train),
    (y_dev_path, y_dev),
):
    with open(out_path, 'wb') as file:
        pickle.dump(payload, file)

In [33]:
# copy to bucket
# Upload the four pickle files from /content/ to the GCS bucket with gsutil;
# {name} is interpolated from the Python path variables.
# NOTE(review): the first command targets ".../test" while the other three
# target the bucket root — confirm that destination is intentional.
!gsutil cp /content/{x_train_path} gs://fb-congressional-data/test
!gsutil cp /content/{x_dev_path} gs://fb-congressional-data/
!gsutil cp /content/{y_train_path} gs://fb-congressional-data/
!gsutil cp /content/{y_dev_path} gs://fb-congressional-data/

Copying file:///content/X_train_test.pkl...
-
Operation completed over 1 objects/753.7 MiB.                                    
Copying file:///content/X_dev_test.pkl [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/
Operation completed over 1 objects/174.9 MiB.                                    
Copying file:///content/y_trai

In [38]:
# Copy every bucket object whose name starts with "glove" into /tmp/.
# NOTE(review): in the recorded run this fetched four .txt files plus
# glove.6B.zip (2.9 GiB in total); the embedding cell reads one .txt file.
!gsutil cp gs://fb-congressional-data/glove* /tmp/

Copying gs://fb-congressional-data/glove.6B.100d.txt...
Copying gs://fb-congressional-data/glove.6B.200d.txt...
Copying gs://fb-congressional-data/glove.6B.300d.txt...
Copying gs://fb-congressional-data/glove.6B.50d.txt...
\ [4 files][  2.1 GiB/  2.1 GiB]  100.2 MiB/s                                   
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://fb-congressional-data/glove.6B.zip...
/ [5 files][  2.9 GiB/  2.9 GiB]    9.6 MiB/s                                   
Operation completed over 5 objects/2.9 GiB.                                      


In [0]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a whitespace-separated vector file.

    Each non-blank line of the file is read as a word followed by its vector
    components.

    Args:
        filepath: Path to the embeddings text file.
        word_index: Mapping from word to the integer row it occupies.
        embedding_dim: Number of leading vector components kept per word.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` array. Row
        ``word_index[word]`` holds that word's vector when the word appears in
        the file; rows that are never assigned stay all-zero.
    """
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Read as UTF-8 explicitly so non-ASCII tokens decode the same way
    # regardless of the platform's default encoding.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line has no word to unpack; skip it instead of raising.
                continue
            word, *vector = fields
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

# Build the matrix for the tokenizer's vocabulary from the vector file whose
# name carries the chosen dimensionality.
embedding_dim = 100
glove_path = '/tmp/glove.6B.{}d.txt'.format(embedding_dim)
embedding_matrix = create_embedding_matrix(
    glove_path, tokenizer.word_index, embedding_dim)

In [51]:
# trying to figure out which words are empty here
# Collect the indexes of all-zero embedding rows, stopping once more than
# 1000 have been found.
empty_indexes = []
for index, row in enumerate(embedding_matrix):
  # `not row.any()` is true only when every component is zero; sum(row) == 0
  # would also match a non-zero vector whose components cancel out.
  if not row.any():
    empty_indexes.append(index)
  if len(empty_indexes) > 1000:
    break

# Print the word behind each empty row.
for idx in empty_indexes:
  try:
    print(tokenizer.index_word[idx])
  except KeyError:
    # Only a missing index is expected here; any other error should surface.
    print("No entry for {}".format(idx))


No entry for 0
don't
it's
i'm
can't
that's
you're
doesn't
didn't
he's
won't
isn't
let's
what's
obama's
i've
we're
aren't
wouldn't
i'll
they're
she's
i'd
you've
there's
wasn't
haven't
shouldn't
couldn't
you'll
stillsanders
trump's
youtu
fbid
we've
palestinei
here's
hasn't
we'll
people's
america's
gov't
women's
•
who's
you'd
president's
clinton's
god's
weren't
standwithrand
country's
they've
nation's
y'all
hillary's
they'll
children's
🇺🇸
he'll
today's
obummer
ain't
rinos
state's
where's
🇺🇸🇺🇸
doyourjob
huffingtonpost
bush's
american's
we'd
👍
washingtonpost
feelthebern
😍
lmao
woman's
party's
one's
government's
world's
killary
everyone's
❤️
he'd
paul's
they'd
fbcdn
man's
father's
mother's
else's
family's
nobillnobreak
administration's
hphotos
hadn't
bridenstine
it'll
randrally
hahaha
she'll
how's
isil
randpaul
cispa
bengazi
iran's
😊
bernie's
'the
sheeple
israel's
someone's
bernieorbust
😂
would've
veteran's
rino's
neverhillary
reid's
aspx
republican's
😡
ryan's
ya'll
thehill
1073741828
repub


In [0]:
# hmmmm....
# Share of vocabulary rows that have at least one non-zero component.
rows_with_vector = np.any(embedding_matrix != 0, axis=1)
nonzero_elements = np.count_nonzero(rows_with_vector)
nonzero_elements / vocab_size

In [0]:
# Display the (rows, columns) shape of the embedding matrix.
np.shape(embedding_matrix)

In [0]:
# Layer stack: pre-trained embeddings (not trainable) -> 1D convolution ->
# global max pooling -> small dense layer -> single sigmoid output.
model = Sequential([
    layers.Embedding(vocab_size, embedding_dim,
                     weights=[embedding_matrix],
                     input_length=maxlen,
                     trainable=False),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Binary classification set-up: cross-entropy loss, accuracy as the metric.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

In [0]:
# Keep at most TRAINING_SET_SIZE rows of the padded training inputs.
smaller_X_train = X_train[0:TRAINING_SET_SIZE]

In [0]:
# Keep the matching leading TRAINING_SET_SIZE training labels.
smaller_y_train = y_train[0:TRAINING_SET_SIZE]

In [0]:
# Keep at most VAL_SET_SIZE rows of the dev inputs and their labels.
smaller_X_dev, smaller_y_dev = X_test[0:VAL_SET_SIZE], y_dev[0:VAL_SET_SIZE]

In [0]:
# Train the model and always report how long the attempt ran.
time_start = time.time()
try:
  history = model.fit(smaller_X_train, smaller_y_train,
                      epochs=NUM_EPOCHS,
                      verbose=True,
                      validation_data=(smaller_X_dev, smaller_y_dev),
                      batch_size=BATCH_SIZE)
except KeyboardInterrupt:
  # Stopping the cell by hand is the one expected way out; say so explicitly.
  # `history` is not assigned in this case. Any other exception is a real
  # failure and is left to propagate instead of being silently swallowed.
  print("Training interrupted before completion")
finally:
  # Total minutes are counted directly so runs longer than an hour are not
  # wrapped back to 00-59 the way a "%M" time format would.
  minutes, seconds = divmod(int(time.time() - time_start), 60)
  timeStr = "{:02d} minutes, {:02d} seconds".format(minutes, seconds)

  print("Trained in {}".format(timeStr))

In [0]:
# Evaluate on the training subset first, then on the full dev arrays, and
# print the accuracy of each to four decimals.
for template, features, labels in (
    ("Training Accuracy: {:.4f}", smaller_X_train, smaller_y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_dev),
):
    loss, accuracy = model.evaluate(features, labels, verbose=False)
    print(template.format(accuracy))


In [0]:
# this will only work if you finish the training...hmm
# (`history` is only assigned when model.fit returns.)
# keras.utils.plot_model draws a model's layer graph and does not accept the
# History object returned by fit, so chart the recorded per-epoch metrics
# (one line per key in history.history) instead.
history_df = pd.DataFrame(history.history)
history_df.index.name = "epoch"
history_df.plot(title="Training history");