# Trying to implement GloVe embeddings

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import keras
import mysql.connector
import re
from sklearn.preprocessing import QuantileTransformer

%matplotlib inline

# enlarge the default font size used in every matplotlib figure
font = dict(size=15)
plt.rc('font', **font)

Using TensorFlow backend.


In [2]:
# open a connection to the `wait_wait` MySQL database, which holds the
# "Wait Wait... Don't Tell Me!" transcripts
cnx = mysql.connector.connect(user='root', database='wait_wait')

In [3]:
# function to pull some transcripts from the database
def pull_transcript(n=5, connection=None):
    """Fetch up to ``n`` rows of the ``transcripts`` table as a DataFrame.

    Parameters
    ----------
    n : int, default 5
        Maximum number of rows to request (used both as the SQL ``LIMIT``
        and as the ``fetchmany`` size).
    connection : optional
        Open database connection exposing ``cursor()``.  When omitted, the
        module-level ``cnx`` connection is used.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record, indexed by the ``id`` column.  An empty
        result yields an empty frame with the same columns.  Rows are passed
        through ``np.array`` first, so mixed-type rows may be coerced to a
        common dtype (NOTE(review): confirm this is intended).
    """
    n = int(n)
    conn = cnx if connection is None else connection

    # instantiate a cursor to select data from the database
    curs = conn.cursor()
    try:
        # bind the limit as a query parameter rather than formatting it
        # into the SQL text
        curs.execute('select * from transcripts limit %s', (n,))
        rows = curs.fetchmany(n)
        columns = list(curs.column_names)
    finally:
        # release the cursor even when the query or the fetch fails
        curs.close()

    if not rows:
        # np.array([]) is 1-D and cannot be matched to the column names
        return pd.DataFrame(columns=columns).set_index('id')

    # convert the fetched rows to a pandas dataframe keyed by transcript id
    df = pd.DataFrame(data=np.array(rows), columns=columns)
    return df.set_index('id')

In [4]:
# how many transcripts to request from the database
num_transcripts = 4000
transcript_df = pull_transcript(num_transcripts)


In [5]:
# Split the transcripts into training and testing sets so that over-fitting can be detected.
np.random.seed(42)  # fixed seed keeps the split reproducible
# Size the random draw by the rows actually returned: the table may hold fewer
# rows than were requested, and a length mismatch would raise on assignment.
# Roughly 80% of the transcripts land in the training set.
transcript_df['train'] = np.random.rand(len(transcript_df)) > .2

# divide into two data structures
test_transcript_df = transcript_df.loc[~transcript_df.train, :]
train_transcript_df = transcript_df.loc[transcript_df.train, :]

In [6]:
def line_info(transcript_numbers):
    """Split the selected transcripts into one labelled row per spoken line.

    The transcripts are looked up in the module-level ``transcript_df``,
    concatenated, and split wherever a newline is followed by four spaces.

    Parameters
    ----------
    transcript_numbers : index labels
        Labels of the rows of ``transcript_df`` to process.

    Returns
    -------
    pandas.DataFrame
        Rows that have a speaker label, with columns:

        * ``lines`` - the text that follows the speaker label
        * ``funny`` - whether the next raw line contains ``LAUGHTER``
        * ``clapping`` - whether the next raw line contains ``APPLAUSE``,
          ``CLAPPING`` or ``CHEERING``
        * ``speaker`` - upper-case speaker label without the colon
        * ``num_words`` - count of whitespace-separated words in ``lines``
        * ``uniform_words`` - ``num_words`` after a uniform quantile transform
        * ``prev_line_funny`` - ``funny`` of the preceding kept row
    """
    # extract the transcripts, join them, and divide the text into lines
    transcript = transcript_df.loc[transcript_numbers, 'transcript'].str.cat()
    df = pd.DataFrame(transcript.split('\n    '), columns=['lines'])

    # a "LAUGHTER" marker labels the line *before* it as funny
    df['funny'] = df['lines'].str.contains('LAUGHTER', regex=False).shift(-1)

    # any applause-type marker labels the previous line as "clapping"; all three
    # markers are tested in one pattern so no check overwrites another
    applause = df['lines'].str.contains('APPLAUSE|CLAPPING|CHEERING', regex=True)
    df['clapping'] = applause.shift(-1)

    # identify the speaker: the first run of capitals directly followed by a colon
    speaker = df['lines'].str.extract(r'([A-Z]+:)', expand=False)
    speaker = speaker.str.replace(':', '', regex=False)
    speaker = speaker.str.replace('JR', 'BLOUNT', regex=False)  # disambiguating Roy Blount Jr
    speaker = speaker.str.replace('HOST', 'SAGAL', regex=False)  # sometimes refers to Sagal as host
    df['speaker'] = speaker
    # NOTE: There's some weird formatting with Peter Sagal's first line,
    # where it calls him "Host" and puts his statement on the next line. It's not a huge thing,
    # so it is left as-is for now, but might be worth coming back to later.

    # strip only the leading text up to and including that speaker label, so
    # later colons in the spoken text (e.g. "10:30") do not delete content
    df['lines'] = df['lines'].str.replace(r'^[\s\S]*?[A-Z]+:', '', n=1, regex=True)

    # drop rows with any missing value: lines with no speaker (applause markers,
    # empty lines, etc.) and the final row, whose shifted labels are undefined.
    # The copy keeps the column assignments below from targeting a view.
    df = df.dropna().copy()

    # Create a column with the number of separated words
    df['num_words'] = df.lines.str.split().str.len()

    # Normalize the number of words with a quantile transformer
    if len(df):
        # never request more quantiles than there are samples
        QT = QuantileTransformer(n_quantiles=min(1000, len(df)),
                                 output_distribution='uniform')
        word_counts = df.num_words.values.reshape(-1, 1)
        # flatten the (n, 1) result before storing it as a single column
        df['uniform_words'] = QT.fit_transform(word_counts).ravel()
    else:
        # nothing to fit on; keep the column so the schema stays the same
        df['uniform_words'] = np.array([], dtype=float)

    # Create a column with whether the previous line was funny (for predictions)
    df['prev_line_funny'] = df.funny.shift(1)

    return df

In [7]:
# Build the per-line table for the training transcripts and preview its first rows
lines_df = line_info(train_transcript_df.index)
lines_df.head(5)

Unnamed: 0,lines,funny,clapping,speaker,num_words,uniform_words,prev_line_funny
4,"From NPR and WBEZ Chicago, this is WAIT WAIT....",True,False,KURTIS,25,0.8358358,
6,I'm Bill Kurtis. And here's your host at the ...,False,False,KURTIS,17,0.7347347,True
7,,False,False,SAGAL,0,1e-07,False
10,Thank you so much. We have a very interesting...,True,False,SAGAL,56,0.9654655,False
12,"But first, as many of you know, the NPR podca...",True,False,SAGAL,53,0.9604605,True


## Embeddings

In [None]:
from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant


# data locations, resolved relative to BASE_DIR
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')          # pre-trained GloVe vectors
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')  # one sub-directory per label

# preprocessing and training settings
MAX_SEQUENCE_LENGTH = 1_000  # length every token sequence is padded to
MAX_NUM_WORDS = 20_000       # vocabulary cap handed to the tokenizer
EMBEDDING_DIM = 100          # width of each embedding vector
VALIDATION_SPLIT = 0.2       # fraction of the samples held out for validation

# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
# Name the encoding explicitly so decoding does not depend on the platform's
# default locale (the GloVe text files contain non-ASCII tokens).
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        # each record is "<word> <coef_1> ... <coef_n>"
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

texts = []         # raw text of every sample
labels_index = {}  # label (directory) name -> numeric id
labels = []        # numeric label id of every sample, parallel to `texts`

for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue

    # each sub-directory is one class; ids are handed out in sorted order
    label_id = len(labels_index)
    labels_index[name] = label_id

    for fname in sorted(os.listdir(path)):
        if not fname.isdigit():
            continue  # only files with all-digit names are samples

        fpath = os.path.join(path, fname)
        # the encoding keyword is only passed on Python 3
        args = {'encoding': 'latin-1'} if sys.version_info >= (3,) else {}
        with open(fpath, **args) as f:
            t = f.read()

        # skip header: keep everything from the first blank line onwards
        i = t.find('\n\n')
        if i > 0:
            t = t[i:]
        texts.append(t)
        labels.append(label_id)

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor:
# fit the vocabulary, then map every text to a list of word indices
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)
print('Found %s unique tokens.' % len(word_index))

# bring every sequence to MAX_SEQUENCE_LENGTH and convert the integer label
# ids into a categorical matrix
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)  # shuffle samples and labels with the same permutation
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit boundary rather than with a negative offset: with zero
# validation samples, `data[:-0]` would be empty and `data[-0:]` would be the
# whole array.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

print('Preparing embedding matrix.')

# prepare embedding matrix
# A Tokenizer built with num_words=MAX_NUM_WORDS only emits indices below
# MAX_NUM_WORDS (index 0 is reserved), so the matrix needs at most
# MAX_NUM_WORDS rows; a smaller vocabulary needs len(word_index) + 1 rows.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        # beyond the vocabulary cap: never produced by the tokenizer
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Training model.')

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# two convolution + max-pooling stages ...
x = embedded_sequences
for stage in range(2):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)

# ... then a final convolution collapsed by global max pooling
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)

# classifier head: one softmax unit per label
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
)

In [9]:
from keras.datasets import reuters

# Load the Reuters dataset through Keras with every option written out.
# Note: this rebinds x_train / y_train, which the embedding cell earlier in
# the notebook also assigns.
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    path="reuters.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    test_split=0.2,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3,
)

Downloading data from https://s3.amazonaws.com/text-datasets/reuters.npz


In [11]:
# peek at the first ten training labels
y_train[:10]

array([ 3,  4,  3,  4,  4,  4,  4,  3,  3, 16])