In [1]:
import json, gzip
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
import itertools
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/briandorsey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
def read_json1(path):
    data = []
    
    with gzip.open(path) as f:
        for ln in f:
            obj = json.loads(ln)
            data.append(obj)
            
    return data

In [37]:
def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words '''
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

In [38]:
def encode_dataset(dataset, vocab, inv_vocab, build_vocab=False, emb=None):
    cols = ['text', 'summary']
    for index, row in dataset.iterrows():
        # Iterate through the text of both questions of the row
        for col in cols:
            # text to number representation
            t2n = [1]  
            for word in text_to_word_list(row[col]):
                if build_vocab:
                    # Leave out words without embeddings
                    if word not in emb.vocab:
                        continue
                    elif word not in vocab:
                        vocab[word] = len(vocab)
                        t2n.append(len(inv_vocab))
                        inv_vocab[len(inv_vocab)] = word
                    else:
                        t2n.append(vocab[word])
                else:
                    if word not in vocab:
                        continue
                    else:
                        t2n.append(vocab[word])
            t2n.append(2)
            # Replace article or summary text with number representation
            dataset.at[index, col] = t2n

In [39]:
def save_vocabs(vocab, inv_vocab):
    with open("vocab.pkl","wb") as f:
        pickle.dump(vocab, f)

    with open("inv_vocab.pkl", "wb") as f:
        pickle.dump(inv_vocab, f)

### Initialize vocabs and word embeddings

In [None]:
word2vec_path = '~/Downloads/GoogleNews-vectors-negative300.bin'

# used to encode words
vocab = {"PAD": 0, "SOS": 1, "EOS": 2, "UNK": 3} 

# used to decode (encoded) words
inv_vocab = {0: "PAD", 1: "SOS", 2: "EOS", 3: "UNK"} 

# load pre-trained word embeddings
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

### Read and encode train data. Generate vocab

In [3]:
cols = ['text', 'summary']
path = "train.jsonl.gz"

train_data = read_json1(path)

df = pd.DataFrame(train_data)
train_df = pd.DataFrame(train_data, columns=cols)

In [35]:
df.head()

Unnamed: 0,hello
0,2


In [24]:
train_df.head()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [22]:
encode_dataset(train_df, vocab, inv_vocab, build_vocab=True, word2vec)
train_df.head()

In [None]:
len(vocab), len(inv_vocab)

In [14]:
save_vocabs(vocab, inv_vocab)
train_df.to_csv('train_df.csv', index=False)

### Read and encode dev/valid data

In [None]:
cols = ['text', 'summary']
path = "dev.jsonl.gz"

dev_data = read_json1(path)

df = pd.DataFrame(dev_data)
dev_df = pd.DataFrame(dev_data, columns=cols)

In [None]:
df.head()

In [None]:
dev_df.head()

In [None]:
encode_dataset(dev_df, vocab, inv_vocab)
dev_df.head()

In [None]:
dev_df.to_csv('dev_df.csv', index=False)

### Read and encode test data

In [None]:
cols = ['text', 'summary']
path = "test.jsonl.gz"

test_data = read_json1(path)

df = pd.DataFrame(test_data)
test_df = pd.DataFrame(test_data, columns=cols)

In [None]:
df.head()

In [None]:
test_df.head()

In [None]:
encode_dataset(test_df, vocab, inv_vocab)
test_df.head()

In [None]:
test_df.to_csv('test_df.csv', index=False)

### Read in saved vocab, inverse vocab, and train data

In [None]:
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

with open('inv_vocab.pkl', 'rb') as f:
    inv_vocab = pickle.load(f)
    
train_df = pd.read_csv('train_df.csv')

## Appendix

In [None]:
text_lens = train_df.text.map(lambda x: len(x))
summary_lens = train_df.summary.map(lambda x: len(x))

In [None]:
text_lens.describe(), summary_lens.describe()

In [55]:
sub_train_df = train_df[(text_lens < 750) & (summary_lens < 26)]

In [56]:
sub_text_lens = sub_train_df.text.map(lambda x: len(x))
sub_summary_lens = sub_train_df.summary.map(lambda x: len(x))

In [62]:
sub_train_df.loc[:, 'text'] = sub_train_df['text'].apply(lambda x: ((sub_text_lens.max() - len(x)) * [0]) + x)
sub_train_df.loc[:, 'summary'] = sub_train_df['summary'].apply(lambda x: x + ((sub_summary_lens.max() - len(x)) * [0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [58]:
embedding_dim = 300
embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)  # This will be the embedding matrix
embeddings[0] = 0  # So that the padding will be ignored

In [64]:
sub_train_df

Unnamed: 0,text,summary
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 759, 760, 761, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 8, 951, 944, 945, 946, 947, 69..."
12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
14,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
20,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1154, 115, 8, 3301, 3302, 3303, 8, 5..."
23,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 981, 3674, 870,..."
35,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[380, 1589, 3158, 37, 5434, 409, 502, 228, 128..."
40,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
41,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [65]:
sub_train_df.to_csv('sub_train_df.csv', index=False)