In [1]:
import json, gzip
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
import itertools
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /Users/evan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [2]:
def read_json1(path):
    data = []
    
    with gzip.open(path) as f:
        for ln in f:
            obj = json.loads(ln)
            data.append(obj)
            
    return data

In [3]:
def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words '''
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

In [4]:
def encode_dataset(dataset, vocab, inv_vocab, build_vocab=False, emb=None):
    cols = ['text', 'summary']
    for index, row in dataset.iterrows():
        # Iterate through the text of both questions of the row
        for col in cols:
            # text to number representation
            t2n = [1]  
            for word in text_to_word_list(row[col]):
                if build_vocab:
                    # Leave out words without embeddings
                    if word not in emb.vocab:
                        continue
                    elif word not in vocab:
                        vocab[word] = len(vocab)
                        t2n.append(len(inv_vocab))
                        inv_vocab[len(inv_vocab)] = word
                    else:
                        t2n.append(vocab[word])
                else:
                    if word not in vocab:
                        continue
                    else:
                        t2n.append(vocab[word])
            t2n.append(2)
            # Replace article or summary text with number representation
            dataset.at[index, col] = t2n

In [5]:
def save_vocabs(vocab, inv_vocab):
    with open("vocab.pkl","wb") as f:
        pickle.dump(vocab, f)

    with open("inv_vocab.pkl", "wb") as f:
        pickle.dump(inv_vocab, f)

### Initialize vocabs and word embeddings

In [8]:
word2vec_path = '/Users/evan/data/deep_learning/GoogleNews-vectors-negative300.bin'

# used to encode words
vocab = {"PAD": 0, "SOS": 1, "EOS": 2, "UNK": 3} 

# used to decode (encoded) words
inv_vocab = {0: "PAD", 1: "SOS", 2: "EOS", 3: "UNK"} 

# load pre-trained word embeddings
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

### Read and encode train data. Generate vocab

In [10]:
cols = ['text', 'summary']
path_train = "/Users/evan/data/deep_learning/release/train.jsonl.gz"
path_val = "/Users/evan/data/deep_learning/release/dev.jsonl.gz"

train_data = read_json1(path_train)
val_data = read_json1(path_val)

df = pd.DataFrame(train_data)
val = pd.DataFrame(val_data)

In [17]:
df.coverage.describe()

count    995041.000000
mean          0.825464
std           0.179454
min           0.000000
25%           0.733333
50%           0.877551
75%           0.967742
max           1.000000
Name: coverage, dtype: float64

In [25]:
df = df[df.coverage > 0.73]
val = val[val.coverage > 0.73]
df.shape

(423008, 12)

In [26]:
df = df[(df.compression > 10) & (df.compression < 44)]
val = val[(val.compression > 10) & (val.compression < 44)]
train_df = df[cols]
val_df = val[cols]
train_df.head()

Unnamed: 0,text,summary
1,"WASHINGTON, Dec. 23 - The National Security Ag...","The volume of information harvested, without \..."
2,IF outsized executive pay has indeed become a ...,The battle between Pfizer Inc.'s investors and...
3,BY A.J. BENZA & MICHAEL LEWITTES\n\nIf Simon R...,"If Simon Rex looks a little familiar, it may n..."
6,With Police Commissioner Bernard Kerik crackin...,By JOHN MARZULLI DAILY NEWS POLICE BUREAU CHIE...
8,BY GEORGE RUSH AND JOANNA MOLLOY With Kasia An...,Did Tatum O'Neal's latest battle with ex-husba...


In [27]:
train_df.shape

(423008, 2)

In [28]:
val_df.shape

(46289, 2)

In [34]:
train_df['text_length'] = [len(x.split()) for x in train_df.text]
val_df['text_length'] = [len(x.split()) for x in val_df.text]
train_df['summ_length'] = [len(x.split()) for x in train_df.summary]
val_df['summ_length'] = [len(x.split()) for x in val_df.summary]
train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

Unnamed: 0,text,summary,text_length,summ_length,compression
1,"WASHINGTON, Dec. 23 - The National Security Ag...","The volume of information harvested, without \...",643,17,37.823529
2,IF outsized executive pay has indeed become a ...,The battle between Pfizer Inc.'s investors and...,728,23,31.652174
3,BY A.J. BENZA & MICHAEL LEWITTES\n\nIf Simon R...,"If Simon Rex looks a little familiar, it may n...",790,72,10.972222
6,With Police Commissioner Bernard Kerik crackin...,By JOHN MARZULLI DAILY NEWS POLICE BUREAU CHIE...,1299,74,17.554054
8,BY GEORGE RUSH AND JOANNA MOLLOY With Kasia An...,Did Tatum O'Neal's latest battle with ex-husba...,1142,65,17.569231


In [35]:
train_df['compression'] = train_df.text_length/train_df.summ_length
val_df['compression'] = val_df.text_length/val_df.summ_length

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [36]:
train_df.describe()

Unnamed: 0,text_length,summ_length,compression
count,423008.0,423008.0,423008.0
mean,579.201228,26.433807,23.111404
std,290.141769,12.463791,9.346503
min,41.0,1.0,5.722222
25%,362.0,19.0,15.1
50%,537.0,24.0,21.761905
75%,754.0,31.0,30.178571
max,12254.0,615.0,217.0


In [37]:
val_df.describe()

Unnamed: 0,text_length,summ_length,compression
count,46289.0,46289.0,46289.0
mean,578.243665,26.484888,23.041093
std,291.487998,12.770314,9.340513
min,58.0,1.0,7.166667
25%,359.0,19.0,15.0
50%,535.0,24.0,21.663551
75%,752.0,31.0,30.071429
max,8506.0,508.0,119.5


In [43]:
train_df2 = train_df[(train_df.text_length > 300)
                     & (train_df.text_length < 800)
                     & (train_df.summ_length > 15) 
                    & (train_df.summ_length < 40)
                    & (train_df.compression > 12)
                    & (train_df.compression < 35)]
train_df2.shape

(190802, 5)

In [45]:
val_df2 = val_df[(val_df.text_length > 300)
                     & (val_df.text_length < 800)
                     & (val_df.summ_length > 15) 
                    & (val_df.summ_length < 40)
                    & (val_df.compression > 12)
                    & (val_df.compression < 35)]
val_df2.shape

(20892, 5)

In [49]:
train_df2.describe()

Unnamed: 0,text_length,summ_length,compression
count,190802.0,190802.0,190802.0
mean,533.587698,25.240385,21.786813
std,135.65345,5.693368,5.996497
min,301.0,16.0,12.025641
25%,421.0,21.0,16.826087
50%,523.0,24.0,21.1
75%,644.0,29.0,26.344828
max,799.0,39.0,34.954545


In [54]:
combo_df = pd.concat([train_df2, val_df2], ignore_index=True)
combo_df.head()

Unnamed: 0,text,summary,text_length,summ_length,compression
0,IF outsized executive pay has indeed become a ...,The battle between Pfizer Inc.'s investors and...,728,23,31.652174
1,"John Edwards' former aide said he is ""skeptica...",John Edwards' former aide Andrew Young says t...,409,28,14.607143
2,has agreed to pay $110 million to settle consu...,JPMorgan Chase has agreed to pay $110 million...,404,19,21.263158
3,JOSEPH AYERS was crouched over a laptop in a c...,"Researchers developing robotic lobsters, flies...",705,26,27.115385
4,P.J. CLARKE'S ON THE HUDSON | SATISFACTORY\n\n...,P.J. Clarke's on the Hudson is better than sno...,708,21,33.714286


In [56]:
combo_df.shape

(211694, 5)

In [59]:
train_df3 = train_df2
val_df3 = val_df2

In [None]:
encode_dataset(combo_df, vocab, inv_vocab, build_vocab=True, emb=word2vec)

In [58]:
combo_df.head()

Unnamed: 0,text,summary,text_length,summ_length,compression
0,"[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[1, 16, 31, 20, 21, 22, 23, 24, 25, 150, 68, 1...",728,23,31.652174
1,"[1, 164, 338, 339, 340, 163, 61, 90, 341, 97, ...","[1, 164, 338, 339, 340, 354, 355, 262, 16, 52,...",409,28,14.607143
2,"[1, 8, 407, 7, 59, 482, 483, 484, 485, 132, 48...","[1, 528, 630, 8, 407, 7, 59, 482, 483, 484, 48...",404,19,21.263158
3,"[1, 631, 632, 385, 633, 75, 634, 47, 635, 636,...","[1, 752, 760, 711, 761, 762, 763, 767, 768, 97...",705,26,27.115385
4,"[1, 917, 918, 919, 41, 16, 920, 921, 427, 110,...","[1, 917, 918, 919, 41, 16, 920, 90, 853, 127, ...",708,21,33.714286


In [60]:
encode_dataset(train_df2, vocab, inv_vocab, emb=word2vec)

In [61]:
encode_dataset(val_df2, vocab, inv_vocab, emb=word2vec)

In [76]:
train_df2.drop(columns=['level_0', 'index'], inplace=True)
train_df2.head()

Unnamed: 0,text,summary,text_length,summ_length,compression
0,"[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[1, 16, 31, 20, 21, 22, 23, 24, 25, 150, 68, 1...",728,23,31.652174
1,"[1, 164, 338, 339, 340, 163, 61, 90, 341, 97, ...","[1, 164, 338, 339, 340, 354, 355, 262, 16, 52,...",409,28,14.607143
2,"[1, 8, 407, 7, 59, 482, 483, 484, 485, 132, 48...","[1, 528, 630, 8, 407, 7, 59, 482, 483, 484, 48...",404,19,21.263158
3,"[1, 631, 632, 385, 633, 75, 634, 47, 635, 636,...","[1, 752, 760, 711, 761, 762, 763, 767, 768, 97...",705,26,27.115385
4,"[1, 917, 918, 919, 41, 16, 920, 921, 427, 110,...","[1, 917, 918, 919, 41, 16, 920, 90, 853, 127, ...",708,21,33.714286


In [78]:
val_df2.drop(columns=['level_0', 'index'], inplace=True)
val_df2.head()

Unnamed: 0,text,summary,text_length,summ_length,compression
0,"[1, 141, 27985, 1907, 4217, 718, 1538, 1060, 1...","[1, 1060, 1184, 9781, 781, 8153, 12564, 3858, ...",516,32,16.125
1,"[1, 47, 52, 1318, 10385, 6517, 12543, 16, 694,...","[1, 12543, 1186, 712, 10511, 8110, 145, 320, 9...",565,17,33.235294
2,"[1, 70, 16, 23828, 3437, 28360, 14235, 145, 29...","[1, 41, 320, 1956, 9097, 551, 4208, 628, 2325,...",484,28,17.285714
3,"[1, 43, 399, 1594, 6502, 19817, 90, 390, 1841,...","[1, 16, 339, 551, 1367, 12088, 1956, 5600, 487...",367,16,22.9375
4,"[1, 1546, 4998, 78, 16, 13929, 11574, 5272, 91...","[1, 16, 17219, 90, 907, 551, 526, 7297, 1080, ...",443,36,12.305556


In [63]:
len(vocab), len(inv_vocab)

(107447, 107447)

In [64]:
save_vocabs(vocab, inv_vocab)

In [79]:
with open("train_df_new.pkl","wb") as f:
    pickle.dump(train_df2, f)
    
with open("val_df_new.pkl","wb") as f:
    pickle.dump(val_df2, f)

In [14]:
save_vocabs(vocab, inv_vocab)
train_df.to_csv('train_df.csv', index=False)

### Read and encode dev/valid data

In [None]:
cols = ['text', 'summary']
path = "dev.jsonl.gz"

dev_data = read_json1(path)

df = pd.DataFrame(dev_data)
dev_df = pd.DataFrame(dev_data, columns=cols)

In [None]:
df.head()

In [None]:
dev_df.head()

In [None]:
encode_dataset(dev_df, vocab, inv_vocab)
dev_df.head()

In [None]:
dev_df.to_csv('dev_df.csv', index=False)

### Read and encode test data

In [None]:
cols = ['text', 'summary']
path = "test.jsonl.gz"

test_data = read_json1(path)

df = pd.DataFrame(test_data)
test_df = pd.DataFrame(test_data, columns=cols)

In [None]:
df.head()

In [None]:
test_df.head()

In [None]:
encode_dataset(test_df, vocab, inv_vocab)
test_df.head()

In [None]:
test_df.to_csv('test_df.csv', index=False)

### Read in saved vocab, inverse vocab, and train data

In [66]:
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

with open('inv_vocab.pkl', 'rb') as f:
    inv_vocab = pickle.load(f)
    
#train_df = pd.read_csv('train_df.csv')

In [67]:
len(vocab), len(inv_vocab)

(107447, 107447)

In [6]:
with open('train_df_new.pkl', 'rb') as f:
    train_df_new = pickle.load(f)
train_df_new.head()

Unnamed: 0,text,summary,text_length,summ_length,compression
0,"[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[1, 16, 31, 20, 21, 22, 23, 24, 25, 150, 68, 1...",728,23,31.652174
1,"[1, 164, 338, 339, 340, 163, 61, 90, 341, 97, ...","[1, 164, 338, 339, 340, 354, 355, 262, 16, 52,...",409,28,14.607143
2,"[1, 8, 407, 7, 59, 482, 483, 484, 485, 132, 48...","[1, 528, 630, 8, 407, 7, 59, 482, 483, 484, 48...",404,19,21.263158
3,"[1, 631, 632, 385, 633, 75, 634, 47, 635, 636,...","[1, 752, 760, 711, 761, 762, 763, 767, 768, 97...",705,26,27.115385
4,"[1, 917, 918, 919, 41, 16, 920, 921, 427, 110,...","[1, 917, 918, 919, 41, 16, 920, 90, 853, 127, ...",708,21,33.714286


In [11]:
train_df_new['length2'] = train_df_new.text.map(lambda x: len(x))
train_df_new.head()

Unnamed: 0,text,summary,text_length,summ_length,compression,length2
0,"[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[1, 16, 31, 20, 21, 22, 23, 24, 25, 150, 68, 1...",728,23,31.652174,643
1,"[1, 164, 338, 339, 340, 163, 61, 90, 341, 97, ...","[1, 164, 338, 339, 340, 354, 355, 262, 16, 52,...",409,28,14.607143,367
2,"[1, 8, 407, 7, 59, 482, 483, 484, 485, 132, 48...","[1, 528, 630, 8, 407, 7, 59, 482, 483, 484, 48...",404,19,21.263158,355
3,"[1, 631, 632, 385, 633, 75, 634, 47, 635, 636,...","[1, 752, 760, 711, 761, 762, 763, 767, 768, 97...",705,26,27.115385,617
4,"[1, 917, 918, 919, 41, 16, 920, 921, 427, 110,...","[1, 917, 918, 919, 41, 16, 920, 90, 853, 127, ...",708,21,33.714286,620


In [15]:
train_df_new.describe()

Unnamed: 0,text_length,summ_length,compression,length2
count,190802.0,190802.0,190802.0,190802.0
mean,533.587698,25.240385,21.786813,474.481745
std,135.65345,5.693368,5.996497,122.320603
min,301.0,16.0,12.025641,119.0
25%,421.0,21.0,16.826087,374.0
50%,523.0,24.0,21.1,465.0
75%,644.0,29.0,26.344828,573.0
max,799.0,39.0,34.954545,1289.0


In [16]:
train_df_new.length2.quantile(.999)

753.0

In [24]:
train_df_new[train_df_new.length2 == 1289.0].text[-10:]

165255    [1, 41, 3423, 12798, 1639, 39, 5435, 182, 1802...
Name: text, dtype: object

## Appendix

In [None]:
text_lens = train_df.text.map(lambda x: len(x))
summary_lens = train_df.summary.map(lambda x: len(x))

In [None]:
text_lens.describe(), summary_lens.describe()

In [55]:
sub_train_df = train_df[(text_lens < 750) & (summary_lens < 26)]

In [56]:
sub_text_lens = sub_train_df.text.map(lambda x: len(x))
sub_summary_lens = sub_train_df.summary.map(lambda x: len(x))

In [62]:
sub_train_df.loc[:, 'text'] = sub_train_df['text'].apply(lambda x: ((sub_text_lens.max() - len(x)) * [0]) + x)
sub_train_df.loc[:, 'summary'] = sub_train_df['summary'].apply(lambda x: x + ((sub_summary_lens.max() - len(x)) * [0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [58]:
embedding_dim = 300
embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)  # This will be the embedding matrix
embeddings[0] = 0  # So that the padding will be ignored

In [64]:
sub_train_df

Unnamed: 0,text,summary
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 759, 760, 761, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 8, 951, 944, 945, 946, 947, 69..."
12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
14,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
20,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1154, 115, 8, 3301, 3302, 3303, 8, 5..."
23,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 981, 3674, 870,..."
35,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[380, 1589, 3158, 37, 5434, 409, 502, 228, 128..."
40,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
41,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [65]:
sub_train_df.to_csv('sub_train_df.csv', index=False)