In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from embeddings import get_embedding_matrix, sentence_to_indexes
from emb_path import glove_6B_300d_path, lexvec_7B_300d_path, glove_6B_50d_path
from sklearn.metrics.pairwise import cosine_similarity
from gem import SentenceEmbedder
from scipy.stats import pearsonr
from utils import read_sts
import os
from tqdm import tqdm_notebook as tqdm
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Prepare datasets

In [2]:
# Root folder of the IMDB review dump; every other location is derived from it.
IMDB_PATH = './data/imdb'

# Training split and its two per-sentiment sub-folders.
IMDB_TRAIN = os.path.join(IMDB_PATH, 'train')
IMDB_TRAIN_POS, IMDB_TRAIN_NEG = (
    os.path.join(IMDB_TRAIN, sentiment) for sentiment in ('pos', 'neg')
)

# Test split.
IMDB_TEST = os.path.join(IMDB_PATH, 'test')

In [3]:
def _paths_in(directory):
    """Return each entry name of `directory` joined onto the directory path."""
    return [os.path.join(directory, entry) for entry in os.listdir(directory)]


# os.listdir returns entries in arbitrary order, so these lists are unsorted.
train_pos_samples = _paths_in(IMDB_TRAIN_POS)
train_neg_samples = _paths_in(IMDB_TRAIN_NEG)

test_samples = _paths_in(IMDB_TEST)

In [4]:
# Sanity check: number of entries found in each of the three listings.
print(*map(len, (train_pos_samples, train_neg_samples, test_samples)))

12500 12501 11001


# Process every review

In [5]:
import nltk.data

In [6]:
# Uncomment on first use to download the Punkt resource loaded in the next cell:
# nltk.download('punkt')

In [7]:
# Load the 'tokenizers/punkt/english.pickle' resource through nltk.data.load.
# NOTE(review): no active code in the visible notebook uses `tokenizer` — confirm it is still needed.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
def _read_reviews(paths):
    """Read review files and return their cleaned text.

    Each file is read as UTF-8; newlines are dropped, '<br />' tags become a
    single space and every '.' is removed. A file that cannot be decoded as
    UTF-8 is skipped individually, so one bad file no longer discards all the
    files that come after it in the listing.

    Parameters
    ----------
    paths : list of str
        Paths of the review files to read.

    Returns
    -------
    list of str
        One cleaned string per successfully decoded file, in input order.
    """
    cleaned = []
    skipped = []
    for path in tqdm(paths):
        try:
            with open(path, 'r', encoding="utf-8") as file:
                text = file.read()
        except UnicodeDecodeError:
            # Best-effort loading: remember the offender and carry on with the rest.
            skipped.append(path)
            continue
        text = text.replace('\n', '').replace('<br />', ' ').replace('.', '')
        cleaned.append(text)
    if skipped:
        # Make the data loss visible instead of swallowing it silently.
        print(f'Skipped {len(skipped)} undecodable file(s): {skipped}')
    return cleaned


pos_reviews = _read_reviews(train_pos_samples)
neg_reviews = _read_reviews(train_neg_samples)

HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12501), HTML(value='')))

In [10]:
# Positive reviews come first, so the 1-labels precede the 0-labels.
reviews = pos_reviews + neg_reviews
labels = [1 for _ in pos_reviews] + [0 for _ in neg_reviews]
print(len(reviews))

25000


In [11]:
# Display the first entry of `reviews` to eyeball the result of the text clean-up.
reviews[0]

'Bromwell High is a cartoon comedy It ran at the same time as some other programs about school life, such as "Teachers" My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers" The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled  at  High A classic line: INSPECTOR: I\'m here to sack one of your teachers STUDENT: Welcome to Bromwell High I expect that many adults of my age think that Bromwell High is far fetched What a pity that it isn\'t!'

# Compute embeddings

In [12]:
# Maps a display name to whatever get_embedding_matrix returns for that path;
# the embedding loop unpacks each value as a pair.
embds = {"Glove": get_embedding_matrix(glove_6B_300d_path)}

In [13]:
%%time

print('Unigrams:\n\n')

# One gem() result per configured embedding source, in dict order.
embeddings = []
for emb_e, emb_v in embds.values():
    # emb_e / emb_v are the two items of the pair returned by get_embedding_matrix.
    model = SentenceEmbedder(reviews, emb_e, emb_v, False)
    sentence_vectors, _unused = model.gem(sigma_power=3)
    embeddings.append(sentence_vectors)

Unigrams:




25000it [22:18, 18.67it/s]
4767it [24:30,  3.24it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

25000it [2:09:01,  3.23it/s]

CPU times: user 13h 48min 37s, sys: 1d 18h 1min 28s, total: 2d 7h 50min 5s
Wall time: 2h 31min 26s





In [16]:
# Shape of the first entry of `embeddings` (only one source is configured in `embds`).
embeddings[0].shape

(25000, 300)

In [17]:
# Uncomment to (re)write the cache file that the LogisticRegression section loads:
# np.save('data/imdb_embeddings.npy', embeddings[0])

# LogisticRegression

In [18]:
from sklearn.linear_model import LogisticRegression

In [48]:
imdb_embeddings = np.load('data/imdb_embeddings.npy')

# Rows 0-12499 are labelled positive and rows from 12500 on negative; the first
# 10000 rows of each class go to training, the remainder to testing.
pos_embs, neg_embs = imdb_embeddings[:12500], imdb_embeddings[12500:]

x_train = np.concatenate((pos_embs[:10000], neg_embs[:10000]), axis=0)
y_train = np.repeat([1, 0], 10000)

x_test = np.concatenate((pos_embs[10000:], neg_embs[10000:]), axis=0)
y_test = np.repeat([1, 0], 2500)

In [49]:
# Fit a logistic-regression classifier on the training embeddings.
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=0)
lr.fit(x_train, y_train)

# Column 1 of predict_proba is thresholded at 0.5 and compared to the labels.
y_preds = lr.predict_proba(x_test)[:, 1]
accuracy = np.mean((y_preds > 0.5).astype(int) == y_test)

print(f'Accuracy = {accuracy}')

Accuracy = 0.7328


# Quora Question Pairs

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import numpy as np
from embeddings import get_embedding_matrix, sentence_to_indexes
from emb_path import glove_6B_300d_path, lexvec_7B_300d_path, glove_6B_50d_path
from sklearn.metrics.pairwise import cosine_similarity
from gem import SentenceEmbedder

In [3]:
# Load the Quora question pairs and drop every row with a missing value.
data = pd.read_csv('data/quora/train.csv').dropna()

# Positional row indices for a 90/10 split, stratified on the duplicate label.
ind_tr, ind_te = train_test_split(
    np.arange(len(data)),
    test_size=0.1,
    stratify=data['is_duplicate'],
    random_state=24,
)

In [4]:
# All first questions followed by all second questions, in row order.
q1, q2 = data['question1'], data['question2']

sentences = [*q1.tolist(), *q2.tolist()]

In [5]:
# Unpack the pair returned by get_embedding_matrix for `glove_6B_300d_path`.
e, v = get_embedding_matrix(glove_6B_300d_path)

In [6]:
# NOTE(review): the 4th positional argument is `3` here but `False` in the IMDB
# section's call; SentenceEmbedder's signature is not visible — confirm the intended value.
model = SentenceEmbedder(sentences, e, v, 3)

In [7]:
# Run gem with sigma_power=3; keep the first returned value and discard the second.
embedded_sentences, _ = model.gem(sigma_power=3)

100%|██████████| 808574/808574 [01:36<00:00, 8412.72it/s]
100%|██████████| 808574/808574 [13:32<00:00, 995.02it/s] 


In [8]:
# `sentences` is question1 followed by question2 (equal lengths, so the total
# is even): the first half belongs to question1 and the last half to question2.
half = len(sentences) // 2
question1_embs = embedded_sentences[:half]
question2_embs = embedded_sentences[-half:]

In [9]:
# Select the train/test rows of each question's embeddings by position.
q1_train, q1_test = question1_embs[ind_tr], question1_embs[ind_te]
q2_train, q2_test = question2_embs[ind_tr], question2_embs[ind_te]

# Labels, indexed with the same positional indices.
is_duplicate = data['is_duplicate'].values
y_train, y_test = is_duplicate[ind_tr], is_duplicate[ind_te]

In [10]:
# Persist both embedding halves as .npy files.
for out_path, embs in (
    ('data/q1_embs.npy', question1_embs),
    ('data/q2_embs.npy', question2_embs),
):
    np.save(out_path, embs)

## Unsupervised

In [None]:
%%time

# Cosine similarity of each (question1, question2) test pair, computed row by
# row. Building the full all-pairs matrix just to read its diagonal needs
# memory and time quadratic in the number of test pairs.
q1_norms = np.linalg.norm(q1_test, axis=1)
q2_norms = np.linalg.norm(q2_test, axis=1)

# A zero vector has no direction; dividing by 1 instead of 0 gives it a
# similarity of 0, which is how sklearn's cosine_similarity treats such rows.
q1_norms[q1_norms == 0.0] = 1.0
q2_norms[q2_norms == 0.0] = 1.0

# einsum('ij,ij->i') is the per-row dot product of the two matrices.
cosine_sims = np.einsum('ij,ij->i', q1_test, q2_test) / (q1_norms * q2_norms)

In [45]:
# Predict "duplicate" when a pair's cosine similarity exceeds 0.8 and score the
# predictions against `y_test`, the `is_duplicate` labels at the test indices.
accuracy = ((cosine_sims > 0.8).astype(int) == y_test).mean()

print(f'Accuracy = {accuracy}')

Accuracy = 0.6610106606643745


## Supervised

In [23]:
# Each sample is the question1 embedding followed by the question2 embedding
# (column-wise concatenation of the two 2-D matrices).
x_train = np.concatenate([q1_train, q2_train], axis=1)

x_test = np.concatenate([q1_test, q2_test], axis=1)

In [24]:
# The import cell of this notebook section does not bring in LogisticRegression,
# so import it here to keep the cell runnable on a fresh kernel.
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the concatenated pair embeddings.
lr = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')

lr.fit(x_train, y_train)

# Column 1 of predict_proba is thresholded at 0.5 and compared to the labels.
y_preds = lr.predict_proba(x_test)[:,1]

accuracy = ((y_preds > 0.5).astype(int) == y_test).mean()

print(f'Accuracy = {accuracy}')

Accuracy = 0.6612085384253877
