In [1]:
import logging

import nbimporter
import spacy
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from tqdm import tqdm, tqdm_notebook

from helpers import get_data, load_model, unicoder

Using TensorFlow backend.


In [2]:
from notebooks.feature_engineering import get_last_char
from notebooks.lstm_train import get_embeddings, make_sequence
from notebooks.entity_extraction import process_entities, get_features

Importing Jupyter notebook from notebooks/feature_engineering.ipynb
Importing Jupyter notebook from notebooks/lstm_train.ipynb
Importing Jupyter notebook from notebooks/entity_extraction.ipynb


In [3]:
# init logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# init tqdm
try:
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        tqdm_notebook().pandas()
    else:
        tqdm.pandas()
except NameError:
    tqdm.pandas()

# init spaCy
nlp = spacy.load('en')  # english corpus
nlp = spacy.load('en_core_web_lg')  # english word embeddings




We want to specify the csv file name and then have functions to:
* import the data
* preprocessing
* pipe into dataframe
* write into temporary csvs
* do the feature engineering
* load the network and run predict
* pick a classifier and generate the output file

Load data from file

In [4]:
df_test = get_data(test_path='./data/test_data.csv', test=True, unicoded=True)
df_test = df_test.head(100)
assert set(['test_id', 'question1', 'question2']).issubset(df_test.columns), "Cant find all necessary columns in provided data"

## LSTM
Get predictions from trained LSTM model, to train the model run the `notebooks/lstm_train.ipynb`

In [18]:
lstm = load_model('./models/lstm-subtract-nodropout', get_embeddings('./models/embedding_matrix.txt'))

INFO:root:loading embeddings from file
INFO:root:Generating weights
INFO:root:Loaded model from models/lstm-subtract-nodropoutmodel
INFO:root:Loaded weights from models/lstm-subtract-nodropoutmodel
INFO:root:Model loading finished with embeddings


add padding to the test data strings

In [23]:
MAX_PAD = 36
q1 = df_test['question1']
q2 = df_test['question2']
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(q1.astype(str).tolist() + q2.astype(str).tolist())
padded_doc1 = pad_sequences(make_sequence(q1, tokenizer), maxlen=MAX_PAD)
padded_doc2 = pad_sequences(make_sequence(q2, tokenizer), maxlen=MAX_PAD)

In [24]:
lstm_preds = lstm.predict([padded_doc1, padded_doc2])
df_lstm = pd.DataFrame({"test_id": df_test['test_id'], "nn_out": lstm_preds.ravel()})

In [25]:
df_lstm.head()

Unnamed: 0,nn_out,test_id
0,0.0003513194,15
1,0.8284402,20
2,0.0417035,21
3,2.494207e-15,23
4,0.9999987,34


# Entities
Extract entities and similarity from the data provided

In [5]:
df_ents = process_entities(df_test)
df_ents = get_features(df_ents)

NameError: global name 'nlp' is not defined

In [None]:
df_ents.head()