In [5]:
import sys
sys.path.insert(0, '..')

In [10]:
import json
import logging
import os

import spacy
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook

from helpers import get_data

In [11]:
# init logging
# Lower the root logger threshold to DEBUG so the logging.info(...) progress
# messages emitted by later cells are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# init tqdm
# Register `progress_apply` on pandas objects. Inside a Jupyter kernel
# (ZMQInteractiveShell) the notebook progress bar is used; in any other IPython
# shell, or when `get_ipython` does not exist at all (plain Python raises
# NameError), fall back to the text progress bar.
try:
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        tqdm_notebook().pandas()
    else:
        tqdm.pandas()
except NameError:
    tqdm.pandas()

# init spaCy
# Load the model once. `nlp` is used for both entity extraction and similarity
# in the cells below, so a single pipeline (the large English model with word
# embeddings) is all that is kept.
nlp = spacy.load('en_core_web_lg')




# Entity Extraction

In [12]:
# example
# nlp = spacy.load('en')
# doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

# for ent in doc.ents:
#     print(ent.text, ent.start_char, ent.end_char, ent.label_)

(u'Apple', 0, 5, u'ORG')
(u'U.K.', 27, 31, u'GPE')
(u'$1 billion', 44, 54, u'MONEY')


Add entity counts as a feature.

In [14]:
def find_entities(str):
    """Count the named entities spaCy finds in a text, grouped by entity label.

    Parameters
    ----------
    str : object
        The text to analyse. Only unicode text is processed (``unicode`` on
        Python 2, ``str`` on Python 3); anything else is treated as missing.
        NOTE: the parameter name shadows the builtin ``str``; it is kept
        unchanged for caller compatibility.

    Returns
    -------
    dict or float
        ``{label: count}`` for every entity label found, or ``np.nan`` when
        the input is not unicode text or contains no entities.
    """
    # `type(u'')` is `unicode` on Python 2 and `str` on Python 3, so the check
    # works on both without referencing the Python-2-only `unicode` name.
    text_type = type(u'')
    if not isinstance(str, text_type):
        return np.nan

    entities = {}
    for ent in nlp(str).ents:
        entities[ent.label_] = entities.get(ent.label_, 0) + 1

    # Keep this a plain dict: the batch cell writes it to CSV as text and
    # string_to_dict parses that text back.
    return entities if entities else np.nan

# Similarity

In [13]:
# example
# nlp = spacy.load('en_core_web_lg')
# tokens = nlp(u'dog cat banana sasquatch')

# for token in tokens:
#     print(token.text, token.has_vector, token.vector_norm, token.is_oov)

(u'dog', True, 7.0336733, False)
(u'cat', True, 6.6808186, False)
(u'banana', True, 6.7000141, False)
(u'sasquatch', True, 6.9789977, False)


Add the similarity score as a feature.

In [15]:
def apply_similarity(str1, str2):
    """Return the spaCy similarity between two texts.

    Parameters
    ----------
    str1, str2 : object
        The texts to compare. Both must be unicode text (``unicode`` on
        Python 2, ``str`` on Python 3).

    Returns
    -------
    float
        ``nlp(str1).similarity(nlp(str2))`` when both inputs are unicode text,
        otherwise ``np.nan``.
    """
    # `type(u'')` is `unicode` on Python 2 and `str` on Python 3.
    text_type = type(u'')
    if isinstance(str1, text_type) and isinstance(str2, text_type):
        return nlp(str1).similarity(nlp(str2))
    # Explicit return: without it the function fell through and returned None
    # instead of NaN for missing / non-text inputs.
    return np.nan

# Apply entity extraction and similarity

Expand the stored entity dicts into one count column per entity label (dummy-style encoding).

In [18]:
def rename_entitity_column(df, counter):
    """Suffix every entity-label column of ``df`` with ``_<counter>``, in place.

    A column counts as an entity-label column when its name is a member of the
    module-level ``ENTS`` set. The current columns and each individual rename
    are logged at INFO level.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed in place.
    counter : object
        Value appended after an underscore (e.g. ``ORG`` -> ``ORG_1``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    logging.info(df.columns)

    renames = {}
    for label in df.columns:
        if label not in ENTS:
            continue
        suffixed = '{}_{}'.format(label, counter)
        logging.info('renaming {} to {}'.format(label, suffixed))
        renames['{}'.format(label)] = suffixed

    # Apply all collected renames at once; nothing to do when no column matched.
    if renames:
        df.rename(columns=renames, inplace=True)
    return df


def string_to_dict(dict_string):
    """Parse the text form of a dict (as written to CSV) back into a dict.

    The entity dicts are stored as their Python repr, e.g.
    ``"{u'ORG': 1, u'GPE': 2}"``. That is a Python literal, not JSON, so it is
    parsed with ``ast.literal_eval``. The previous quote-replacement approach
    corrupted any key or value ending in ``u`` (``u'Peru'`` became ``"Per"``)
    and broke on text containing apostrophes.

    Parameters
    ----------
    dict_string : str
        Python-literal (or JSON) text to parse.

    Returns
    -------
    object
        The parsed value (a dict for the entity columns).

    Raises
    ------
    ValueError
        If the text is neither a valid Python literal nor parseable by the
        legacy JSON conversion.
    """
    # Local import keeps this function self-contained regardless of which
    # notebook cells have been executed.
    import ast

    try:
        return ast.literal_eval(dict_string)
    except (ValueError, SyntaxError, TypeError):
        # Not a Python literal (e.g. genuine JSON using true/false/null):
        # fall back to the original quote-normalising JSON path.
        normalised = dict_string.replace("'", '"').replace('u"', '"')
        return json.loads(normalised)

def get_features(dataframe):
    """Expand the ``entities1`` / ``entities2`` dict columns into count columns.

    For each of the two columns, every value is turned into a dict (text is
    parsed with ``string_to_dict``; values that are already dicts are kept;
    anything else becomes NaN), the dicts are expanded into one column per key
    with missing counts filled with 0, and the entity-label columns are
    suffixed ``_1`` / ``_2`` via ``rename_entitity_column``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing ``entities1`` and ``entities2`` columns. It is not
        modified; the work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the parsed entity columns and the expanded count
        columns appended.
    """
    def to_entity_dict(value):
        # Already-parsed dicts (frame built in this session) are kept rather
        # than being discarded as "not a string".
        if isinstance(value, dict):
            return value
        # `type(u'')` also covers Python 2 unicode text.
        if isinstance(value, (str, type(u''))):
            return string_to_dict(value)
        return np.nan

    # Work on a copy so the caller's frame (often a filtered slice) is not
    # mutated and no chained-assignment warning is triggered.
    dataframe = dataframe.copy()

    for suffix, column in enumerate(('entities1', 'entities2'), start=1):
        dataframe[column] = dataframe[column].progress_apply(to_entity_dict)
        # NOTE(review): rows holding NaN presumably contribute a stray column
        # named 0 when expanded through pd.Series -- confirm and drop if unwanted.
        expanded = dataframe[column].progress_apply(pd.Series).fillna(0)
        dataframe = pd.concat([dataframe, expanded], axis=1)
        # Rename right away so the second pass does not collide with the
        # label columns produced by the first.
        rename_entitity_column(dataframe, suffix)
    return dataframe

Apply entity extraction and similarity scoring to the data in batches.

In [None]:
# Compute entity counts and the similarity score batch by batch, writing each
# batch to its own CSV so partial progress survives an interruption.
BATCHSIZE = 1000
df = get_data(unicoded=True)

# NOTE(review): `unicoder` is not defined or imported in the visible cells --
# confirm it is in scope before running on a fresh kernel.
# NOTE(review): batches are written to 'batches/' here but the merge cell reads
# them from '../batches' -- confirm which path is intended.
if not os.path.isdir('batches'):
    os.makedirs('batches')

# `counter` stays 0 when the frame is empty and otherwise ends as the number of
# batches actually processed (the old post-loop value was one too high).
counter = 0
for counter, batch in enumerate(range(0, len(df), BATCHSIZE), start=1):
    end = min(batch + BATCHSIZE, len(df))
    logging.info('Starting batch {} from {} to {}'.format(counter, batch, end))
    # Copy the slice so the column assignments below act on an independent
    # frame instead of a view of `df`.
    tdf = df.iloc[batch:end].copy()
    tdf['question1'] = tdf['question1'].apply(lambda x: unicoder(x))
    tdf['question2'] = tdf['question2'].apply(lambda x: unicoder(x))
    tdf['entities1'] = tdf['question1'].progress_apply(lambda x: find_entities(x))
    tdf['entities2'] = tdf['question2'].progress_apply(lambda x: find_entities(x))
    tdf['similarity_score'] = tdf.progress_apply(
        lambda row: apply_similarity(row['question1'], row['question2']), axis=1)
    tdf.to_csv('batches/batch_{}.csv'.format(counter))
logging.info('Finished {} batches'.format(counter))

Looking up all viable entities

In [None]:
# Entity labels that rename_entitity_column recognises as entity-count columns.
ENTS = {
    u'CARDINAL',
    u'DATE',
    u'EVENT',
    u'FAC',
    u'GPE',
    u'LANGUAGE',
    u'LAW',
    u'LOC',
    u'MONEY',
    u'NORP',
    u'ORDINAL',
    u'ORG',
    u'PERCENT',
    u'PERSON',
    u'PRODUCT',
    u'QUANTITY',
    u'TIME',
    u'WORK_OF_ART',
}

# for row in train_df.loc[:, 'entities1']:
#     if isinstance(row, dict):
#         for k in row.keys():
#             ENTS.add(k)

In [None]:
# Merge the per-batch feature columns back onto the full data set, then expand
# the entity dicts into count columns and store the result.
feature_cols = ['entities1', 'entities2', 'similarity_score']

df = get_data(unicoded=True)
# Sorted so the merge order (and the log output) is deterministic;
# os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir('../batches')):
    if not filename.endswith('.csv'):
        continue
    batch_df = pd.read_csv('../batches/{}'.format(filename))
    # Merge into the accumulating `df` (the previous code referenced an
    # undefined `train_df`, which also discarded earlier batches).
    df = df.merge(batch_df.loc[:, ['id'] + feature_cols], on='id', how='left')
    for col in feature_cols:
        col_x, col_y = col + '_x', col + '_y'
        # Suffixed columns only appear once `df` already carries the feature
        # columns, i.e. from the second merged batch on -- whichever file that
        # happens to be. Prefer the new batch's value, keep the old otherwise.
        if col_y in df.columns:
            df[col] = df[col_y].fillna(df[col_x])
            df.drop([col_x, col_y], axis=1, inplace=True)
    logging.info('Merged in {}, new length {}'.format(filename, int(df.similarity_score.notna().sum())))

# Only rows that received a similarity score are kept for the final features.
final = get_features(df.loc[df.similarity_score.notna()])
final.to_csv('data/train_with_sim_and_ents.csv')

Apply the same steps to the test data.

In [None]:
# Compute entity counts and the similarity score for the test set in one pass
# and store the result as CSV.
test = pd.read_csv('data/test_data.csv')

# Normalise both question columns through `unicoder` before handing them to spaCy.
test['question1'] = test['question1'].apply(unicoder)
test['question2'] = test['question2'].apply(unicoder)

# Per-question entity counts (dict or NaN per row).
test['entities1'] = test['question1'].progress_apply(find_entities)
test['entities2'] = test['question2'].progress_apply(find_entities)

# Row-wise similarity between the two questions.
test['similarity_score'] = test.progress_apply(
    lambda row: apply_similarity(row['question1'], row['question2']),
    axis=1,
)

test.to_csv('data/test_with_sim_and_ents.csv')

In [None]:
# Expand the entity dicts of the test set into one count column per label,
# suffixed _1 / _2 for question 1 / question 2.
def _parse_entities(value):
    """Return ``value`` as an entity dict, or NaN when it is missing."""
    # Text form (frame reloaded from CSV) is parsed back into a dict.
    if isinstance(value, str):
        return string_to_dict(value)
    # Dicts produced by find_entities in this session are kept as they are;
    # previously they failed the string check and were all replaced by NaN.
    if isinstance(value, dict):
        return value
    return np.nan


for suffix, column in enumerate(('entities1', 'entities2'), start=1):
    test[column] = test[column].progress_apply(_parse_entities)
    test = pd.concat([test, test[column].apply(pd.Series).fillna(0)], axis=1)
    # Rename immediately so the second pass does not collide with the label
    # columns created by the first.
    rename_entitity_column(test, suffix)