In [22]:
import ast
import difflib
import json
import logging

import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook

In [23]:
# init logging: configure the root logger to let DEBUG and above through
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# init tqdm: register the pandas integration, picking the notebook flavour
# only when the running shell reports itself as 'ZMQInteractiveShell'
try:
    shell_name = get_ipython().__class__.__name__
except NameError:
    # get_ipython is not defined here, so there is no shell to inspect
    shell_name = None

if shell_name == 'ZMQInteractiveShell':
    tqdm_notebook().pandas()
else:
    tqdm.pandas()




#### Load raw data

In [79]:
# Raw inputs; the paths are relative to the notebook's working directory.
df_train = pd.read_csv('data/train_data.csv')
df_test = pd.read_csv('data/test_data.csv')
# Training labels live in their own file and are merged into df_train in the next cell.
df_train_labels = pd.read_csv('data/train_labels.csv')

In [80]:
# Replace the 'is_duplicate' column shipped with the training data by the one
# from df_train_labels (merge uses the columns both frames have in common),
# then drop every row that still has a missing value, e.g. a missing question.
df_train = (
    df_train
    .drop('is_duplicate', axis=1)
    .merge(df_train_labels)
    .dropna(axis=0, how='any')
)


#### Checklist:
- Last Character (questions are more likely to be duplicates if they end with the same punctuation mark)
- Average Shared Words - Words shared by question pairs (more words shared = higher chance of duplicate)
- Shared Entities (Enzo-engineered entities using spaCy)
- Length (word count) - Words per sentence (questions with different lengths are unlikely to be duplicates)
- Levenshtein Features - How many character edits separate the two questions (more edit operations required = more dissimilar)
- Tf-Idf 
- LDA (topic modelling)


https://www.linkedin.com/pulse/kaggle-quora-question-pairs-mar-2017-may-priscilla-li/

## Convenience Method
Rerun the cell below if you change any of the feature functions. See the examples below for usage.

##### TODO:
- parameter for dataframe or numpy representation
- add padding for word2vec vectors

In [17]:
# Dispatch table: feature name -> callable taking the dataframe.
# Each entry is a thin lambda so the extractor is looked up by name at call
# time. This cell therefore does not fail when it is run before the extractor
# cells, and it always uses the most recent definition of every extractor.
FEATURE_FUNCTIONS = {
    'last_char': lambda df: get_last_char(df),
    'avg_shared_words': lambda df: get_shared_words(df),
    'shared_entities': lambda df: get_shared_entities(df),
    'word_count_diff': lambda df: get_word_count_diff(df),
    'levenshtein': lambda df: get_levenshtein_distance(df),
}


def get_features(df, feature_list):
    """
    Convenience method to extract text features in the same way independent of the
    dataframe. The feature names double as the names of the columns that are added,
    so the same list can be used to limit a dataframe to the relevant features.

    Hint: The dataframe is modified in place (every extractor assigns its result as
          a column of ``df``), so nothing has to be returned.

    Parameters
        df: dataframe the features will be extracted from
        feature_list: iterable of feature names (keys of FEATURE_FUNCTIONS)

    Returns
        none: dataframe object is already manipulated and is not needed to be passed back

    Raises
        KeyError: if a requested feature is not a key of FEATURE_FUNCTIONS. The check
                  runs before any extractor, so ``df`` is left untouched in that case.
    """
    # Materialise once so generators survive the validation pass below.
    requested = list(feature_list)

    unknown = [feature for feature in requested if feature not in FEATURE_FUNCTIONS]
    if unknown:
        raise KeyError('unknown feature(s) {}; available features: {}'.format(
            unknown, sorted(FEATURE_FUNCTIONS)))

    for feature in requested:
        logging.info('getting {}'.format(feature))
        FEATURE_FUNCTIONS[feature](df)
    logging.info('feature extraction done')

In [26]:
# === Example 1 ===
# Adds one column per requested feature to df_test in place.
# 'shared_entities' only logs a warning and adds no column when the frame has
# no entities1/entities2 columns.
get_features(df_test, ['last_char', 'avg_shared_words', 'word_count_diff', 'shared_entities', 'levenshtein'])
df_test.head()

INFO:root:getting last_char
INFO:root:getting avg_shared_words


INFO:root:getting word_count_diff





INFO:root:getting shared_entities
INFO:root:getting levenshtein





INFO:root:feature extraction done





Unnamed: 0,test_id,question1,question2,last_char,avg_shared_words,word_count_diff,levenshtein
0,15,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,1,4,2,0.505376
1,20,Why do rockets look white?,Why are rockets and boosters painted white?,1,3,2,0.637681
2,21,What's causing someone to be jealous?,What can I do to avoid being jealous of someone?,1,1,4,0.447059
3,23,How much is 30 kV in HP?,Where can I find a conversion chart for CC to ...,1,0,4,0.074074
4,34,What is the best travel website in spain?,What is the best travel website?,1,5,2,0.876712


In [142]:
# === Example 2 ===
# This file already carries entities1/entities2 columns, so the shared-entity
# feature can be computed on it; string-encoded entity dicts are parsed first.
df_ents = pd.read_csv('data/test_with_sim_and_ents.csv')
get_features(df_ents, ['shared_entities'])
df_ents.head()

INFO:root:getting shared_entities
INFO:root:converting entity 1 string representation to dictionaries


INFO:root:converting entity 2 string representation to dictionaries


INFO:root:looking up shared entities


INFO:root:feature extraction done


Unnamed: 0.1,Unnamed: 0,test_id,question1,question2,entities1,entities2,similarity_score,shared_entities
0,0,15,invalid,How will a Trump presidency affect the student...,,"{u'ORG': 1, u'GPE': 2}",0.235347,0
1,1,20,Why do rockets look white?,Why are rockets and boosters painted white?,,,0.89227,0
2,2,21,What's causing someone to be jealous?,What can I do to avoid being jealous of someone?,,,0.956441,0
3,3,23,How much is 30 kV in HP?,Where can I find a conversion chart for CC to ...,"{u'PRODUCT': 1, u'QUANTITY': 1}",{u'ORG': 1},0.835993,0
4,4,34,What is the best travel website in spain?,What is the best travel website?,{u'GPE': 1},,0.968766,0


## Features

### Last Character

In [9]:
def get_last_char(df):
    """Add a 0/1 column ``last_char`` to ``df`` in place.

    The value is 1 where ``question1`` and ``question2`` end with the same
    character and 0 otherwise. Nothing is returned.
    """
    # Slicing with [-1:] (instead of indexing [-1]) keeps empty strings as ''.
    ending_q1 = df.question1.str[-1:]
    ending_q2 = df.question2.str[-1:]
    same_ending = ending_q1 == ending_q2
    df['last_char'] = np.where(same_ending, 1, 0)

### Avg. Shared Words

In [10]:
def shared_word_count(q1, q2):
    """Count the words of ``q1`` that also occur in ``q2``, ignoring case.

    Both questions are split on single spaces. Every occurrence of a word in
    ``q1`` is counted, so a word repeated in ``q1`` counts once per repetition.

    Parameters
        q1: first question (str)
        q2: second question (str)

    Returns
        int: number of space-separated tokens of q1 found among the tokens of q2
    """
    # Lower-case q2 a single time and keep it as a set: membership tests are
    # O(1) and q2 is not re-split / re-lowered for every word of q1.
    q2_words = {word.lower() for word in q2.split(' ')}
    return sum(1 for word in q1.split(' ') if word.lower() in q2_words)


def get_shared_words(df):
    """Add the column ``avg_shared_words`` to ``df`` in place.

    Despite the column name, the value is the per-row count returned by
    ``shared_word_count(question1, question2)``, not an average.
    """
    df['avg_shared_words'] = df.progress_apply(
        lambda row: shared_word_count(row['question1'], row['question2']),
        axis=1,
    )

### Shared Entities

In [11]:
# Set of named-entity label names (presumably the spaCy labels mentioned in the
# checklist above - TODO confirm). Not referenced by the other visible cells.
ENTITIES = {
    u'CARDINAL',
    u'DATE',
    u'EVENT',
    u'FAC',
    u'GPE',
    u'LANGUAGE',
    u'LAW',
    u'LOC',
    u'MONEY',
    u'NORP',
    u'ORDINAL',
    u'ORG',
    u'PERCENT',
    u'PERSON',
    u'PRODUCT',
    u'QUANTITY',
    u'TIME',
    u'WORK_OF_ART',
}

In [12]:
def shared_keys(d1, d2):
    """Return the number of keys that ``d1`` and ``d2`` have in common.

    Any argument that is not a dict (e.g. None or NaN for a question without
    entities) is treated as an empty dict, so the result is 0 in that case.
    """
    if not isinstance(d1, dict) or not isinstance(d2, dict):
        return 0
    return sum(1 for key in d1 if key in d2)


def _first_valid_is_str(column):
    """Return True when the first non-null entry of ``column`` is a string.

    Returns False for a column without any non-null entry.
    """
    non_null = column.dropna()
    return len(non_null) > 0 and isinstance(non_null.iloc[0], str)


def get_shared_entities(df):
    """Add the column ``shared_entities`` to ``df`` in place.

    Requires the columns ``entities1`` and ``entities2``; when either is
    missing a warning is logged and ``df`` is left unchanged. If the first
    non-null entry of an entity column is a string, the whole column is first
    converted with ``string_to_dict`` (and overwritten in ``df``). The feature
    itself is ``shared_keys(entities1, entities2)`` per row.

    Parameters
        df: dataframe to extend

    Returns
        none: the dataframe is modified in place
    """
    if 'entities1' not in df.columns or 'entities2' not in df.columns:
        logging.warning('No entity dicts found in provided dataframe')
        return

    # A column without any non-null entry has nothing to convert; checking via
    # the helper avoids indexing the column with a missing first valid index.
    if _first_valid_is_str(df['entities1']):
        logging.info('converting entity 1 string representation to dictionaries')
        df['entities1'] = df['entities1'].progress_apply(lambda x: string_to_dict(x))
    if _first_valid_is_str(df['entities2']):
        logging.info('converting entity 2 string representation to dictionaries')
        df['entities2'] = df['entities2'].progress_apply(lambda x: string_to_dict(x))

    logging.info('looking up shared entities')
    df['shared_entities'] = df.progress_apply(
        lambda row: shared_keys(row['entities1'], row['entities2']),
        axis=1,
    )

### Word Count Diff

In [13]:
def count_diff(q1, q2):
    """Return the absolute difference between the token counts of two questions.

    Tokens are obtained by splitting each question on single spaces.
    """
    tokens_q1 = q1.split(' ')
    tokens_q2 = q2.split(' ')
    return abs(len(tokens_q1) - len(tokens_q2))


def get_word_count_diff(df):
    """Add the column ``word_count_diff`` to ``df`` in place.

    The value is ``count_diff(question1, question2)`` for each row.
    """
    def row_diff(row):
        return count_diff(row['question1'], row['question2'])

    df['word_count_diff'] = df.progress_apply(row_diff, axis=1)

### Levenshtein Distance

In [14]:
def get_levenshtein_distance(df):
    """Add the column ``levenshtein`` to ``df`` in place.

    Despite the name, the value is not an edit distance: it is the similarity
    ratio of ``difflib.SequenceMatcher`` for ``question1`` vs ``question2``,
    a float in [0, 1] where 1.0 means identical strings.

    Parameters
        df: dataframe with string columns ``question1`` and ``question2``

    Returns
        none: the dataframe is modified in place
    """
    # TODO: discuss if spaces should be considered in the distance calculation, add 'lambda x: x == " "' as first parameter to exclude spaces
    def similarity(row):
        # autojunk=False: the default heuristic ignores characters that occur
        # frequently once the second string has 200+ characters, which makes
        # long, nearly identical questions look dissimilar.
        matcher = difflib.SequenceMatcher(None,
                                          row['question1'],
                                          row['question2'],
                                          autojunk=False)
        return matcher.ratio()

    df['levenshtein'] = df.progress_apply(similarity, axis=1)

## Helpers

In [27]:
def lower_list(_list):
    """Return a new list holding ``item.lower()`` for every item of ``_list``."""
    lowered = []
    for item in _list:
        lowered.append(item.lower())
    return lowered

In [28]:
def string_to_dict(dict_string):
    """Parse the string representation of a dict back into a dict.

    The string is parsed as a Python literal, which handles single-quoted
    and ``u''``-prefixed keys such as ``"{u'ORG': 1, u'GPE': 2}"`` without
    altering their content. Strings that are not valid Python literals fall
    back to the previous behaviour: quotes are rewritten to JSON form and the
    result is parsed with ``json.loads``.

    Parameters
        dict_string: value to parse; anything that is not a str (e.g. NaN or
                     None for a missing entry) is ignored

    Returns
        the parsed object (a dict for dict-formatted input), or None when
        ``dict_string`` is not a str

    Raises
        ValueError: (json.JSONDecodeError) if neither parser accepts the string
    """
    if not isinstance(dict_string, str):
        return None

    try:
        return ast.literal_eval(dict_string)
    except (ValueError, SyntaxError, TypeError):
        # Not a Python literal (e.g. JSON with true/false/null):
        # convert to proper json format and parse that instead.
        json_string = dict_string.replace("'", '"').replace('u"', '"')
        return json.loads(json_string)

## Export

In [81]:
# Compute the full feature set for the train and the test frame (in that
# order) and write each enriched frame to its CSV file without the index.
EXPORT_FEATURES = ['last_char', 'avg_shared_words', 'word_count_diff', 'shared_entities', 'levenshtein']
for frame, csv_path in ((df_train, "train_features.csv"), (df_test, "test_features.csv")):
    get_features(frame, EXPORT_FEATURES)
    frame.to_csv(csv_path, index=False)

INFO:root:getting last_char
INFO:root:getting avg_shared_words


INFO:root:getting word_count_diff


INFO:root:getting shared_entities
INFO:root:getting levenshtein


INFO:root:feature extraction done
INFO:root:getting last_char
INFO:root:getting avg_shared_words


INFO:root:getting word_count_diff


INFO:root:getting shared_entities
INFO:root:getting levenshtein


INFO:root:feature extraction done
