In [2]:
import ast
import difflib
import json
import logging
import re
import string

import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook

from nltk import ngrams
from nltk.corpus import stopwords

In [3]:
# init logging
# Use the root logger and let DEBUG-and-above records through.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# init tqdm
# Register tqdm's pandas integration, which provides the progress_apply calls used
# by the feature functions. get_ipython only exists inside IPython; the NameError
# fallback covers plain Python runs. 'ZMQInteractiveShell' is the shell class name
# checked for to pick the notebook progress bar.
try:
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        tqdm_notebook().pandas()
    else:
        tqdm.pandas()
except NameError:
    tqdm.pandas()
    
# init stopwords
# English stopword list from NLTK, kept as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))




#### Load raw data

In [5]:
# Read the train/test question pairs and the separately stored training labels
# from the ../data directory (relative to the notebook's working directory).
df_train = pd.read_csv('../data/train_data.csv')
df_test = pd.read_csv('../data/test_data.csv')
df_train_labels = pd.read_csv('../data/train_labels.csv')

In [6]:
# Discard the 'is_duplicate' column that ships with train_data.csv and merge in
# train_labels.csv instead. merge() is called without `on`, so pandas joins on
# every column name the two frames have in common.
# NOTE(review): presumably the only shared column is the pair id - confirm.
df_train = df_train.drop('is_duplicate', axis = 1)
df_train = df_train.merge(df_train_labels)

# Drop rows with missing Qs
# (how='any' removes a row as soon as any of its columns is NaN, not only the questions)
df_train = df_train.dropna(axis=0, how='any')

#### Checklist:
- Last Character (questions are more likely to be duplicates if they end with the same punctuation mark)
- Average Shared Words - Words shared by question pairs (more words shared = higher chance of duplicate)
- Shared Entities (Enzo-engineered entities using spaCy)
- Length (word count) - Words per sentence (questions with very different lengths are unlikely to be duplicates)
- Levenshtein Features - How many characters differ between two questions (more edits required = more dissimilar)
- Tf-Idf 
- LDA (topic modelling)


https://www.linkedin.com/pulse/kaggle-quora-question-pairs-mar-2017-may-priscilla-li/

## Convenience Method
Rerun the cell below if you make an adjustment to any of the feature methods. See the examples below for how to use it.

In [20]:
# Registry mapping a feature name (as passed to get_features) to the function that
# computes it. Each function receives the dataframe as its only argument.
# NOTE: the functions referenced here are defined in the "Features" section further
# down in the notebook, so those cells have to be executed before this one.
FEATURE_FUNCTIONS = {
    'last_char': get_last_char,
    'avg_shared_words': get_shared_words,
    'shared_word_percent': shared_word_pcnt,
    'shared_bigrams': get_shared_bigrams,
    'shared_bigrams_percent': get_shared_bigrams_pcnt,
    'shared_trigrams': get_shared_trigrams,
    'shared_trigrams_percent': get_shared_trigrams_pcnt,
    'shared_quadgrams': get_shared_quadgrams,
    'shared_quadgrams_percent': get_shared_quadgrams_pcnt,
    'shared_entities': get_shared_entities,
    'word_count_diff': get_word_count_diff,
    'levenshtein': get_levenshtein_distance,
    'get_tfidf': get_tfidf,
}

def get_features(df, feature_list):
    """
    Convenience method to extract text features in the same way independent of the
    dataframe.

    Every name in feature_list is looked up in FEATURE_FUNCTIONS and the matching
    function is called with df. The feature functions write their result columns
    straight into df, so nothing has to be returned.

    All names are validated before the first feature is computed, so a typo does
    not leave the dataframe half-populated after earlier features already ran.

    Parameters
        df: dataframe the features will be extracted from (modified in place)
        feature_list: list object, containing all features as strings; every
                      entry has to be a key of FEATURE_FUNCTIONS

    Returns
        none: dataframe object is already manipulated and is not needed to be passed back

    Raises
        KeyError: if feature_list contains a name that is not registered
    """
    # Materialise once so that generators survive the validation pass below.
    feature_list = list(feature_list)

    unknown = [feature for feature in feature_list if feature not in FEATURE_FUNCTIONS]
    if unknown:
        raise KeyError('unknown feature(s) {}; available: {}'.format(
            unknown, sorted(FEATURE_FUNCTIONS)))

    for feature in feature_list:
        logging.info('getting {}'.format(feature))
        FEATURE_FUNCTIONS[feature](df)
    logging.info('feature extraction done')

In [153]:
# === Example 1 ===
# Compute a selection of features on the test frame and preview the first rows.
# NOTE(review): the output stored below this cell logs the quadgram features, i.e. it
# appears to come from an earlier run with a different feature list - re-run to refresh.
get_features(df_test, ['last_char', 'avg_shared_words', 'word_count_diff', 'shared_entities', 'levenshtein', 'shared_word_percent'])
df_test.head()

INFO:root:getting shared_quadgrams


INFO:root:getting shared_quadgrams_percent





INFO:root:feature extraction done





Unnamed: 0,test_id,question1,question2,avg_shared_bigrams,shared_bigram_pcnt,avg_shared_trigrams,shared_trigram_pcnt,avg_shared_quadgrams,shared_quadgram_pcnt
0,15,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,2,0.133333,1,0.071429,0,0.0
1,20,Why do rockets look white?,Why are rockets and boosters painted white?,0,0.0,0,0.0,0,0.0
2,21,What's causing someone to be jealous?,What can I do to avoid being jealous of someone?,0,0.0,0,0.0,0,0.0
3,23,How much is 30 kV in HP?,Where can I find a conversion chart for CC to ...,0,0.0,0,0.0,0,0.0
4,34,What is the best travel website in spain?,What is the best travel website?,4,0.666667,3,0.6,2,0.5


In [142]:
# === Example 2 ===
# Entity features are computed on a frame that carries 'entities1'/'entities2' columns.
# NOTE(review): this path is relative to the notebook directory ('data/...') while the
# other cells read from '../data/...' - confirm which location is correct.
df_ents = pd.read_csv('data/test_with_sim_and_ents.csv')
get_features(df_ents, ['shared_entities'])
df_ents.head()

INFO:root:getting shared_entities
INFO:root:converting entity 1 string representation to dictionaries


INFO:root:converting entity 2 string representation to dictionaries


INFO:root:looking up shared entities


INFO:root:feature extraction done


Unnamed: 0.1,Unnamed: 0,test_id,question1,question2,entities1,entities2,similarity_score,shared_entities
0,0,15,invalid,How will a Trump presidency affect the student...,,"{u'ORG': 1, u'GPE': 2}",0.235347,0
1,1,20,Why do rockets look white?,Why are rockets and boosters painted white?,,,0.89227,0
2,2,21,What's causing someone to be jealous?,What can I do to avoid being jealous of someone?,,,0.956441,0
3,3,23,How much is 30 kV in HP?,Where can I find a conversion chart for CC to ...,"{u'PRODUCT': 1, u'QUANTITY': 1}",{u'ORG': 1},0.835993,0
4,4,34,What is the best travel website in spain?,What is the best travel website?,{u'GPE': 1},,0.968766,0


## Features

### Last Character

In [7]:
def get_last_char(df):
    """
    Add a 'last_char' column to df: 1 if question1 and question2 end with the
    same character, 0 otherwise.

    Parameters
        df: dataframe with 'question1' and 'question2' columns (modified in place)
    """
    q1_tail = df['question1'].str[-1:]
    q2_tail = df['question2'].str[-1:]
    same_ending = q1_tail == q2_tail
    df['last_char'] = np.where(same_ending, 1, 0)

### Avg. Shared Words

In [8]:
# TODO: remove punctuation

def shared_word_count(q1, q2):
    """
    Count the words of q1 that also occur in q2.

    Both questions are split on single spaces and normalised with lower_list.
    Every occurrence in q1 counts, so a word repeated in q1 is counted each time.

    Parameters
        q1: first question as string
        q2: second question as string

    Returns
        int: number of q1 tokens that are also present in q2
    """
    # Build the q2 vocabulary once; before, q2 was split and normalised again for
    # every single word of q1.
    q2_words = set(lower_list(q2.split(' ')))
    return sum(1 for word in lower_list(q1.split(' ')) if word in q2_words)

def get_shared_words(df):
    """
    Add an 'avg_shared_words' column to df holding, per row, the
    shared_word_count of question1 and question2.
    """
    def _row_count(row):
        return shared_word_count(row['question1'], row['question2'])

    df['avg_shared_words'] = df.progress_apply(_row_count, axis=1)
    
# Shared word percentage (0-100%)   
def avg_word_count(q1, q2):
    """Return the mean number of space-separated tokens of q1 and q2 as a float."""
    tokens_q1 = lower_list(q1.split(' '))
    tokens_q2 = lower_list(q2.split(' '))
    total = len(tokens_q1) + len(tokens_q2)
    return total / 2.0
    
def shared_word_pcnt(df):
    """
    Add a 'shared_words_pcnt' column to df: the shared word count of each
    question pair divided by the pair's average word count.
    """
    def _row_share(row):
        shared = shared_word_count(row['question1'], row['question2'])
        average = avg_word_count(row['question1'], row['question2'])
        return shared / average

    df['shared_words_pcnt'] = df.progress_apply(_row_share, axis=1)
    

### Shared ngrams

In [9]:
def shared_ngram_count(q1, q2, n):
    """
    Count the word n-grams of q1 that also occur in q2.

    Both questions are split on single spaces and normalised with lower_list
    before the n-grams are built. Every occurrence in q1 counts.

    Parameters
        q1: first question as string
        q2: second question as string
        n: n-gram size (2 = bigrams, 3 = trigrams, ...)

    Returns
        int: number of q1 n-grams that are also present in q2
    """
    # Collect the q2 n-grams once; before, the q2 generator was rebuilt and
    # scanned again for every single n-gram of q1.
    q2_ngrams = set(ngrams(lower_list(q2.split(' ')), n))
    return sum(1 for gram in ngrams(lower_list(q1.split(' ')), n) if gram in q2_ngrams)

def get_shared_ngrams(df, n):
    """
    Add an 'avg_shared_<n>grams' column to df holding, per row, the number of
    word n-grams of question1 that also occur in question2.

    Parameters
        df: dataframe with 'question1' and 'question2' columns (modified in place)
        n: n-gram size (2 = bigrams, 3 = trigrams, ...)
    """
    # Use the generic n-gram counter and forward n; the former call targeted a
    # bigram-only helper that no longer exists in this notebook.
    df['avg_shared_{}grams'.format(n)] = df.progress_apply(
        lambda row: shared_ngram_count(row['question1'], row['question2'], n), axis=1)

# percentage
def ngram_count(q1, q2, n):
    """
    Return the average number of word n-grams of q1 and q2.

    When neither question yields an n-gram the result is 1, so the value can
    be used as a divisor.
    """
    q1_total = sum(1 for _ in ngrams(lower_list(q1.split(' ')), n))
    q2_total = sum(1 for _ in ngrams(lower_list(q2.split(' ')), n))
    mean_total = (q1_total + q2_total) / 2.0
    if mean_total > 0:
        return mean_total
    return 1
            
def shared_ngram_pcnt(df, n):
    """
    Add a 'shared_<n>grams_pcnt' column to df: the shared n-gram count of each
    question pair divided by the pair's average n-gram count.
    """
    def _row_share(row):
        shared = shared_ngram_count(row['question1'], row['question2'], n)
        average = ngram_count(row['question1'], row['question2'], n)
        return shared / average

    df['shared_{}grams_pcnt'.format(n)] = df.progress_apply(_row_share, axis=1)

In [10]:
def get_shared_bigrams(df):
    """Add the shared 2-gram count column to df."""
    get_shared_ngrams(df, n=2)


def get_shared_bigrams_pcnt(df):
    """Add the shared 2-gram share column to df."""
    shared_ngram_pcnt(df, n=2)


def get_shared_trigrams(df):
    """Add the shared 3-gram count column to df."""
    get_shared_ngrams(df, n=3)


def get_shared_trigrams_pcnt(df):
    """Add the shared 3-gram share column to df."""
    shared_ngram_pcnt(df, n=3)


def get_shared_quadgrams(df):
    """Add the shared 4-gram count column to df."""
    get_shared_ngrams(df, n=4)


def get_shared_quadgrams_pcnt(df):
    """Add the shared 4-gram share column to df."""
    shared_ngram_pcnt(df, n=4)

### Shared Entities

In [11]:
# Set of named-entity type labels.
# NOTE(review): presumably the label set of the spaCy model used to build the
# entities1/entities2 columns - confirm. Not referenced by the code visible here.
ENTITIES = {u'CARDINAL', u'DATE', u'EVENT', u'FAC', u'GPE', u'LANGUAGE', u'LAW', u'LOC', u'MONEY', u'NORP', u'ORDINAL',
            u'ORG', u'PERCENT', u'PERSON', u'PRODUCT', u'QUANTITY', u'TIME', u'WORK_OF_ART'}

In [12]:
# TODO: Shared entities with half counts
# TODO: Name the entities, if type matches, but instance is different it will be a good indicator

def shared_keys(d1, d2):
    """
    Return the number of keys present in both d1 and d2.

    Anything that is not a dict (e.g. NaN for a missing entity cell) is
    treated as an empty dict.
    """
    first = d1 if isinstance(d1, dict) else {}
    second = d2 if isinstance(d2, dict) else {}
    return len(set(first).intersection(second))

def non_shared_keys(d1, d2):
    """
    Return the number of keys that occur in exactly one of d1 and d2.

    Anything that is not a dict (e.g. NaN for a missing entity cell) is
    treated as an empty dict.
    """
    first = d1 if isinstance(d1, dict) else {}
    second = d2 if isinstance(d2, dict) else {}
    only_in_one = set(first).symmetric_difference(second)
    return len(only_in_one)

def get_shared_entities(df):
    """
    Add 'shared_entities' and 'non_shared_entities' columns to df.

    The frame needs 'entities1' and 'entities2' columns that hold, per question,
    a dict keyed by entity type (or its string representation as read from a
    CSV, which is converted to dicts in place first). Without these columns a
    warning is logged and df is left untouched.

    Parameters
        df: dataframe with 'entities1' and 'entities2' columns (modified in place)

    Returns
        none
    """
    if 'entities1' not in df.columns or 'entities2' not in df.columns:
        logging.warning('No entity dicts found in provided dataframe')
        return

    for number, column in ((1, 'entities1'), (2, 'entities2')):
        first_valid = df[column].first_valid_index()
        # first_valid_index() is None when the column holds no entities at all;
        # there is nothing to convert then, and indexing with None would raise.
        if first_valid is not None and isinstance(df[column][first_valid], str):
            logging.info('converting entity {} string representation to dictionaries'.format(number))
            df[column] = df[column].progress_apply(string_to_dict)

    logging.info('looking up shared entities')
    df['shared_entities'] = df.progress_apply(
        lambda row: shared_keys(row['entities1'], row['entities2']), axis=1)
    df['non_shared_entities'] = df.progress_apply(
        lambda row: non_shared_keys(row['entities1'], row['entities2']), axis=1)

### Word Count Diff

In [13]:
def count_diff(q1, q2):
    """Return the absolute difference in the number of space-separated tokens of q1 and q2."""
    words_q1 = q1.split(' ')
    words_q2 = q2.split(' ')
    return abs(len(words_q1) - len(words_q2))

def get_word_count_diff(df):
    """Add a 'word_count_diff' column to df holding the count_diff of question1 and question2."""
    def _row_diff(row):
        return count_diff(row['question1'], row['question2'])

    df['word_count_diff'] = df.progress_apply(_row_diff, axis=1)

### Levenshtein Distance

In [14]:
def get_levenshtein_distance(df):
    """
    Add a 'levenshtein' column to df holding difflib's SequenceMatcher ratio of
    question1 and question2.

    Despite the name, the stored value is a similarity ratio (1.0 for identical
    strings, 0.0 for strings without a common character), not an edit distance.
    """
    # TODO: discuss if spaces should be considered in the distance calculation, add 'lambda x: x == " "' as first parameter to exclude spaces
    def _similarity(row):
        matcher = difflib.SequenceMatcher(None, row['question1'], row['question2'])
        return matcher.ratio()

    df['levenshtein'] = df.progress_apply(_similarity, axis=1)

## Tf-Idf

In [15]:
# FIXME: code mostly copied, only adjusted the punctuation, stopwords and empty strings

from collections import Counter
def get_weights(df):
    """
    Compute a weight per word from all questions in df.

    All questions are concatenated, lower-cased and split on whitespace; every
    token is stripped of punctuation, stopwords and empty tokens are dropped and
    the remaining words are counted. Rare words get a higher weight than
    frequent ones.

    Parameters
        df: dataframe with 'question1' and 'question2' columns

    Returns
        dict: word -> weight, 0.0 for words seen only once and
              1 / (count + 5000) otherwise
    """
    def get_weight(count, eps=10000, min_count=2):
        # If a word appears only once, we ignore it completely (likely a typo)
        if count < min_count:
            return 0.0
        # Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
        return 1.0 / (count + eps)

    eps = 5000
    # Skip missing questions (NaN); joining them with the strings would raise a TypeError.
    questions = [q for q in list(df.question1) + list(df.question2) if pd.notnull(q)]
    words = " ".join(questions).lower().split()
    words = (remove_punctuation(w) for w in words)
    # Drop stopwords and tokens that consisted of punctuation only.
    counts = Counter(w for w in words if w and w not in STOPWORDS)
    weights = {word: get_weight(count, eps) for word, count in counts.items()}
    return weights
    # Debugging aid, kept from the original version:
    # print('Most common words and weights: \n')
    # print(sorted(weights.items(), key=lambda x: x[1] if x[1] > 0 else 9999)[:10])
    # print('\nLeast common words and weights: ')
    # (sorted(weights.items(), key=lambda x: x[1], reverse=True)[:10])

In [16]:
# FIXME: code copied

def tfidf_word_match_share(row, weights):
    """
    Return the weighted share of words that question1 and question2 have in common.

    The words of both questions are normalised the same way get_weights builds
    its keys (lower-case, punctuation removed, stopwords dropped); otherwise a
    token such as "white?" would never find the weight stored for "white".

    Parameters
        row: dataframe row (or dict) with 'question1' and 'question2'
        weights: dict word -> weight as returned by get_weights

    Returns
        number: summed weight of the shared words (counted once per question)
                divided by the summed weight of all words of both questions;
                0 if either question has no content words or no word carries
                any weight
    """
    def _content_words(question):
        # str() keeps missing questions (NaN) from raising
        tokens = (remove_punctuation(w) for w in str(question).lower().split())
        return set(w for w in tokens if w and w not in STOPWORDS)

    q1words = _content_words(row['question1'])
    q2words = _content_words(row['question2'])
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    shared_weights = ([weights.get(w, 0) for w in q1words if w in q2words] +
                      [weights.get(w, 0) for w in q2words if w in q1words])
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    total = np.sum(total_weights)
    if total == 0:
        # None of the words has a weight (all unknown or seen only once);
        # dividing would yield NaN.
        return 0
    return np.sum(shared_weights) / total

def get_tfidf(df):
    """
    Add a 'tfidf_word_match_share' column to df.

    The word weights are derived from df itself via get_weights and then used
    to score every question pair with tfidf_word_match_share.
    """
    word_weights = get_weights(df)

    def _row_share(row):
        return tfidf_word_match_share(row, word_weights)

    df['tfidf_word_match_share'] = df.progress_apply(_row_share, axis=1)

## Helpers

In [17]:
def lower_list(_list):
    """
    Return a new list with every string of _list stripped of punctuation and
    lower-cased.

    The result has the same length as the input; a token that consists of
    punctuation only becomes an empty string.

    Parameters
        _list: list of strings (e.g. the result of question.split(' '))

    Returns
        list: normalised strings in the original order
    """
    # remove_punctuation works on a single string, so it has to be applied per
    # element rather than to the list as a whole.
    return [remove_punctuation(x).lower() for x in _list]

In [18]:
def string_to_dict(dict_string):
    """
    Convert the string representation of a dict (as stored in a CSV cell) back
    into a dict.

    Parameters
        dict_string: e.g. "{u'ORG': 1, u'GPE': 2}"; anything that is not a
                     string (such as NaN for an empty cell) is ignored

    Returns
        the parsed object, or None if dict_string is not a string

    Raises
        ValueError: if the string can be parsed neither as a Python literal
                    nor as JSON
    """
    if not isinstance(dict_string, str):
        return None
    try:
        # Parses the Python repr directly, including u'' prefixes, without the
        # quote rewriting below (which would turn a key like 'menu' into 'men').
        return ast.literal_eval(dict_string)
    except (ValueError, SyntaxError, TypeError):
        # Fall back to the former JSON route for input that is JSON rather than
        # a Python literal (e.g. containing null/true/false).
        return json.loads(dict_string.replace("'", '"').replace('u"', '"'))

In [19]:
# Matches any single character of string.punctuation.
_PUNCTUATION_RE = re.compile('[{}]'.format(re.escape(string.punctuation)))


def remove_punctuation(string_):
    """
    Return string_ without any ASCII punctuation characters.

    Uses a regular expression instead of the two-argument str.translate, which
    only exists for Python 2 byte strings; this form works for Python 2 and 3
    strings alike.

    Parameters
        string_: the string to clean

    Returns
        string: string_ with all characters of string.punctuation removed

    Raises
        TypeError: if string_ is not a string (e.g. a list of strings)
    """
    return _PUNCTUATION_RE.sub('', string_)

## Export

In [21]:
# Feature names handed to get_features by the cells below; un-comment an entry to
# enable it. Every active name has to be a key of FEATURE_FUNCTIONS.
methods_ = [
#    'last_char',
#    'avg_shared_words',
#    'word_count_diff',
   'shared_entities',
#    'levenshtein',
#    'shared_word_percent',
#    'shared_bigrams',
#    'shared_bigrams_percent',
#    'shared_trigrams',
#    'shared_trigrams_percent',
#    'shared_quadgrams',
#    'shared_quadgrams_percent',
#    'get_tfidf',
]

In [None]:
# train_data
# Compute the features selected in methods_ on df_train and write the whole frame
# (without the row index) to the current working directory.
# NOTE(review): the file names say "tfidf" regardless of what methods_ selects, and the
# later cells read '../data/train_features.csv' / '../data/test_features.csv' - confirm
# the intended target files.
get_features(df_train, methods_)
df_train.to_csv("tfidf_train_features.csv", index=False)

# test_data
# Same for the test frame.
get_features(df_test, methods_)
df_test.to_csv("tfidf_test_features.csv", index=False)

INFO:root:getting get_tfidf


INFO:root:feature extraction done





INFO:root:getting get_tfidf


### Add entity features to exported csv

In [41]:
# Load the training feature file (this rebinds df_train, replacing the raw frame
# loaded at the top) and summarise its numeric columns.
# NOTE(review): the tuple output stored below does not look like describe() output;
# it presumably stems from an earlier version of this cell.
df_train = pd.read_csv('../data/train_features.csv')
df_train.describe()

(323162, Index([u'id', u'question1', u'question2', u'is_duplicate', u'last_char',
       u'avg_shared_words', u'word_count_diff', u'levenshtein',
       u'shared_words_pcnt', u'avg_shared_trigrams', u'shared_bigram_pcnt',
       u'shared_trigram_pcnt', u'avg_shared_quadgrams',
       u'shared_quadgram_pcnt', u'shared_entities', u'non_shared_entities'],
      dtype='object'))


Unnamed: 0,id,is_duplicate,last_char,avg_shared_words,word_count_diff,levenshtein,shared_words_pcnt,avg_shared_trigrams,shared_bigram_pcnt,shared_trigram_pcnt,avg_shared_quadgrams,shared_quadgram_pcnt
count,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0
mean,202197.35813,0.368834,0.980159,4.790084,3.699067,0.579246,0.451941,1.36564,0.246312,0.153666,0.875412,0.102417
std,116794.725695,0.482489,0.139455,3.451633,4.842665,0.219205,0.257909,2.661596,0.25978,0.23907,2.310736,0.214343
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,100962.25,0.0,1.0,2.0,1.0,0.40367,0.25,0.0,0.0,0.0,0.0,0.0
50%,202164.5,0.0,1.0,4.0,2.0,0.582278,0.444444,0.0,0.166667,0.0,0.0,0.0
75%,303538.75,1.0,1.0,6.0,5.0,0.756757,0.645161,2.0,0.4,0.235294,1.0,0.08
max,404289.0,1.0,1.0,61.0,223.0,1.0,1.333333,52.0,1.297297,1.257143,48.0,1.212121


In [42]:
# Compute the entity features on the frame that carries the entity columns and
# attach them to the training features via the pair id.
df_train_ents = pd.read_csv('../data/train_with_sim_and_ents_long.csv')
# Request the entity features explicitly: the merge below needs exactly these
# columns, independent of what is currently enabled in methods_.
get_features(df_train_ents, ['shared_entities'])
# Drop entity columns left over from a previous run of this cell; merging on top
# of them would create *_x/*_y columns and break the attribute access below.
df_train = df_train.drop(['shared_entities', 'non_shared_entities'], axis=1, errors='ignore')
df_train = df_train.merge(df_train_ents.loc[:, ['id', 'shared_entities', 'non_shared_entities']], on='id', how='left')
# Pairs without a match in the entity file get a count of 0.
df_train.shared_entities = df_train.shared_entities.fillna(0).astype(int)
df_train.non_shared_entities = df_train.non_shared_entities.fillna(0).astype(int)
# index=False: otherwise the row index is stored as an extra 'Unnamed: 0' column
# that is read back as data the next time the file is loaded.
df_train.to_csv('../data/train_features.csv', index=False)
df_train.describe()

Unnamed: 0,id,is_duplicate,last_char,avg_shared_words,word_count_diff,levenshtein,shared_words_pcnt,avg_shared_trigrams,shared_bigram_pcnt,shared_trigram_pcnt,avg_shared_quadgrams,shared_quadgram_pcnt,shared_entities,non_shared_entities
count,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0,323162.0
mean,202197.35813,0.368834,0.980159,4.790084,3.699067,0.579246,0.451941,1.36564,0.246312,0.153666,0.875412,0.102417,0.384142,0.577404
std,116794.725695,0.482489,0.139455,3.451633,4.842665,0.219205,0.257909,2.661596,0.25978,0.23907,2.310736,0.214343,0.586787,0.871758
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,100962.25,0.0,1.0,2.0,1.0,0.40367,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,202164.5,0.0,1.0,4.0,2.0,0.582278,0.444444,0.0,0.166667,0.0,0.0,0.0,0.0,0.0
75%,303538.75,1.0,1.0,6.0,5.0,0.756757,0.645161,2.0,0.4,0.235294,1.0,0.08,1.0,1.0
max,404289.0,1.0,1.0,61.0,223.0,1.0,1.333333,52.0,1.297297,1.257143,48.0,1.212121,6.0,8.0


In [43]:
# Load the test feature file (this rebinds df_test, replacing the raw frame loaded
# at the top) and summarise its numeric columns.
df_test = pd.read_csv('../data/test_features.csv')
df_test.describe()

Unnamed: 0,test_id,last_char,avg_shared_words,word_count_diff,levenshtein,shared_words_pcnt,avg_shared_trigrams,shared_bigram_pcnt,shared_trigram_pcnt,avg_shared_quadgrams,shared_quadgram_pcnt
count,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0
mean,201935.133447,0.980426,4.784866,3.684787,0.578928,0.452134,1.358985,0.246322,0.153545,0.870116,0.102297
std,116366.394811,0.138534,3.449371,4.825042,0.218786,0.257994,2.654754,0.260056,0.239421,2.303912,0.214581
min,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,101527.0,1.0,3.0,1.0,0.404145,0.25,0.0,0.0,0.0,0.0,0.0
50%,202071.5,1.0,4.0,2.0,0.581197,0.444444,0.0,0.166667,0.0,0.0,0.0
75%,301947.75,1.0,6.0,5.0,0.755556,0.647059,2.0,0.4,0.235294,1.0,0.08
max,404278.0,1.0,50.0,213.0,1.0,1.285714,42.0,1.166667,1.0,40.0,1.0


In [50]:
# Compute the entity features on the frame that carries the entity columns and
# attach them to the test features via test_id.
df_test_ents = pd.read_csv('../data/test_with_sim_and_ents_long.csv')
# Request the entity features explicitly: the merge below needs exactly these
# columns, independent of what is currently enabled in methods_.
get_features(df_test_ents, ['shared_entities'])
# Drop entity columns left over from a previous run of this cell; merging on top
# of them would create *_x/*_y columns and break the attribute access below.
df_test = df_test.drop(['shared_entities', 'non_shared_entities'], axis=1, errors='ignore')
df_test = df_test.merge(df_test_ents.loc[:, ['test_id', 'shared_entities', 'non_shared_entities']], on='test_id', how='left')
# Pairs without a match in the entity file get a count of 0.
df_test.shared_entities = df_test.shared_entities.fillna(0).astype(int)
df_test.non_shared_entities = df_test.non_shared_entities.fillna(0).astype(int)
# Write back to the TEST feature file (the file this frame was loaded from); the
# train feature file must not be overwritten with test rows.
# index=False keeps the row index from being stored as an extra 'Unnamed: 0' column.
df_test.to_csv('../data/test_features.csv', index=False)
df_test.describe()

Unnamed: 0,test_id,last_char,avg_shared_words,word_count_diff,levenshtein,shared_words_pcnt,avg_shared_trigrams,shared_bigram_pcnt,shared_trigram_pcnt,avg_shared_quadgrams,shared_quadgram_pcnt,shared_entities,non_shared_entities
count,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0,81126.0
mean,201935.133447,0.980426,4.784866,3.684787,0.578928,0.452134,1.358985,0.246322,0.153545,0.870116,0.102297,0.386572,0.576622
std,116366.394811,0.138534,3.449371,4.825042,0.218786,0.257994,2.654754,0.260056,0.239421,2.303912,0.214581,0.586796,0.869147
min,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,101527.0,1.0,3.0,1.0,0.404145,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,202071.5,1.0,4.0,2.0,0.581197,0.444444,0.0,0.166667,0.0,0.0,0.0,0.0,0.0
75%,301947.75,1.0,6.0,5.0,0.755556,0.647059,2.0,0.4,0.235294,1.0,0.08,1.0,1.0
max,404278.0,1.0,50.0,213.0,1.0,1.285714,42.0,1.166667,1.0,40.0,1.0,5.0,7.0
