### Feature Engineering

In [0]:
!pip install fuzzywuzzy
!pip install python-Levenshtein

!pip install gensim
!pip install pyemd



In [0]:
!wget 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'

--2018-12-13 16:54:07--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.224.195
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.224.195|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2018-12-13 16:54:45 (41.8 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [0]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import nltk
from nltk import ngrams, bigrams
import gensim
from nltk.corpus import stopwords
from collections import Counter

from functools import partial
from nltk import word_tokenize
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

In [0]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
train_df = pd.read_csv('cleaned_train_data.csv')
test_df = pd.read_csv('cleaned_test_data.csv')

Create feature dataframes

In [0]:
train_features = pd.DataFrame()
test_features = pd.DataFrame()

train_features['id'] = train_df['id']
test_features['test_id'] = test_df['test_id']

## Basic Features

In [0]:
train_features['len_q1'] = train_df['question1'].apply(lambda x: len(str(x)))
train_features['len_q2'] = train_df['question2'].apply(lambda x: len(str(x)))
train_features['len_diff'] = train_features.len_q1 - train_features.len_q2
train_features['len_char_q1'] = train_df['question1'].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
train_features['len_char_q2'] = train_df['question2'].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
train_features['len_char_diff'] = train_features.len_char_q1 - train_features.len_char_q2
train_features['len_word_q1'] = train_df['question1'].apply(lambda x: len(str(x).split()))
train_features['len_word_q2'] = train_df['question2'].apply(lambda x: len(str(x).split()))
train_features['intersection'] = train_df.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)

train_features['avg_wordlength_q1'] = train_features['len_char_q1'] / train_features['len_word_q1']
train_features['avg_wordlength_q2'] = train_features['len_char_q2'] / train_features['len_word_q2']
train_features['avg_worddiff'] = abs(train_features['avg_wordlength_q1'] - train_features['avg_wordlength_q2'])

train_features['dup_q1'] = train_df['question1'].duplicated()
train_features['dup_q2'] = train_df['question2'].duplicated()
train_features['dup_q1q2'] = train_features['dup_q1'] & train_features['dup_q2']

In [0]:
test_features['len_q1'] = test_df['question1'].apply(lambda x: len(str(x)))
test_features['len_q2'] = test_df['question2'].apply(lambda x: len(str(x)))
test_features['len_diff'] = test_features.len_q1 - test_features.len_q2
test_features['len_char_q1'] = test_df['question1'].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
test_features['len_char_q2'] = test_df['question2'].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
test_features['len_char_diff'] = test_features.len_char_q1 - test_features.len_char_q2
test_features['len_word_q1'] = test_df['question1'].apply(lambda x: len(str(x).split()))
test_features['len_word_q2'] = test_df['question2'].apply(lambda x: len(str(x).split()))
test_features['intersection'] = test_df.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)

test_features['avg_wordlength_q1'] = test_features['len_char_q1'] / test_features['len_word_q1']
test_features['avg_wordlength_q2'] = test_features['len_char_q2'] / test_features['len_word_q2']
test_features['avg_worddiff'] = abs(test_features['avg_wordlength_q1'] - test_features['avg_wordlength_q2'])

test_features['dup_q1'] = test_df['question1'].duplicated()
test_features['dup_q2'] = test_df['question2'].duplicated()
test_features['dup_q1q2'] = test_features['dup_q1'] & test_features['dup_q2']

## Fuzzywuzzy Features

In [0]:
train_features['fuzz_qratio'] = train_df.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
train_features['fuzz_WRatio'] = train_df.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
train_features['fuzz_partial_ratio'] = train_df.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
train_features['fuzz_partial_token_set_ratio'] = train_df.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
train_features['fuzz_partial_token_sort_ratio'] = train_df.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
train_features['fuzz_token_set_ratio'] = train_df.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
train_features['fuzz_token_sort_ratio'] = train_df.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

In [0]:
test_features['fuzz_qratio'] = test_df.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
test_features['fuzz_WRatio'] = test_df.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
test_features['fuzz_partial_ratio'] = test_df.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
test_features['fuzz_partial_token_set_ratio'] = test_df.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
test_features['fuzz_partial_token_sort_ratio'] = test_df.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
test_features['fuzz_token_set_ratio'] = test_df.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
test_features['fuzz_token_sort_ratio'] = test_df.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

In [0]:
train_features.to_csv('final_train_basic_fuzz.csv', index=False)
test_features.to_csv('final_test_basic_fuzz.csv', index=False)

## Other Features

In [0]:
# Is the first word the same?
def shared_first_word(entry):
    q1_fw = entry['question1'].apply(lambda x: str(x).split()[0])
    q2_fw = entry['question2'].apply(lambda x: str(x).split()[0])
    
    return q1_fw == q2_fw
  
# Is the last word the same?
def shared_last_word(entry):
    q1_lw = entry['question1'].apply(lambda x: str(x).split()[len(str(x).split()) - 1])
    q2_lw = entry['question2'].apply(lambda x: str(x).split()[len(str(x).split()) - 1])

    q1_lw = entry['question1'].apply(lambda x: str(x).split()[len(str(x).split()) - 1])
    q2_lw = entry['question2'].apply(lambda x: str(x).split()[len(str(x).split()) - 1])
    
    return q1_lw == q2_lw

# Is both the first and last word the same?
def shared_first_last(entry):
    q1_fw = entry['question1'].apply(lambda x: str(x).split()[0])
    q2_fw = entry['question2'].apply(lambda x: str(x).split()[0])
    same_fw = q1_fw == q2_fw
    
    q1_lw = entry['question1'].apply(lambda x: str(x).split()[len(str(x).split()) - 1])
    q2_lw = entry['question2'].apply(lambda x: str(x).split()[len(str(x).split()) - 1])
    same_lw = q1_lw == q2_lw
    
    return same_fw & same_lw

# Do the questions have the same content?
def same_question(entry):
    q1 = entry['question1'].apply(lambda x: str(x).split())
    q2 = entry['question2'].apply(lambda x: str(x).split())
    
    return q1 == q2
 
# Intersection of twograms in the questions
def shared_2grams(entry):
    q1 = list(bigrams(str(entry['question1']).lower().split()))
    q2 = list(bigrams(str(entry['question2']).lower().split()))
    
    return len(set(q1).intersection(set(q2)))

# Get amount of shared 3-length ngrams in questions
def shared_3grams(entry):
    q1 = list(ngrams(str(entry['question1']).lower().split(), 3))
    q2 = list(ngrams(str(entry['question2']).lower().split(), 3))
    
    return len(set(q1).intersection(set(q2)))

In [0]:
train_features['shared_first_word'] = shared_first_word(train_df)
train_features['shared_last_word'] = shared_last_word(train_df)
train_features['shared_first_last'] = shared_first_last(train_df)
train_features['same_question'] = same_question(train_df)
train_features['shared_2grams'] = shared_2grams(train_df)
train_features['shared_3grams'] = shared_3grams(train_df)

In [0]:
test_features['shared_first_word'] = shared_first_word(test_df)
test_features['shared_last_word'] = shared_last_word(test_df)
test_features['shared_first_last'] = shared_first_last(test_df)
test_features['same_question'] = same_question(test_df)
test_features['shared_2grams'] = shared_2grams(test_df)
test_features['shared_3grams'] = shared_3grams(test_df)

In [0]:
train_features.shape

(323053, 28)

In [0]:
test_features.shape

(81126, 28)

In [0]:
train_features.to_csv('final_train_basic_fuzz.csv', index=False)
test_features.to_csv('final_test_basic_fuzz.csv', index=False)

## TF IDF Features

In [0]:
# Note: this code is not ours, due to lack of time we had to borrow it from:
# https://github.com/HouJP/kaggle-quora-question-pairs

class FeatureCreator(object):
  
    def __init__(self, df, q1_column='question1', q2_column='question2'):
        self.df = df
        self.q1_column = q1_column
        self.q2_column = q2_column
        self.stop_words = set(stopwords.words('english'))
        self.w2c_model = None

    def add_additional_features(self):
        """Compute TF-IDF and some other interesting features
        """
        self.df['q1_words'] = self.df[self.q1_column].map(lambda x: str(x).lower().split())
        self.df['q2_words'] = self.df[self.q2_column].map(lambda x: str(x).lower().split())
        questions = pd.Series(self.df['q1_words'].tolist() + self.df['q2_words'].tolist())
        words = [word for question in questions for word in question]
        word_count = Counter(words)
        weights = {word: self.compute_weight(count) for word, count in word_count.items()}
        
        # Add features
        self.df['tfidf_word_match'] = self.df.apply(
            partial(self.tfidf_word_match_share, weights=weights, ignore_stop_words=True), axis=1, raw=True
        )
        self.df['tfidf_word_match_stops'] = self.df.apply(
            partial(self.tfidf_word_match_share, weights=weights, ignore_stop_words=False), axis=1, raw=True
        )
        self.df['jaccard_similarity'] = self.df.apply(partial(self.jaccard_similarity), axis=1, raw=True)
        self.df['word_count_ratio'] = self.df.apply(partial(self.word_count_ratio), axis=1, raw=True)
        self.df['unique_word_count_ratio'] = self.df.apply(partial(self.unique_word_count_ratio), axis=1, raw=True)
        self.df['total_unique_words'] = self.df.apply(partial(self.total_unique_words), axis=1, raw=True)
        # Remove columns used for calculations
        self.df.drop('q1_words', axis=1, inplace=True)
        self.df.drop('q2_words', axis=1, inplace=True)

    @staticmethod
    def compute_weight(count, epsilon=10000, min_count=2):
        if count < min_count:
            return .0
        else:
            return 1.0 / (count + epsilon)

    @staticmethod
    def unique_word_count_ratio(row):
        l1 = float(len(set(row['q1_words'])))
        l2 = len(set(row['q2_words']))
        if l2 == 0:
            return np.nan
        if l1 / l2:
            return l2 / l1
        else:
            return l1 / l2

    def tfidf_word_match_share(self, row, weights, ignore_stop_words):
        q1_words = set()
        q2_words = set()
        if ignore_stop_words:
            for word in row['q1_words']:
                q1_words.add(word)
            for word in row['q2_words']:
                q2_words.add(word)
        else:
            for word in row['q1_words']:
                if word not in self.stop_words:
                    q1_words.add(word)
            for word in row['q2_words']:
                if word not in self.stop_words:
                    q2_words.add(word)
        if not q1_words or not q2_words:
            return .0
        shared_weights = np.sum([weights.get(word, .0) for word in q1_words if word in q2_words])
        shared_weights += np.sum([weights.get(word, .0) for word in q2_words if word in q1_words])
        total_weights = np.sum([weights.get(word, .0) for word in q1_words])
        total_weights += np.sum([weights.get(word, .0) for word in q2_words])
        return shared_weights / total_weights

    @staticmethod
    def jaccard_similarity(row):
        words_in_common = set(row['q1_words']).intersection(set(row['q2_words']))
        unique_words = set(row['q1_words']).union(row['q2_words'])
        if not unique_words:
            return 1.0
        return len(words_in_common) / float(len(unique_words))

    @staticmethod
    def word_count_ratio(row):
        l1 = float(len(row['q1_words']))
        l2 = len(row['q2_words'])
        if l2 == 0:
            return np.nan
        if l1 / l2:
            return l2 / l1
        else:
            return l1 / l2

    @staticmethod
    def total_unique_words(row):
        return len(set(row['q1_words']).union(row['q2_words']))

    def add_word2vec_features(self, model_path, model_name='w2v', vector_size=300):
        """ word2vec features require a lot of RAM to be computed
        """
        # Load model and compute Word Mover's Distance
        self.w2c_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
        self.w2c_model.init_sims(replace=True)
        self.df['{}_norm_wmd'.format(model_name)] = self.df.apply(
            lambda x: self.word_mover_distance(x['question1'], x['question2']), axis=1
        )
        self.w2c_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
        self.df['{}_wmd'.format(model_name)] = self.df.apply(
            lambda x: self.word_mover_distance(x['question1'], x['question2']), axis=1
        )
        # Generate vectors from questions
        question1_vectors = np.zeros((self.df.shape[0], vector_size))
        question2_vectors = np.zeros((self.df.shape[0], vector_size))
        j = 0
        for i, row in self.df.iterrows():
            question1_vectors[j, :] = self.text2vec(row[self.q1_column])
            question2_vectors[j, :] = self.text2vec(row[self.q2_column])
            j += 1
        self.w2c_model = None  # Save up some RAM
        # Compute several features using vectors
        self.df['{}_cosine_distance'.format(model_name)] = [
            cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))
        ]
        self.df['{}_cityblock_distance'.format(model_name)] = [
            cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))
        ]
        self.df['{}_jaccard_distance'.format(model_name)] = [
            jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))
        ]
        self.df['{}_canberra_distance'.format(model_name)] = [
            canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))
        ]
        self.df['{}_euclidean_distance'.format(model_name)] = [
            euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))
        ]
        self.df['{}_minkowski_distance'.format(model_name)] = [
            minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))
        ]
        self.df['{}_braycurtis_distance'.format(model_name)] = [
            braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))
        ]
        self.df['{}_skew_q1vec'.format(model_name)] = [skew(x) for x in np.nan_to_num(question1_vectors)]
        self.df['{}_skew_q2vec'.format(model_name)] = [skew(x) for x in np.nan_to_num(question2_vectors)]
        self.df['{}_kur_q1vec'.format(model_name)] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
        self.df['{}_kur_q2vec'.format(model_name)] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

        
    def word_mover_distance(self, text1, text2):
        text1 = [w for w in str(text1).lower().split() if w not in self.stop_words]
        text2 = [w for w in str(text2).lower().split() if w not in self.stop_words]
        return self.w2c_model.wmdistance(text1, text2)

    def text2vec(self, text):
        text = word_tokenize(str(text).lower())
        text = [w for w in text if w not in self.stop_words and w.isalpha()]
        matrix = []
        for w in text:
            try:
                matrix.append(self.w2c_model[w])
            except Exception as e:
                pass
        matrix = np.array(matrix)
        v = matrix.sum(axis=0)
        return v / np.sqrt((v ** 2).sum())

In [0]:
fc = FeatureCreator(train_df)
# fc.add_additional_features()
fc.add_word2vec_features('GoogleNews-vectors-negative300.bin.gz', 'GoogleNews')
print(list(train_df))

  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = np.double(unequal_nonzero.sum()) / np.double(nonzero.sum())
  return l1_diff.sum() / l1_sum.sum()


['id', 'question1', 'question2', 'is_duplicate', 'GoogleNews_norm_wmd', 'GoogleNews_wmd', 'GoogleNews_cosine_distance', 'GoogleNews_cityblock_distance', 'GoogleNews_jaccard_distance', 'GoogleNews_canberra_distance', 'GoogleNews_euclidean_distance', 'GoogleNews_minkowski_distance', 'GoogleNews_braycurtis_distance', 'GoogleNews_skew_q1vec', 'GoogleNews_skew_q2vec', 'GoogleNews_kur_q1vec', 'GoogleNews_kur_q2vec']


In [0]:
final_train_w2v = train_df[['id', 'GoogleNews_norm_wmd', 'GoogleNews_wmd', 'GoogleNews_cosine_distance', 'GoogleNews_cityblock_distance', 'GoogleNews_jaccard_distance', 'GoogleNews_canberra_distance', 'GoogleNews_euclidean_distance', 'GoogleNews_minkowski_distance', 'GoogleNews_braycurtis_distance', 'GoogleNews_skew_q1vec', 'GoogleNews_skew_q2vec', 'GoogleNews_kur_q1vec', 'GoogleNews_kur_q2vec']]
final_train_w2v.to_csv('final_train_w2v.csv', index=False)

In [0]:
fc = FeatureCreator(test_df)
fc.add_word2vec_features('GoogleNews-vectors-negative300.bin.gz', 'GoogleNews')
print(list(test_df))

  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = np.double(unequal_nonzero.sum()) / np.double(nonzero.sum())
  return l1_diff.sum() / l1_sum.sum()


['test_id', 'question1', 'question2', 'GoogleNews_norm_wmd', 'GoogleNews_wmd', 'GoogleNews_cosine_distance', 'GoogleNews_cityblock_distance', 'GoogleNews_jaccard_distance', 'GoogleNews_canberra_distance', 'GoogleNews_euclidean_distance', 'GoogleNews_minkowski_distance', 'GoogleNews_braycurtis_distance', 'GoogleNews_skew_q1vec', 'GoogleNews_skew_q2vec', 'GoogleNews_kur_q1vec', 'GoogleNews_kur_q2vec']


In [0]:
final_train_additional = train_df[['id', 'tfidf_word_match', 'tfidf_word_match_stops', 'jaccard_similarity', 'word_count_ratio', 'unique_word_count_ratio', 'total_unique_words']]
final_train_additional.to_csv('final_train_additional.csv', index=False)

In [0]:
final_test_additional = test_df[['test_id', 'tfidf_word_match', 'tfidf_word_match_stops', 'jaccard_similarity', 'word_count_ratio', 'unique_word_count_ratio', 'total_unique_words']]
final_test_additional.to_csv('final_test_additional.csv', index=False)

In [0]:
final_test_w2v = test_df[['test_id', 'GoogleNews_norm_wmd', 'GoogleNews_wmd', 'GoogleNews_cosine_distance', 'GoogleNews_cityblock_distance', 'GoogleNews_jaccard_distance', 'GoogleNews_canberra_distance', 'GoogleNews_euclidean_distance', 'GoogleNews_minkowski_distance', 'GoogleNews_braycurtis_distance', 'GoogleNews_skew_q1vec', 'GoogleNews_skew_q2vec', 'GoogleNews_kur_q1vec', 'GoogleNews_kur_q2vec']]
final_test_w2v.to_csv('final_test_w2v.csv', index=False)

## Merge all feature files

In [0]:
final_train_basic_fuzz = pd.read_csv('final_train_basic_fuzz.csv')
final_train_additional = pd.read_csv('final_train_additional.csv')
final_train_w2v = pd.read_csv('final_train_w2v.csv')

In [0]:
final_test_basic_fuzz = pd.read_csv('final_test_basic_fuzz.csv')
final_test_additional = pd.read_csv('final_test_additional.csv')
final_test_w2v = pd.read_csv('final_test_w2v.csv')

In [0]:
final_train_features = final_train_basic_fuzz.merge(final_train_additional, on='id')
final_train_features = final_train_features.merge(final_train_w2v, on='id')
final_train_features.to_csv('final_train_features.csv', index=False)

In [0]:
final_test_features = final_test_basic_fuzz.merge(final_test_additional, on='test_id')
final_test_features = final_test_features.merge(final_test_w2v, on='test_id')
final_test_features.to_csv('final_test_features.csv', index=False)