## Build a data set of similarity metrics for string matching

In [19]:
# Import modules
from __future__ import division

import os
import sys
import pandas as pd
import numpy as np
import jellyfish
import string
import collections
import re
import nltk.corpus
import nltk.tokenize.punkt
import nltk.stem.snowball
import ngram

from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance as normed_dm_dist
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
# Raise the column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', 10000)

In [21]:
# Load the labelled string pairs; the first row of the file holds the
# column names.
df = pd.read_csv(
    'matched_strings.csv',
    encoding='utf8',
    header=0,
)

In [22]:
# Preview the first rows of the raw input to check the columns loaded as expected.
df.head()

Unnamed: 0,fuzzy_match_show_name,fuzzy_match_show_name_2,is_correct
0,#THROWBACKTHURSDAY,THROWBACKTHURSDAY VIDEOS,1
1,10 THINGS HATE ABOUT YO,10 THINGS HATE ABOUT YOU,1
2,10 THINGS HATE ABOUT YOU,10 THINGS HATE ABOUT YOU,1
3,10 THINGS HATE ABOUT YOU,WILD THINGS,0
4,100 GREATEST KIDS STARS,THE GREATEST,0


## Building out features

In [5]:
def make_tokens(str_1, str_2):
    """
    Tokenize str_1 and str_2 into lists of normalised tokens.

    Each string is split with `tokenizer`; every token is lower-cased and
    has leading/trailing punctuation stripped. Tokens that end up empty
    (i.e. consisted only of punctuation) are discarded.

    Returns a tuple (tokens_a, tokens_b) of lists, in input order.
    """
    def _normalised(text):
        # Lower-case and trim punctuation once per token, then filter out
        # anything that was reduced to the empty string.
        trimmed = (piece.lower().strip(string.punctuation)
                   for piece in tokenizer(text))
        return [piece for piece in trimmed if piece]

    return _normalised(str_1), _normalised(str_2)

In [6]:
def make_stems(tokens_a, tokens_b):
    """
    Stem every token in tokens_a and tokens_b with the module-level `stemmer`.

    Returns a tuple (stem_a, stem_b) of lists, preserving token order.
    """
    return (list(map(stemmer.stem, tokens_a)),
            list(map(stemmer.stem, tokens_b)))

In [7]:
def jaccard_similarity(str_1, str_2):
    """
    Implements Jaccard similarity between input strings str_1, str_2.

    Both strings are tokenized and stemmed; the score is the number of
    distinct stems the two strings share divided by the number of distinct
    stems appearing in either string.

    Returns a float in [0, 1]. When neither string yields any token (for
    example both consist only of punctuation) there is nothing to compare,
    so 0.0 is returned instead of dividing by zero.
    """
    tokens_a, tokens_b = make_tokens(str_1, str_2)
    stem_a, stem_b = make_stems(tokens_a, tokens_b)

    unique_a, unique_b = set(stem_a), set(stem_b)
    combined = unique_a | unique_b

    # Guard the empty-union case: previously this raised ZeroDivisionError.
    if not combined:
        return 0.0

    return len(unique_a & unique_b) / float(len(combined))

In [30]:
def phonetic_matching_str(str_1, str_2):
    """
    Scores how alike str_1 and str_2 sound.

    Each string is tokenized and stemmed, the stems are joined with spaces
    and encoded with jellyfish's Metaphone; the two phonetic codes are then
    compared with `jellyfish.jaro_distance` and that score is returned.

    NOTE(review): despite the name, the score behaves like a similarity --
    identical strings score 1.0 in the notebook output.
    """
    tokens_a, tokens_b = make_tokens(str_1, str_2)
    stem_a, stem_b = make_stems(tokens_a, tokens_b)

    phonetic_codes = []
    for stems in (stem_a, stem_b):
        # jellyfish is handed unicode text, so convert before and after
        # the Metaphone encoding (Python 2 `unicode`).
        phrase = unicode(' '.join(stems))
        phonetic_codes.append(unicode(jellyfish.metaphone(phrase)))

    code_a, code_b = phonetic_codes
    return jellyfish.jaro_distance(code_a, code_b)

In [9]:
def n_gram_matching_str(str_1, str_2, n=3):
    """
    Implements N-gram similarity between strings str_1 and str_2.

    Both strings are tokenized (lower-cased, punctuation stripped), the
    tokens are re-joined with single spaces, and the two normalised strings
    are compared with `ngram.NGram.compare`.

    Parameters:
        str_1, str_2: the strings to compare.
        n: size of the N-grams; defaults to 3 (trigrams), which is the
           value this function previously hard-coded.

    Returns the score produced by `ngram.NGram.compare`.
    """
    tokens_a, tokens_b = make_tokens(str_1, str_2)

    joined_a = ' '.join(tokens_a)
    joined_b = ' '.join(tokens_b)

    return ngram.NGram.compare(joined_a, joined_b, N=n)

In [10]:
def DL_distance(str_1, str_2):
    """
    Computes a Damerau-Levenshtein based similarity between str_1 and str_2.

    The strings are tokenized and stemmed, the stems are joined with spaces,
    and the normalized Damerau-Levenshtein distance between the two joined
    strings is subtracted from 1, so the value returned is a similarity
    (1 - distance) rather than a raw distance.
    """
    stems = make_stems(*make_tokens(str_1, str_2))
    phrase_a, phrase_b = [' '.join(stem_list) for stem_list in stems]

    return 1 - normed_dm_dist(phrase_a, phrase_b)

In [17]:
def fuzzy_distance(str_1, str_2):
    """
    Returns fuzzywuzzy's `fuzz.ratio` score for the two strings, divided by 100.

    The strings are tokenized and stemmed first, and the stems are joined
    with spaces before being compared.
    """
    stem_a, stem_b = make_stems(*make_tokens(str_1, str_2))

    raw_score = fuzz.ratio(' '.join(stem_a), ' '.join(stem_b))

    # True division: the file enables `from __future__ import division`.
    return raw_score / 100

In [11]:
def tokenizer(str_):
    """
    Split str_ into tokens using NLTK's `wordpunct_tokenize`.

    Returns whatever `wordpunct_tokenize` returns for the input string.
    """
    return nltk.tokenize.wordpunct_tokenize(str_)

In [12]:
# Stop-word list: NLTK's English stop words, extended with every single
# punctuation character and the empty string.
# NOTE: this rebinds the name `stopwords` imported from nltk.corpus above.
stopwords = nltk.corpus.stopwords.words('english')
stopwords += list(string.punctuation) + ['']

In [13]:
# Define lemmatizer and stemmer.
# The stemmer is reached through `nltk.stem.snowball`, the module this file
# imports explicitly, rather than through the incidental top-level alias
# `nltk.snowball`.
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stemmer = nltk.stem.snowball.SnowballStemmer('english')

## Build out the features on the original input dataset.

In [23]:
# Give the two name columns the short names used by the feature cells.
# `index=str` additionally maps every index label through `str`.
df.rename(
    columns={
        'fuzzy_match_show_name': 'str_1',
        'fuzzy_match_show_name_2': 'str_2',
    },
    index=str,
    inplace=True,
)

In [24]:
# Row-wise fuzzy ratio (scaled by 1/100) between the two name columns.
df['fuzzy_similarity'] = df.apply(
    lambda pair: fuzzy_distance(pair['str_1'], pair['str_2']),
    axis=1,
)

In [26]:
# Row-wise Damerau-Levenshtein similarity (1 - normalized distance).
df['DL_similarity'] = df.apply(
    lambda pair: DL_distance(pair['str_1'], pair['str_2']),
    axis=1,
)

In [27]:
# Row-wise Jaccard similarity over the stemmed token sets.
df['jaccard_similarity'] = df.apply(
    lambda pair: jaccard_similarity(pair['str_1'], pair['str_2']),
    axis=1,
)

In [28]:
# Row-wise trigram comparison score of the normalised strings.
df['ngram_compare'] = df.apply(
    lambda pair: n_gram_matching_str(pair['str_1'], pair['str_2']),
    axis=1,
)

In [31]:
# Row-wise phonetic score (Jaro over the Metaphone codes of the stems).
df['phonetic_distance'] = df.apply(
    lambda pair: phonetic_matching_str(pair['str_1'], pair['str_2']),
    axis=1,
)

In [32]:
# Preview the first rows again, now with the five similarity columns attached.
df.head()

Unnamed: 0,str_1,str_2,is_correct,fuzzy_similarity,DL_similarity,jaccard_similarity,ngram_compare,phonetic_distance
0,#THROWBACKTHURSDAY,THROWBACKTHURSDAY VIDEOS,1,0.85,0.73913,0.5,0.607143,0.909091
1,10 THINGS HATE ABOUT YO,10 THINGS HATE ABOUT YOU,1,0.98,0.956522,0.666667,0.821429,1.0
2,10 THINGS HATE ABOUT YOU,10 THINGS HATE ABOUT YOU,1,1.0,1.0,1.0,1.0,1.0
3,10 THINGS HATE ABOUT YOU,WILD THINGS,0,0.36,0.173913,0.166667,0.147059,0.576984
4,100 GREATEST KIDS STARS,THE GREATEST,0,0.55,0.428571,0.2,0.21875,0.619048


In [34]:
# Persist the labelled pairs together with all computed similarity features;
# this should be the whole data set that we need for classification.
# The DataFrame index is not written to the file.
df.to_csv(
    'full_matching_data_set.csv',
    encoding='utf8',
    index=False,
)