In [1]:
%matplotlib inline

# Chapter06. Text Similarity and Clustering

# Similarity Measure

참고 : 
- A. Huang, “Similarity Measures for Text Document Clustering.”

Consider a distance measure *d* and two entities (say they are documents in our context) *x* and *y*.

<br/>

The distance between *x* and *y*, which is used to determine the degree of similarity between them, can be written as *d*(*x*, *y*). The measure *d* can be called a *distance metric of similarity* if and only if it satisfies the following four conditions:

1. The distance measured between any two entities, say *x* and *y*, must always be non-negative, that is, $d(x, y) \ge 0$.
2. The distance between two entities is zero if and only if they are identical, that is, $d(x, y) = 0 \iff x = y$.
3. This distance measure should always be symmetric, which means that the distance from *x* to *y* is always the same as the distance from *y* to *x*. Mathematically this is represented as $d(x, y) = d(y, x)$.
4. This distance measure should satisfy the *triangle inequality* property, which can be mathematically represented as $d(x, z) \le d(x, y) + d(y, z)$.

# Text Normalization

In [1]:
from contractions import CONTRACTION_MAP
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from html.parser import HTMLParser
import unicodedata

In [2]:
# NLTK's English stopwords followed by extra words to filter out.
# NOTE: 'tell' appears twice in the extra words; harmless for membership tests.
stopword_list = nltk.corpus.stopwords.words('english') + [
    'mr', 'mrs', 'come', 'go', 'get', 'tell', 'listen',
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'zero',
    'join', 'find', 'make', 'say', 'ask', 'tell', 'see', 'try', 'back', 'also',
]

**주의**: HTMLParser import 가 안되는 경우 **normalization.py** 파일을 아래와 같이 변경해 준다.

from html.parser import HTMLParser

In [3]:
# Shared HTMLParser instance.
# NOTE(review): HTMLParser.unescape() was removed in Python 3.9; html.unescape()
# is the replacement for decoding HTML entities — confirm the target interpreter.
html_parser = HTMLParser()

In [4]:
from normalization import tokenize_text, expand_contractions, lemmatize_text, remove_special_characters, remove_stopwords

In [5]:
# from normalization import keep_text_characters

import re

def keep_text_characters(text):
    """Drop every token of ``text`` that contains no ASCII letter.

    The text is split with ``tokenize_text``; tokens in which ``[a-zA-Z]``
    matches somewhere are kept in their original order and joined back
    together with single spaces.
    """
    alphabetic_tokens = [
        token for token in tokenize_text(text)
        if re.search('[a-zA-Z]', token)
    ]
    return ' '.join(alphabetic_tokens)

In [10]:
# movie_data.csv: The Godfather
leo_text = '''
In late summer 1945, guests are gathered for the wedding reception of Don Vito Corleone's daughter Connie (Talia Shire) and Carlo Rizzi (Gianni Russo). Vito (Marlon Brando), the head of the Corleone Mafia family, is known to friends and associates as ""Godfather."" He and Tom Hagen (Robert Duvall), the Corleone family lawyer, are hearing requests for favors because, according to Italian tradition, ""no Sicilian can refuse a request on his daughter's wedding day.
'''
# Filter leo_text down to the tokens that contain at least one letter, then
# display the result as the cell output.
text = keep_text_characters(leo_text)
text

"In late summer guests are gathered for the wedding reception of Don Vito Corleone 's daughter Connie Talia Shire and Carlo Rizzi Gianni Russo Vito Marlon Brando the head of the Corleone Mafia family is known to friends and associates as Godfather He and Tom Hagen Robert Duvall the Corleone family lawyer are hearing requests for favors because according to Italian tradition no Sicilian can refuse a request on his daughter 's wedding day"

In [11]:
# from normalization import normalize_corpus
def normalize_corpus(corpus, lemmatize=True,
                     only_text_chars=False,
                     tokenize=False):
    """Normalize every document in ``corpus`` and return the results as a list.

    Each document is processed in this order: HTML entities are unescaped,
    contractions are expanded with ``CONTRACTION_MAP``, the text is either
    passed through ``lemmatize_text`` or lower-cased, then
    ``remove_special_characters`` and ``remove_stopwords`` are applied.

    Args:
        corpus: iterable of document strings.
        lemmatize: if True, run ``lemmatize_text``; otherwise only lower-case
            the text.
        only_text_chars: if True, additionally apply ``keep_text_characters``.
        tokenize: if True, each result is the output of ``tokenize_text``
            instead of a string.

    Returns:
        A list with one normalized entry per input document, in input order.
    """
    # html.unescape() replaces HTMLParser.unescape(), which was removed in
    # Python 3.9.  Importing it here also keeps the function independent of
    # any parser object created in another cell.
    from html import unescape

    normalized_corpus = []
    for text in corpus:
        text = unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if only_text_chars:
            text = keep_text_characters(text)
        if tokenize:
            text = tokenize_text(text)
        normalized_corpus.append(text)

    return normalized_corpus

In [12]:
# toy_corpus from document_similarity.py
# Nine short documents; the cells below normalize them and fit the vectorizer on them.
toy_corpus = [
'The sky is blue',
'The sky is blue and beautiful',
'Look at the bright blue sky!',
'Python is a great Programming language',
'Python and Java are popular Programming languages',
'Among Programming languages, both Python and Java are the most used in Analytics',
'The fox is quicker than the lazy dog',
'The dog is smarter than the fox',
'The dog, fox and cat are good friends'
]

In [21]:
# Three query documents; further down they are normalized and transformed with
# the vectorizer that was fitted on toy_corpus.
query_docs = ['The fox is definitely smarter than the dog',
            'Java is a static typed programming language unlike Python',
            'I love to relax under the beautiful blue sky!']  

In [13]:
# tokenize=True makes normalize_corpus return each document as the output of
# tokenize_text rather than as a single string.
norm_corpus = normalize_corpus(toy_corpus, tokenize=True)
norm_corpus

  


[['sky', 'blue'],
 ['sky', 'blue', 'beautiful'],
 ['look', 'bright', 'blue', 'sky'],
 ['python', 'great', 'programming', 'language'],
 ['python', 'java', 'popular', 'programming', 'language'],
 ['among', 'programming', 'language', 'python', 'java', 'use', 'analytics'],
 ['fox', 'quick', 'lazy', 'dog'],
 ['dog', 'smarter', 'fox'],
 ['dog', 'fox', 'cat', 'good', 'friend']]

In [14]:
# normalize_corpus iterates over documents, so the single leo_text string is
# wrapped in a one-element list first.
leo_corpus = [leo_text]
norm_leo_corpus = normalize_corpus(leo_corpus, tokenize=True)
norm_leo_corpus

  


[['late',
  'summer',
  '1945',
  'guest',
  'gather',
  'wedding',
  'reception',
  'vito',
  'corleones',
  'daughter',
  'connie',
  'talia',
  'shire',
  'carlo',
  'rizzi',
  'gianni',
  'russo',
  'vito',
  'marlon',
  'brando',
  'head',
  'corleone',
  'mafia',
  'family',
  'know',
  'friend',
  'associate',
  'godfather',
  'tom',
  'hagen',
  'robert',
  'duvall',
  'corleone',
  'family',
  'lawyer',
  'hearing',
  'request',
  'favor',
  'accord',
  'italian',
  'tradition',
  'sicilian',
  'refuse',
  'request',
  'daughter',
  'wedding',
  'day']]

# Feature Extraction

feature types:
- Bag of Words features: term frequency (`'frequency'`) or binary occurrence (`'binary'`),
- TF-IDF–based features (`'tfidf'`).

parameters: 

- ngram_range : bigrams, trigrams, and so on.
- min_df : [0.0, 1.0], lower bound document frequency threshold value.
- max_df : [0.0, 1.0], upper bound document frequency threshold value.

In [16]:
# from utils import build_feature_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def build_feature_matrix(documents, feature_type='frequency', ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    """Fit a scikit-learn text vectorizer on ``documents`` and vectorize them.

    Args:
        documents: the documents handed to the vectorizer's ``fit_transform``.
        feature_type: ``'binary'``, ``'frequency'`` or ``'tfidf'``; letter case
            and surrounding whitespace are ignored.
        ngram_range: forwarded unchanged to the vectorizer.
        min_df: forwarded unchanged to the vectorizer.
        max_df: forwarded unchanged to the vectorizer.

    Returns:
        ``(vectorizer, feature_matrix)``: the fitted vectorizer and the
        ``fit_transform`` result cast to float.

    Raises:
        Exception: if ``feature_type`` is not one of the supported values.
    """
    kind = feature_type.lower().strip()
    if kind not in ('binary', 'frequency', 'tfidf'):
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency','tfidf'")

    # Options common to both vectorizer classes.
    shared_options = dict(min_df=min_df, max_df=max_df, ngram_range=ngram_range)
    if kind == 'tfidf':
        vectorizer = TfidfVectorizer(**shared_options)
    else:
        # 'binary' and 'frequency' differ only in the CountVectorizer binary flag.
        vectorizer = CountVectorizer(binary=(kind == 'binary'), **shared_options)

    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix

In [18]:
# tokenize is left at its default (False), so each normalized document stays a
# single string.
# NOTE: this rebinds norm_corpus, which an earlier cell set to lists of tokens.
norm_corpus = normalize_corpus(toy_corpus, lemmatize=True)

# Fit a count-based ('frequency') vectorizer on the corpus; corpus_features is
# the float-cast fit_transform result returned by build_feature_matrix.
vectorizer, corpus_features = build_feature_matrix(norm_corpus, feature_type='frequency')

  


In [24]:
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.0,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [26]:
corpus_features

<9x22 sparse matrix of type '<class 'numpy.float64'>'
	with 37 stored elements in Compressed Sparse Row format>

In [27]:
# Normalize the queries with the same settings used for the corpus.
norm_query_docs =  normalize_corpus(query_docs, lemmatize=True)  

# transform() (not fit_transform) is called on the vectorizer already fitted
# on the corpus.
# NOTE: unlike build_feature_matrix, no .astype(float) is applied here, so the
# displayed matrix is int64 while corpus_features is float64.
query_docs_features = vectorizer.transform(norm_query_docs)

  


In [28]:
query_docs_features

<3x22 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [29]:
# Print the feature row of each query document, one row per iteration.
# `doc` is not used in the loop body; only `index` selects the row.
for index, doc in enumerate(query_docs):
    doc_features = query_docs_features[index]
    print(doc_features)

  (0, 6)	1
  (0, 7)	1
  (0, 20)	1
  (0, 11)	1
  (0, 12)	1
  (0, 16)	1
  (0, 17)	1
  (0, 2)	1
  (0, 3)	1
  (0, 19)	1
