In [105]:
! pip install fuzzywuzzy
! pip install python-Levenshtein

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [120]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from scipy import spatial
from fuzzywuzzy import fuzz
from sklearn.decomposition import TruncatedSVD
import gensim

# <font color='red'>Manual / Statistical Thresholding</font>
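Set a threshold on a distance or similarity metric: two documents are treated as similar when their score falls on the "similar" side of the threshold (below it for a distance, above it for a similarity). The right threshold depends on the data distribution.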

In [86]:
# Toy corpus shared by every similarity demo in this notebook.
d = [
    "Music is a universal language",
    "Music is a miracle",
    "Music is a universal feature of the human experience",
]
# Individual handles on the three documents, in corpus order.
d1, d2, d3 = d

## Hashing: Near Duplicate Detection
1. Build `MinHash` / `SimHash` hashes of the documents at the char / word level
2. Find the Hamming distance between the hashes of any two documents

## [Fuzzy Logic](https://hub.packtpub.com/use-tensorflow-and-nlp-to-detect-duplicate-quora-questions-tutorial/)

In [110]:
# Every fuzzywuzzy scorer to compare, in the order the results are printed.
scorers = (
    fuzz.QRatio,
    fuzz.partial_ratio,
    fuzz.WRatio,
    fuzz.partial_token_set_ratio,
    fuzz.partial_token_sort_ratio,
    fuzz.token_set_ratio,
    fuzz.token_sort_ratio,
)

for position, scorer in enumerate(scorers):
    # A dashed rule separates consecutive scorers (none before the first).
    if position > 0:
        print()
        print('-'*50)
        print()
    # Score the (d1, d2) pair, then the (d2, d3) pair.
    print(scorer(d1,d2))
    print(scorer(d2,d3))

68
46
--------------------------------------------------
72
72
--------------------------------------------------
86
86
--------------------------------------------------
100
100
--------------------------------------------------
67
56
--------------------------------------------------
71
71
--------------------------------------------------
55
34


## Jaccard Similarity

In [103]:
def get_jaccard_sim(str1, str2):
    """Return the Jaccard similarity of two strings' whitespace-separated tokens.

    The score is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    tokens produced by ``str.split()``. Tokens are compared exactly, so the
    comparison is case- and punctuation-sensitive.

    Args:
        str1: First document.
        str2: Second document.

    Returns:
        A float in ``[0.0, 1.0]``. Two documents that both contain no tokens
        (empty or whitespace-only) are considered identical and score ``1.0``.
    """
    a = set(str1.split())
    b = set(str2.split())
    union = a | b
    # Both token sets are empty: the ratio would be 0/0, so define it as a
    # perfect match instead of raising ZeroDivisionError.
    if not union:
        return 1.0
    return float(len(a & b)) / len(union)

# Jaccard similarity for the same two document pairs used in the other demos.
for left_doc, right_doc in ((d1, d2), (d2, d3)):
    print(get_jaccard_sim(left_doc, right_doc))

0.5
0.3


## TfIdf + Distance Metric

In [98]:
# Learn the vocabulary and IDF weights from the corpus, then vectorize it.
fitted = TfidfVectorizer().fit(d)
transformed = fitted.transform(d)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. It returns a NumPy array, so convert to a plain list to
# keep the printed vocabulary in the same format as before.
print(fitted.get_feature_names_out().tolist())

# Densify the sparse matrix once and unpack one TF-IDF row per document.
a1, a2, a3 = transformed.toarray()

['experience', 'feature', 'human', 'is', 'language', 'miracle', 'music', 'of', 'the', 'universal']


In [104]:
# scipy returns cosine *distance*; subtract from 1 to report similarity.
for vec_left, vec_right in ((a1, a2), (a2, a3)):
    print(1 - spatial.distance.cosine(vec_left, vec_right))

0.3549151836521609
0.21373347085583771


## TfIdf + SVD + Distance
### [Variation](https://hub.packtpub.com/use-tensorflow-and-nlp-to-detect-duplicate-quora-questions-tutorial/)

In [119]:
# Second collection of documents to compare against `d` (a copy here, as a
# stand-in). It gets its own name: rebinding `d1` -- a string used by the
# earlier cells -- to a list would make those cells fail when re-run.
d_other = d.copy()
corpus_all = d + d_other

# Both collections must be vectorized with the SAME vocabulary and IDF weights,
# otherwise their columns refer to different terms. Fit once on everything and
# share the fitted vectorizer under both names.
tfv_q1 = TfidfVectorizer().fit(corpus_all)
tfv_q2 = tfv_q1

q1_tfidf = tfv_q1.transform(d)        # TF-IDF matrix for the first collection
q2_tfidf = tfv_q2.transform(d_other)  # TF-IDF matrix for the second collection

# Likewise share ONE SVD projection so both sets of vectors live in the same
# reduced space. random_state pins the randomized solver for reproducible runs.
svd_q1 = TruncatedSVD(n_components=2, random_state=0).fit(tfv_q1.transform(corpus_all))
svd_q2 = svd_q1

question1_vectors = svd_q1.transform(q1_tfidf)
question2_vectors2 = svd_q2.transform(q2_tfidf)

# Cosine similarity between each document in `d` and its counterpart in
# `d_other` (scipy returns a distance, hence the `1 -`).
for vec_q1, vec_q2 in zip(question1_vectors, question2_vectors2):
    print(1 - spatial.distance.cosine(vec_q1, vec_q2))

## Embeddings + Distance
1. Get Vectors for each Word
2. Average / Concat / Median all the Embedding of each word in Sentence / Document
3. Calculate the distance between the two resulting sentence / document vectors

**Distances**
1. Word mover distance
2. Normalized word mover distance
3. Cosine distance between vectors of question1 and question2
4. Manhattan distance between vectors of question1 and question2
5. Jaccard similarity between vectors of question1 and question2
6. Canberra distance between vectors of question1 and question2
7. Euclidean distance between vectors of question1 and question2
8. Minkowski distance between vectors of question1 and question2
9. Braycurtis distance between vectors of question1 and question2
10. The skew of the vector for question1
11. The skew of the vector for question2
12. The kurtosis of the vector for question1
13. The kurtosis of the vector for question2

In [124]:
# NOTE(review): loading the embedding model is left disabled. The path looks like
# the pretrained Google News word2vec binary -- presumably skipped because the
# file must be downloaded separately; TODO confirm and document where to get it.
# model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

## Embeddings + TfIdf
1. Get Vectors for each Word
2. Multiply by the `idf` of each word
3. Average / Concat / Median all the Embedding of each word in Sentence / Document
4. Calculate the distance between the two resulting sentence / document vectors

# <font color='red'>Supervised / AI / ML</font>

## ML Model
1. Get Any / All of the features described in the methods above
2. Pass the features to any Classification Model with `0/1` for `Not Similar / Similar` Binary Classification

## Siamese Model -> Embedding -> Distance

1. Build a Model with 2 or 3 inputs. Any of `RNN / LSTM / GRU / Transformer / BERT`
2. Train Model on `Contrastive / Triplet Loss`
3. Extract Embedding for each Sentence
4. Calculate the distance between the two sentence embeddings

# <font color='red'>Unsupervised</font>

## Tfidf + Clustering

## ANY Embeddings + Clustering

## LDA / LSA
[1](https://towardsdatascience.com/latent-dirichlet-allocation-lda-9d1cd064ffa2)
[2](https://www.mygreatlearning.com/blog/understanding-latent-dirichlet-allocation/)
[3]()