# Presentation - Text Similarity (German texts)

### Course in Content Management, Text and Search Technologies

In [1]:
import itertools
import re
import heapq
import pandas as pd
import nltk
import spacy
from lxml.html import html_parser
from spacy.lang.de import German
import string
import numpy as np
from nltk.corpus import stopwords
from spacy.lemmatizer import Lemmatizer
stopwords = stopwords.words("german")
stopwords.append("dass")

nlp = spacy.load('de_core_news_sm')

### Create a little corpus

In [3]:
CORPUS = [
"Das rote Auto hält an der roten Ampel. Das rote Auto fährt zu schnell.", 
"Das schwarze Auto wurde rot eingefärbt, weil die Farbe rot lebendiger wirkt",
"Das schwarze Mofa fährt an der grünen Ampel durch, hält aber am roten Schild nicht und wird vom roten Auto angefahren",
"Morgen scheint die Sonne und wir fahren ans Meer"
]

CORPUS = [text.lower() for text in CORPUS]
CORPUS

['das rote auto hält an der roten ampel. das rote auto fährt zu schnell.',
 'das schwarze auto wurde rot eingefärbt, weil die farbe rot lebendiger wirkt',
 'das schwarze mofa fährt an der grünen ampel durch, hält aber am roten schild nicht und wird vom roten auto angefahren',
 'morgen scheint die sonne und wir fahren ans meer']

In [4]:
satz = "Das rote Auto hält an der roten Ampel. Das rote, schwarze Auto fährt zu schnell. Das schwarze Mofa ist langsam"
satz = satz.lower()
sent = nlp(satz)
lemmas = [word.lemma_ for word in sent if not word.is_punct]
lemmas

['der',
 'rote',
 'auto',
 'halten',
 'an',
 'der',
 'rot',
 'ampel',
 'der',
 'rote',
 'schwarze',
 'auto',
 'fahren',
 'zu',
 'schnellen',
 'der',
 'schwarze',
 'mofa',
 'sein',
 'langsam']

Lemmatizing failed for following adjectives/adverbs:

- rote 
- schwarze 

Right forms should be: "rot" and "schwarz".

In [5]:
corpus = CORPUS.copy()

## CountVectorizer with Tokens

#### Punctuation is removed from text (string.punctuation is used)

In [6]:
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

In [7]:
def get_tokens(text):
    doc = nlp(text)
    return [token.lemma_.lower().translate(remove_punct_dict) for token in doc if not token.is_punct or not token.like_num]

### Word frequency with CountVectorizer

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
start = 1
stop = 1

In [10]:
token_vectorizer = CountVectorizer(tokenizer=get_tokens, stop_words=stopwords, ngram_range=(start,stop))

In [11]:
token_vectorizer.fit_transform(CORPUS)

<4x22 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [12]:
print(token_vectorizer.vocabulary_)
print("\nAnzahl der N-Gramme: ", len(token_vectorizer.vocabulary_))

{'rote': 15, 'auto': 4, 'halten': 9, 'rot': 14, 'ampel': 1, '': 0, 'fahren': 6, 'schnellen': 18, 'schwarze': 19, 'einfärben': 5, 'farbe': 7, 'lebendig': 10, 'wirken': 21, 'mofa': 12, 'grün': 8, 'schild': 17, 'anfahren': 2, 'morgen': 13, 'scheinen': 16, 'sonne': 20, 'ans': 3, 'meer': 11}

Anzahl der N-Gramme:  22


## Term Frequency Matrix

In [13]:
tf_matrix = token_vectorizer.transform(CORPUS).toarray()

In [14]:
print(tf_matrix)

[[2 1 0 0 2 0 1 0 0 1 0 0 0 0 1 2 0 0 1 0 0 0]
 [1 0 0 0 1 1 0 1 0 0 1 0 0 0 2 0 0 0 0 1 0 1]
 [1 1 1 0 1 0 1 0 1 1 0 0 1 0 2 0 0 1 0 1 0 0]
 [0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0]]


In [15]:
tf_matrix.shape #(3 = Dokumente, 43 = Wörter)

(4, 22)

## Compute IDF and transform results to a TF-IDF Matrix

The result is a vector, whose components represent the IDF value for each word in the dictionary.

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidfTran = TfidfTransformer(norm="l2")
tfidfTran.fit(tf_matrix)
print("IDF- Score für die Wörter im Corpus: ´\n", tfidfTran.idf_)

IDF- Score für die Wörter im Corpus: ´
 [1.22314355 1.51082562 1.91629073 1.91629073 1.22314355 1.91629073
 1.22314355 1.91629073 1.91629073 1.51082562 1.91629073 1.91629073
 1.91629073 1.91629073 1.22314355 1.91629073 1.91629073 1.91629073
 1.91629073 1.51082562 1.91629073 1.91629073]


### Results test

In [17]:
import math
#Credits: https://sites.temple.edu/tudsc/2017/03/30/measuring-similarity-between-texts-in-python/
def idf(n,df):
    result = math.log((n+1.0)/(df+1.0)) + 1
    return result

anzahl_doc = 3
print ("IDF-Score für Wörter, die nur in 1 Dokument vorkommen: " + str(idf(anzahl_doc,1)))#1x 
print ("IDF-Score für Wörter, die in 2 Dokumenten vorkommen: " + str(idf(anzahl_doc,2)))#2x

IDF-Score für Wörter, die nur in 1 Dokument vorkommen: 1.6931471805599454
IDF-Score für Wörter, die in 2 Dokumenten vorkommen: 1.2876820724517808


### TF-IDF Matrix

Creation of a TF-IDF matrix. 

In [18]:
tfidf_matrix = tfidfTran.transform(tf_matrix)
#The transform()-method multiplies the TF-matrix from above by the diagonal IDF-Matrix. 
#Then it divides the TF-IDF score by the euclidean norm.
print (tfidf_matrix.toarray())

[[0.39743237 0.24545402 0.         0.         0.39743237 0.
  0.19871618 0.         0.         0.24545402 0.         0.
  0.         0.         0.19871618 0.6226546  0.         0.
  0.3113273  0.         0.         0.        ]
 [0.24011956 0.         0.         0.         0.24011956 0.37619368
  0.         0.37619368 0.         0.         0.37619368 0.
  0.         0.         0.48023911 0.         0.         0.
  0.         0.29659542 0.         0.37619368]
 [0.21619279 0.26704111 0.33870779 0.         0.21619279 0.
  0.21619279 0.         0.33870779 0.26704111 0.         0.
  0.33870779 0.         0.43238559 0.         0.         0.33870779
  0.         0.26704111 0.         0.        ]
 [0.         0.         0.         0.43003652 0.         0.
  0.27448674 0.         0.         0.         0.         0.43003652
  0.         0.43003652 0.         0.         0.43003652 0.
  0.         0.         0.43003652 0.        ]]


## Similarity

The result matrix from the computation above is then multiplied by its transpose matrix. The result is a matrix containing distance values (which should represent the "similarity" among pair of terms in the text). As a result the first 3 documents are similar to each other.

In [19]:
cos_similarity_matrix = (tfidf_matrix * tfidf_matrix.T).toarray()

In [20]:
print(cos_similarity_matrix)

[[1.         0.28629385 0.43181968 0.05454496]
 [0.28629385 1.         0.39067588 0.        ]
 [0.43181968 0.39067588 1.         0.05934205]
 [0.05454496 0.         0.05934205 1.        ]]


### Cosine-Similarity with sklearn

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

A_sparse = sparse.csr_matrix(tfidf_matrix)

In [23]:
#print(A_sparse)

similarities = cosine_similarity(A_sparse)
print(similarities)

[[1.         0.28629385 0.43181968 0.05454496]
 [0.28629385 1.         0.39067588 0.        ]
 [0.43181968 0.39067588 1.         0.05934205]
 [0.05454496 0.         0.05934205 1.        ]]


## TF-IDF-Vectorizer


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
TfidfVec = TfidfVectorizer(tokenizer=get_tokens, stop_words=stopwords, ngram_range=(1,2))


In [25]:
def cos_similarity(textlist):
    tfidf = TfidfVec.fit_transform(textlist)
    return (tfidf * tfidf.T).toarray()

In [26]:
cos_similarity(CORPUS)

array([[1.        , 0.13562957, 0.26402917, 0.02719766],
       [0.13562957, 1.        , 0.18141068, 0.        ],
       [0.26402917, 0.18141068, 1.        , 0.02900303],
       [0.02719766, 0.        , 0.02900303, 1.        ]])

### Sources / Inspiration: 
https://sites.temple.edu/tudsc/2017/03/30/measuring-similarity-between-texts-in-python/

http://blog.christianperone.com/2011/10/machine-learning-text-feature-extraction-tf-idf-part-ii/