In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np
import string
from bs4 import BeautifulSoup
import requests
import re

from wordcloud import STOPWORDS
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

import gensim
from gensim.matutils import softcossim
from gensim import corpora
import gensim.downloader as api



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\edouc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\edouc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Cosinus Similarity
Cosinus Similarity, incelenen dökümanlardaki kelime sayılarına göre oluşturulmuş vektörlerin 3 boyutlu uzaydaki konumuna göre, aralarındaki açının kosinüs değerini temel alarak benzerliklerini hesaplamaya dayanır. Kelimelerin anlamlarına odaklanmaz. İki belgenin vektörleri arasındaki açı azaldıkça benzerlikleri artar. Bu uygulamayla 'computer network', 'communication protocol', 'telecommunications network', 'network topology', 'transmission medium' konulu 5 makalenin cosine similarity'lerini inceleyeceğiz.

In [2]:
# Links of pages whose content will be compared
urls = [
    'https://en.wikipedia.org/wiki/Computer_simulation',
    'https://en.wikipedia.org/wiki/Agent-based_model',
    'https://en.wikipedia.org/wiki/Computational_model',
]

In [5]:
document = []

#extracting data from these pages
txt = ''
for link in urls:
    response = requests.get(link)
    content = response.content
    soup = BeautifulSoup(response.content, 'html.parser')
    for t in soup.find_all('p'):
        txt += t.get_text()
    document.append(txt)

In [6]:
#clearing the texts
i = 0
for text in document:

    text = text.encode("ascii", errors="ignore").decode()

    text = text.lower()

    for character in text:
        if character in string.punctuation:
            text = text.replace(character, "")
            
    pattern = r'[0-9]'
    new_text = re.sub(pattern, '', text)

    lemmatizer = WordNetLemmatizer()
    text_tokens = word_tokenize(new_text)
    tokens_without_sw = [word for word in text_tokens if word not in STOPWORDS]

    lemmatized_words = ""
    for item in tokens_without_sw:
        lemmatized_words += lemmatizer.lemmatize(item) + " "
    new_text = lemmatized_words
    document[i] = new_text
    i += 1

Aşağıdaki tabloda makaleler içerisinde geçen her bir kelimenin hangi makalede kaç kere geçtiğini görüyoruz.

In [7]:
count_vectorizer = CountVectorizer(stop_words='english')

sparse_matrix = count_vectorizer.fit_transform(document)

# word frequencies matrix of the documents
doc_term_matrix = sparse_matrix.todense()

df = pd.DataFrame(doc_term_matrix, columns=count_vectorizer.get_feature_names(),
                  index=['Computer_simulation', 'Agent-based_model', 'Computational_model'])

df

Unnamed: 0,ability,able,abm,ac,academic,accelerate,accept,access,according,accounted,...,winner,wire,wireless,word,work,workforce,working,world,year,yield
Computer_simulation,1,2,0,1,0,1,1,0,1,1,...,0,1,0,0,0,0,0,3,1,0
Agent-based_model,3,2,35,1,1,1,1,1,1,1,...,1,1,1,3,3,1,1,7,2,1
Computational_model,3,2,35,1,1,1,1,1,1,1,...,1,1,1,3,3,1,1,7,2,1


Bu makalelerin cosine similarity hesaplamasını aşağıda yapıyoruz. Ve bu hesaplama sonucunda en büyük benzerliğin computer architecture ve computer vision arasında olduğunu gözlemliyoruz. Bu makaleler birbirine benzer konulardan oluştuğu için benzerlik oranlarının da hemen hemen aynı ve yüksek olduğunu söyleyebiliriz.

In [8]:
# Compute Cosine Similarity

cossim_matrix = cosine_similarity(df, df)
names=['Computer_simulation', 'Agent-based_model', 'Computational_model']

df2 = pd.DataFrame(cossim_matrix, columns=names, index=names)
df2

Unnamed: 0,Computer_simulation,Agent-based_model,Computational_model
Computer_simulation,1.0,0.855621,0.856777
Agent-based_model,0.855621,1.0,0.998965
Computational_model,0.856777,0.998965,1.0


Aşağıda gördüğümüz tablo ise aynı hesaplamanın tf-idf yöntemi ile yapılmasına dayanarak oluşturulur. Buradaki tf "term-frequency" anlamına gelir ve kelimelerin tekrar etme durumuna bağlıdır. Bununla beraber bakılan idf değeri ise "inverse document frequency" değeridir ve bütün dökümanlara odaklanır. Bir kelime diğer dökümanlarda ne kadar çok tekrarlanıyorsa idf değeri o kadar azalır. Bütün olarak tf-idf değeri bir kelimenin belgeyle ne kadar alakalı olduğunu tespit eden bir ölçüttür. Konu ile alakası olmayan ama belge içerisinde sık sık tekrar eden sözcükleri tespit etmede kullanılabilir.

*tf-idf puanı ne kadar yüksek olursa kelime o kadar belgelerle alakalıdır.*

In [9]:
tfidf_vectorizer = TfidfVectorizer()
sparse_matrix = tfidf_vectorizer.fit_transform(document)


doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=tfidf_vectorizer.get_feature_names(), 
                  index=['Computer_simulation', 'Agent-based_model', 'Computational_model'])
df

Unnamed: 0,ability,able,abm,ac,academic,accelerate,accept,access,according,accounted,...,wire,wireless,within,word,work,workforce,working,world,year,yield
Computer_simulation,0.008104,0.016208,0.0,0.008104,0.0,0.008104,0.008104,0.0,0.008104,0.008104,...,0.008104,0.0,0.008104,0.0,0.0,0.0,0.0,0.024312,0.008104,0.0
Agent-based_model,0.012058,0.008038,0.181143,0.004019,0.005176,0.004019,0.004019,0.005176,0.004019,0.004019,...,0.004019,0.005176,0.020096,0.015527,0.015527,0.005176,0.005176,0.028135,0.008038,0.005176
Computational_model,0.011664,0.007776,0.17523,0.003888,0.005007,0.003888,0.003888,0.005007,0.003888,0.003888,...,0.003888,0.005007,0.01944,0.01502,0.01502,0.005007,0.005007,0.027216,0.007776,0.005007


## Birbirinden farklı ama yine bilgisayar ile alakalı konuları karşılaştırırsak;

In [10]:
urls2 = [
    'https://en.wikipedia.org/wiki/Computer_simulation',
    'https://en.wikipedia.org/wiki/Computer_network',
    'https://en.wikipedia.org/wiki/Artificial_intelligence',
]

document2 = []

#extracting data from these pages
txt = ''
for link in urls2:
    response = requests.get(link)
    content = response.content
    soup = BeautifulSoup(response.content, 'html.parser')
    for t in soup.find_all('p'):
        txt += t.get_text()
    document2.append(txt)

#clearing the texts
i = 0
for text in document2:

    text = text.encode("ascii", errors="ignore").decode()

    text = text.lower()

    for character in text:
        if character in string.punctuation:
            text = text.replace(character, "")
            
    pattern = r'[0-9]'
    new_text = re.sub(pattern, '', text)

    lemmatizer = WordNetLemmatizer()
    text_tokens = word_tokenize(new_text)
    tokens_without_sw = [word for word in text_tokens if word not in STOPWORDS]

    lemmatized_words = ""
    for item in tokens_without_sw:
        lemmatized_words += lemmatizer.lemmatize(item) + " "
    new_text = lemmatized_words
    document2[i] = new_text
    i += 1

In [11]:
count_vectorizer = CountVectorizer(stop_words='english')

sparse_matrix = count_vectorizer.fit_transform(document2)

# word frequencies matrix of the documents
doc_term_matrix = sparse_matrix.todense()

df = pd.DataFrame(doc_term_matrix, columns=count_vectorizer.get_feature_names(),
                  index=['Computer_simulation', 'Computer_networks', 'Artificial_intelligence'])
df

Unnamed: 0,aaai,abandoned,ability,able,ac,academia,academic,accelerate,accelerating,accept,...,wrote,yahoo,yang,year,young,youtube,yuval,zdnet,zrtp,zuckerberg
Computer_simulation,0,0,1,2,1,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
Computer_networks,0,0,5,2,1,0,3,1,0,1,...,0,1,0,1,0,0,0,0,1,0
Artificial_intelligence,2,2,14,8,1,1,6,1,1,1,...,1,1,1,9,1,1,1,1,1,1


Benzerlik oranlarının önceki karşılaştırdığımız makalelere göre çok daha düşük olduğunu görebiliriz. Ancak bu fark anlamsal farklardan değil tekrar eden ortak kelimelerin azalmasından kaynaklanmaktadır.

In [12]:
cossim_matrix = cosine_similarity(df, df)
names=['Computer_simulation', 'Computer_networks', 'Artificial_intelligence']

df2 = pd.DataFrame(cossim_matrix, columns=names, index=names)
df2

Unnamed: 0,Computer_simulation,Computer_networks,Artificial_intelligence
Computer_simulation,1.0,0.501937,0.473006
Computer_networks,0.501937,1.0,0.874097
Artificial_intelligence,0.473006,0.874097,1.0


# Soft Cosine Similarity

Soft cosinus similarity, cosinus similarity'den farklı olarak kelimelerin anlamlarını da dikkate alır. Bu sayede birbiri ile benzer konulardaki  dökümanları karşılaştırırken daha yüksek benzerlik oranları, birbiri ile alakasız dökümanları karşılaştırırken daha düşük benzerlik oranları elde edebiliriz. 


Ama öncelikle bu çalışmaya başlamadan önce, kelimelerin vektörlerini elde edebilmek için, çalışmalarımı yaparken faydalandığım internet sitesinde önerilen "Fast-text" word embedding modelini indireceğim.

In [13]:
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')

In [15]:
text1 = document2[0].split()
text2 = document2[1].split()
text3 = document2[2].split()

# Prepare a dictionary and a corpus.
documents = [text1, text2, text3]
dictionary = corpora.Dictionary(documents)

# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary)

# Convert the sentences into bag-of-words vectors.
text1 = dictionary.doc2bow(text1)
text2 = dictionary.doc2bow(text2)
text3 = dictionary.doc2bow(text3)

Computer simulation, computer networks ve artificial intelligence makalelerinin, sof cosine similarity ile karşılaştırıldığında anlamsal olarak benzerlikler dikkate alındığı için benzerlik oranlarının cosine similarity'ye göre çok daha yüksek olduğunu görebiliyoruz.

In [16]:
sentences = [text1, text2, text3]

len_array = np.arange(len(sentences))
xx, yy = np.meshgrid(len_array, len_array)
cossim_mat = pd.DataFrame([[round(softcossim(sentences[i],sentences[j], similarity_matrix) ,2) for i, j in zip(x,y)] for y, x in zip(xx, yy)],
                          columns=names, index=names)

    
cossim_mat

Unnamed: 0,Computer_simulation,Computer_networks,Artificial_intelligence
Computer_simulation,1.0,0.64,0.66
Computer_networks,0.64,1.0,0.92
Artificial_intelligence,0.66,0.92,1.0


## Yararlandığım Kaynaklar;

https://www.machinelearningplus.com/nlp/cosine-similarity/


https://en.wikipedia.org/wiki/Computer_simulation
https://en.wikipedia.org/wiki/Agent-based_model
https://en.wikipedia.org/wiki/Computational_model
https://en.wikipedia.org/wiki/Computer_simulation
https://en.wikipedia.org/wiki/Computer_network
https://en.wikipedia.org/wiki/Artificial_intelligence