<b>Count Vectorizer, Tfidf Vectorizer</b>

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import pandas as pd

def getDataframe(fileName):
    """Load a semicolon-separated, UTF-8 encoded CSV and add a zeroed score column.

    Parameters
    ----------
    fileName : str or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed rows, with the ``puan`` column set to ``0.0`` on every row.
    """
    frame = pd.read_csv(fileName, encoding="utf8", sep=";")
    # Every row starts with a score of zero; similarityProcess overwrites
    # this column with the real similarity values later on.
    frame.loc[:, 'puan'] = 0.0
    return frame

def get_count_vector(p_documents, p_maxdf, p_mindf, p_ngram_range):
    """Fit a ``CountVectorizer`` on the documents and vectorise them.

    Parameters
    ----------
    p_documents : iterable of str
        Raw document texts to fit on and transform.
    p_maxdf, p_mindf
        Forwarded unchanged as the vectorizer's ``max_df`` / ``min_df``.
    p_ngram_range : tuple
        Forwarded unchanged as the vectorizer's ``ngram_range``.

    Returns
    -------
    tuple
        ``(document_term_matrix, fitted_vectorizer)``. The fitted vectorizer
        is returned so the caller can transform a query with the same
        vocabulary.
    """
    count_vectorizer = CountVectorizer(
        ngram_range=p_ngram_range,
        min_df=p_mindf,
        max_df=p_maxdf,
    )
    return count_vectorizer.fit_transform(p_documents), count_vectorizer

def get_tfidf_vector(p_documents, p_maxdf, p_mindf, p_ngram_range):
    """Fit a ``TfidfVectorizer`` on the documents and vectorise them.

    Parameters
    ----------
    p_documents : iterable of str
        Raw document texts to fit on and transform.
    p_maxdf, p_mindf
        Forwarded unchanged as the vectorizer's ``max_df`` / ``min_df``.
    p_ngram_range : tuple
        Forwarded unchanged as the vectorizer's ``ngram_range``.

    Returns
    -------
    tuple
        ``(tfidf_matrix, fitted_vectorizer)``. The fitted vectorizer is
        returned so the caller can transform a query with the same
        vocabulary and weights.
    """
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=p_ngram_range,
        min_df=p_mindf,
        max_df=p_maxdf,
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(p_documents)
    return tfidf_matrix, tfidf_vectorizer

def similarityProcess(p_query, p_data, p_vector, p_vectorizer):
    """Score every document against a query and rank them by cosine similarity.

    Parameters
    ----------
    p_query : str
        Query text; vectorised with ``p_vectorizer.transform``.
    p_data : pandas.DataFrame
        One row per document, in the same order as the rows of ``p_vector``.
        Must contain the ``sira`` and ``makale_no`` columns. Its ``puan``
        column is overwritten in place with the similarity scores.
    p_vector : matrix with one row per document
        Document vectors to compare the query against.
    p_vectorizer : object exposing ``transform``
        Fitted vectorizer used to turn the query into a vector.

    Returns
    -------
    pandas.DataFrame
        The ``sira``, ``makale_no`` and ``puan`` columns of all documents,
        sorted by ``puan`` in descending order. The ten best rows are also
        printed.

    Raises
    ------
    ValueError
        If the number of similarity scores differs from the number of rows
        in ``p_data``.
    """
    query_vector = p_vectorizer.transform([p_query])
    # cosine_similarity returns one row per query; there is a single query.
    scores = cosine_similarity(query_vector, p_vector)[0]

    if len(scores) != len(p_data):
        raise ValueError(
            "p_vector has %d rows but p_data has %d rows"
            % (len(scores), len(p_data))
        )

    # Assign by position in a single step. Looking scores up by index label
    # (as a per-row loop would) is only correct for a default 0..n-1 index.
    p_data['puan'] = scores

    ranked = p_data.sort_values(by='puan', ascending=False)
    result = ranked[['sira', 'makale_no', 'puan']]
    print(result.head(10))
    return result

def _rank_documents_to_csv(p_file_name, p_query, p_max_df, p_min_df, p_ngram_range,
                           p_build_vector, p_result_suffix):
    """Shared pipeline: load the CSV, vectorise ``icerik``, rank against the query, save.

    ``p_build_vector`` is one of ``get_count_vector`` / ``get_tfidf_vector``;
    ``p_result_suffix`` is appended to the input path (minus a trailing
    ``.csv``) to form the output path.
    """
    data = getDataframe(p_file_name)
    # A blank 'icerik' cell is read as NaN, which the vectorizers cannot
    # process; treat it as an empty document instead of failing.
    documents = data['icerik'].fillna("").tolist()
    vector, vectorizer = p_build_vector(documents, p_max_df, p_min_df, p_ngram_range)
    result_data = similarityProcess(p_query, data, vector, vectorizer)

    # Strip only a trailing ".csv". A plain str.replace leaves the name
    # unchanged when there is no ".csv" in it, which would make to_csv
    # overwrite the input file, and it rewrites every ".csv" in the path.
    if p_file_name.lower().endswith(".csv"):
        stem = p_file_name[:-len(".csv")]
    else:
        stem = p_file_name
    result_data.to_csv(stem + p_result_suffix, index=False)


def getSimilarDocsByCountVectorizer(p_file_name, p_query, p_max_df, p_min_df, p_ngram_range):
    """Rank the documents of a CSV against a query using raw term counts.

    Reads ``p_file_name``, vectorises its ``icerik`` column with
    ``get_count_vector``, prints the ten most similar documents and writes
    the full ranking next to the input as ``<name>_count_vec_result.csv``.

    Parameters
    ----------
    p_file_name : str
        Path of the semicolon-separated input CSV.
    p_query : str
        Query text to compare every document against.
    p_max_df, p_min_df, p_ngram_range
        Forwarded to ``get_count_vector``.
    """
    _rank_documents_to_csv(p_file_name, p_query, p_max_df, p_min_df, p_ngram_range,
                           get_count_vector, "_count_vec_result.csv")


def getSimilarDocsByTfidfVectorizer(p_file_name, p_query, p_max_df, p_min_df, p_ngram_range):
    """Rank the documents of a CSV against a query using TF-IDF weights.

    Reads ``p_file_name``, vectorises its ``icerik`` column with
    ``get_tfidf_vector``, prints the ten most similar documents and writes
    the full ranking next to the input as ``<name>_tfidf_vec_result.csv``.

    Parameters
    ----------
    p_file_name : str
        Path of the semicolon-separated input CSV.
    p_query : str
        Query text to compare every document against.
    p_max_df, p_min_df, p_ngram_range
        Forwarded to ``get_tfidf_vector``.
    """
    _rank_documents_to_csv(p_file_name, p_query, p_max_df, p_min_df, p_ngram_range,
                           get_tfidf_vector, "_tfidf_vec_result.csv")

In [37]:
# Query text and the two corpora it is ranked against.
query = "makine öğrenmesi deep learning expert finding"
fileNameDergipark = "C://icerik_dergipark.csv"
fileNameSobiad = "C://icerik_sobiad.csv"

# Vectorizer settings shared by all four runs.
max_df, min_df = 1000, 5
ngram_range = (1, 2)

print("Sorgu: " + query)

# One entry per run: (heading to print, ranking function, input file).
runs = [
    ("En benzer dokümanlar (Dergipark - count vectorizer)",
     getSimilarDocsByCountVectorizer, fileNameDergipark),
    ("En benzer dokümanlar (Dergipark - tfidf vectorizer)",
     getSimilarDocsByTfidfVectorizer, fileNameDergipark),
    ("En benzer dokümanlar (Sobiad - count vectorizer)",
     getSimilarDocsByCountVectorizer, fileNameSobiad),
    ("En benzer dokümanlar (Sobiad - tfidf vectorizer)",
     getSimilarDocsByTfidfVectorizer, fileNameSobiad),
]
for heading, rank_documents, input_file in runs:
    print(heading)
    rank_documents(input_file, query, max_df, min_df, ngram_range)

Sorgu: makine öğrenmesi deep learning expert finding
En benzer dokümanlar (Dergipark - count vectorizer)
      sira  makale_no      puan
3745  3746       4021  0.396969
5111  5112       5428  0.384617
3381  3382       3649  0.299850
2598  2599       2840  0.254056
5992  5993       6323  0.222220
1714  1715       1923  0.193924
8738  8739       9135  0.182719
5749  5750       6076  0.164216
2001  2002       2220  0.161421
1013  1014       1193  0.151775
En benzer dokümanlar (Dergipark - tfidf vectorizer)
      sira  makale_no      puan
3745  3746       4021  0.384806
5111  5112       5428  0.373125
3381  3382       3649  0.285292
2598  2599       2840  0.237217
5992  5993       6323  0.224830
1714  1715       1923  0.204700
8738  8739       9135  0.157648
5749  5750       6076  0.145249
2001  2002       2220  0.131485
1013  1014       1193  0.130897
En benzer dokümanlar (Sobiad - count vectorizer)
        sira  makale_no      puan
37844  37845      37846  0.342758
1211    1212       121