<b>Count Vectorizer, Tfidf Vectorizer</b>

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import pandas as pd

def getDocumentList(fileName):
    """
    Read a semicolon-separated CSV file and tokenize its 'icerik' column.

    Parameters:
        fileName: path of the CSV file; it is read as UTF-8 and must
                  contain a column named 'icerik'.

    Returns:
        A list with one entry per CSV row. Each entry is the list of
        whitespace-separated tokens of that row's 'icerik' value; a row
        whose 'icerik' cell is empty yields an empty list.
    """
    data = pd.read_csv(fileName, sep=";", encoding="utf8")
    # Empty cells are read as NaN (a float), which has no .split(). Mapping
    # them to "" keeps exactly one entry per row instead of raising
    # AttributeError; astype(str) does the same for purely numeric cells.
    contents = data['icerik'].fillna("").astype(str)
    return [content.split() for content in contents]

def train_word2vec(p_documents, p_num_hidden_layers, p_window, p_min_count, p_workers, p_save_model=False,
                   p_model_path="D:/dergipark_word2vec.model", p_vocab_path="D:/dergipark_word2vec.vocab"):
    """
    Train a Word2Vec model and return its vectors, vocabulary and the model.

    Parameters:
        p_documents: tokenized documents handed to Word2Vec as training data.
        p_num_hidden_layers: passed to Word2Vec as `size`.
        p_window: passed to Word2Vec as `window`.
        p_min_count: passed to Word2Vec as `min_count`.
        p_workers: passed to Word2Vec as `workers`.
        p_save_model: when true, the word vectors are written in word2vec
                      format to `p_model_path` and the vocabulary to
                      `p_vocab_path`.
        p_model_path: target file for the saved vectors.
        p_vocab_path: target file for the saved vocabulary.

    Returns:
        (word_vectors, vocabulary, w2v_model): `vocabulary` is the list of
        words known to the model, `word_vectors` holds the vector of each
        word in the same order, and `w2v_model` is the trained model.
    """
    w2v_model = Word2Vec(p_documents, size=p_num_hidden_layers, window=p_window, min_count=p_min_count, workers=p_workers)
    if p_save_model:
        w2v_model.wv.save_word2vec_format(fname=p_model_path, fvocab=p_vocab_path)

    # Reports the vocabulary size (the message is Turkish).
    print("Sözlükteki kelime sayısı: " + str(len(w2v_model.wv.vocab)))

    # The vocabulary and the vectors are both read through `.wv`, the same
    # accessor used for the size report above; the model object itself is not
    # the supported place to look up `vocab` or index by word.
    vocabulary = list(w2v_model.wv.vocab)
    word_vectors = [w2v_model.wv[word] for word in vocabulary]

    return (word_vectors, vocabulary, w2v_model)

In [None]:
# Tokenize the corpus, then train embeddings (size=300, window=8, min_count=5, workers=4).
documents = getDocumentList("C://icerik_dergipark.csv")
word_vectors, vocabulary, w2v_model = train_word2vec(documents, 300, 8, 5, 4)
# Similarity queries go through the model's word vectors (`.wv`); calling
# most_similar on the model object itself is deprecated in gensim.
print(w2v_model.wv.most_similar(positive=['deep', 'learning'], topn=5))

In [None]:
#loading the pre-saved model
#from gensim.models import KeyedVectors
#model = KeyedVectors.load_word2vec_format("D:/dergipark_word2vec.model")

In [None]:
def extract_keywords(documents, low, high):
    """
        Keyword extraction based on document frequency values.
        Keywords are words with frequency values between low and high.

        Parameters:
            documents: the documents handed to tf_idf_vectorizer.
            low: lower frequency bound.
            high: upper frequency bound.

        Returns:
            The keywords, i.e. the second element of the pair returned by
            tf_idf_vectorizer.
    """
    # tf_idf_vectorizer is defined outside this section. It is called with
    # `high` before `low` and a fixed (0, 1) tuple.
    # NOTE(review): confirm the argument order and the meaning of (0, 1)
    # against tf_idf_vectorizer's signature.
    (_, keywords) = tf_idf_vectorizer(documents, high, low, (0,1))
    print("The number of keywords are ", len(keywords))
    return keywords

def enrich_text(documents, w2v_train_set, keywords,  n_related_terms, w2v_trained):
    """
       Expand text with related words in order to explicitly separate it
       from other documents.
       Every token of a document that is a keyword is replaced by that
       keyword's expansion: the terms returned by getSimilarTerms for the
       keyword, joined with single spaces.

       Parameters:
           documents: iterable of document strings; each one is split on
                      whitespace.
           w2v_train_set: training data for train_word2vec, used only when
                          w2v_trained is None.
           keywords: the words to expand.
           n_related_terms: forwarded to getSimilarTerms as the number of
                            related terms to request.
           w2v_trained: an already trained model, or None to train one here.

       Returns:
           (final, related_terms): `final` holds the enriched documents as
           strings, `related_terms` holds one expansion string per keyword
           that could be expanded.
    """
    if w2v_trained is None:
        # train_word2vec has five required parameters; the worker count was
        # missing here, so this branch always raised TypeError.
        # NOTE(review): 4 workers is an assumed default — tune as needed.
        (_, _, model) = train_word2vec(w2v_train_set, 300, 6, 1, p_workers=4)
    else:
        model = w2v_trained

    # Getting related terms to the keywords
    related_terms = []
    related_terms_dict = {}
    for keyword in keywords:
        similar_terms = getSimilarTerms(keyword, n_related_terms, model)
        try:
            # Join ALL related terms. Previously the accumulator was reset on
            # every iteration, so only the last term survived, and it was
            # undefined (or stale) when no terms came back.
            expansion = " ".join(similar_terms)
        except (TypeError, UnicodeDecodeError):
            # Best effort: a keyword whose terms cannot be joined as text is
            # reported and left unexpanded.
            print("cannot decode the bytsstring %s" % (keyword,))
            expansion = ""
        if expansion != "":
            related_terms.append(expansion)
            related_terms_dict[keyword] = expansion

    # text enrichment
    final = []
    for doc in documents:
        # Only keywords that actually received an expansion are replaced;
        # every other token is kept unchanged (no KeyError on lookup).
        tokens = [related_terms_dict.get(token, token) for token in doc.split()]
        final.append(" ".join(tokens))

    return (final, related_terms)


def getSimilarTerms(term, N,  model):
    """
    Build a weighted list of `term` and its most similar words.

    Parameters:
        term: the word to look up.
        N: number of neighbours requested from the model; also the number
           of times `term` itself is repeated in the result.
        model: an object offering `most_similar(term, topn=N)` either
               directly or through a `wv` attribute.

    Returns:
        A list that starts with `term` repeated N times, followed by the
        first neighbour repeated N-1 times, the second N-2 times, and so
        on (so the N-th neighbour contributes nothing). When the model
        does not know `term`, only the N repetitions of `term` are
        returned.
    """
    # A trained model exposes its vectors as `.wv`; plain word vectors are
    # queried directly.
    vectors = getattr(model, "wv", model)
    try:
        neighbours = vectors.most_similar(term, topn=N)
    except KeyError:
        # Unknown words are reported and treated as having no neighbours;
        # any other error is allowed to surface instead of being swallowed.
        print("The word ", term, " does not exists!")
        neighbours = []

    result = [term] * N
    weight = N
    for (neighbour, _) in neighbours:
        weight -= 1
        result = result + [neighbour] * weight
    return result


## LEMMATIZATION 
def lemmatize(doc, lemma_path='lem.txt'):
    """
    Lemmatization: replace each token of `doc` by its lemma.

    The lemma table is read from `lemma_path`. Each line holds the lemma
    first and the inflected form second, separated by whitespace; lines
    with fewer than two fields are ignored.

    Parameters:
        doc: the text to lemmatize; it is split on whitespace.
        lemma_path: path of the lemma table (defaults to 'lem.txt').

    Returns:
        (text, success, error): `text` is the lemmatized tokens joined by
        single spaces, `success` counts the tokens found in the table and
        `error` counts the tokens left unchanged.
    """
    lemm = {}
    # NOTE(review): the file is opened with the platform default encoding,
    # as before — confirm the encoding of the lemma table.
    with open(lemma_path) as f:
        for line in f:
            # split() already collapses runs of whitespace, so no regex
            # pre-processing is needed.
            fields = line.split()
            if len(fields) < 2:
                continue
            lemm[fields[1]] = fields[0]

    success = 0
    error = 0
    docl = []
    for token in doc.split():
        # Count a success only when the token really is in the table.
        if token in lemm:
            success += 1
            docl.append(lemm[token])
        else:
            error += 1
            docl.append(token)
    return " ".join(docl), success, error



""" 
USAGE: 
model =Word2Vec.load('model')
query = "the query"
documents = readFile_(file_name) 
num_results = 200
tfidf_search_result = tfidf_search(query, documents, num_results)
query_expansion_result = query_expansion_search(query, documents, num_results, isEnriched, model)
"""

<b>Similarity of Documents By WordEmbeddings</b>

The pretrained model is loaded. Then the term similarity matrix is calculated from the model's word embeddings. Then the query is converted to its bag-of-words representation. Finally, the soft cosine similarity of the query to each document is calculated.

In [None]:
from gensim.corpora import Dictionary
from gensim.models import KeyedVectors
from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
import pandas as pd

# Load the corpus and split every document into whitespace-separated tokens.
data = pd.read_csv("C://icerik_dergipark.csv", sep=";", encoding="utf8")
documents = data['icerik'].tolist()
doc_lst = [doc.split() for doc in documents]


#model = Word2Vec(common_texts, size=20, min_count=1)  # train word-vectors

# load_word2vec_format returns the word vectors themselves (KeyedVectors),
# so they are handed to the similarity index directly rather than via `.wv`.
model = KeyedVectors.load_word2vec_format("D:/dergipark_word2vec.model")
print("1")
termsim_index = WordEmbeddingSimilarityIndex(model)
print("2")
dictionary = Dictionary(doc_lst)
print("3")
bow_corpus = [dictionary.doc2bow(document) for document in doc_lst]
print("4")
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
print("5")
# num_best=10: a query returns only the ten most similar documents.
docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
print("6")

query = 'makine öğrenmesi deep learning expert finding'.split()  # make a query
sims = docsim_index[dictionary.doc2bow(query)]  # calculate similarity of query to each doc from bow_corpus

1
2




3
4
