# Implementing the Lesk algorithm from scratch using string similarity and text vectorization

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
import numpy as np

In [None]:
def get_tf_idf_vectors(corpus):
    """Fit a TF-IDF model on ``corpus`` and return the weights in dense form.

    A fresh ``TfidfVectorizer`` with default settings is fitted on every call,
    so the vocabulary is derived solely from the documents passed in.

    Args:
        corpus: Iterable of text documents.

    Returns:
        The result of calling ``.todense()`` on the fitted document-term
        matrix.
    """
    vectorizer = TfidfVectorizer()
    document_term_weights = vectorizer.fit_transform(corpus)
    return document_term_weights.todense()

In [None]:
def to_lower_case(corpus):
    """Return a new list holding the lower-cased form of each document.

    Args:
        corpus: Iterable of strings.

    Returns:
        A list with ``.lower()`` applied to every element, in input order.
    """
    return [document.lower() for document in corpus]

In [None]:
def _as_2d_vector(vector):
    """Coerce a dense vector to a 2-D ndarray; pass sparse inputs through."""
    # scipy sparse matrices expose ``toarray`` and are accepted by
    # ``cosine_similarity`` as they are, so they need no conversion.
    if hasattr(vector, "toarray"):
        return vector
    # ``np.asarray`` strips the ``np.matrix`` subclass (rejected by recent
    # scikit-learn releases); ``atleast_2d`` promotes 1-D vectors to one row.
    return np.atleast_2d(np.asarray(vector))


def find_sentence_defnition(sent_vector,defnition_vectors):
    """
    Pick the candidate definition most similar to the sentence.

    Cosine similarity is computed between ``sent_vector`` and every vector in
    ``defnition_vectors``; the id with the highest score wins. When several
    definitions share the top score, the one that comes first in the mapping
    is returned.

    Args:
        sent_vector: Vector of the sentence containing the ambiguous word.
            May be a 1-D array, a single-row 2-D array, an ``np.matrix`` row
            or a scipy sparse row.
        defnition_vectors: Mapping of definition id -> definition vector,
            each with the same number of features as ``sent_vector``.

    Returns:
        Tuple ``(defnition_id, score)`` for the best-matching definition.

    Raises:
        ValueError: If ``defnition_vectors`` is empty.
    """
    if not defnition_vectors:
        raise ValueError("defnition_vectors must contain at least one definition")

    query = _as_2d_vector(sent_vector)
    scores = {
        defnition_id: cosine_similarity(query, _as_2d_vector(def_vector))[0][0]
        for defnition_id, def_vector in defnition_vectors.items()
    }
    # ``max`` keeps the first maximal key, matching a stable descending sort.
    best_id = max(scores, key=scores.get)
    return best_id, scores[best_id]

In [None]:
# Index 0 is the sentence that uses the ambiguous word "bank"; indexes 1 and 2
# are its two candidate definitions. The remaining sentences are unrelated
# filler that is vectorized together with the first three.
corpus = [
    "On the banks of river Ganga, there lies the scent of spirituality",
    "An institute where people can store extra cash or money.",
    # The comma after the next string is required: without it Python
    # concatenates this definition with the following sentence.
    "The land alongside or sloping down to a river or lake",
    "What you do defines you",
    "Your deeds define you",
    "Once upon a time there lived a king.",
    "Who is your queen?",
    "He is desperate",
    "Is he not desperate?",
]

In [None]:
# Lower-case the corpus and turn every sentence into a TF-IDF vector.
lower_case_corpus = to_lower_case(corpus)
corpus_tf_idf = get_tf_idf_vectors(lower_case_corpus)

# Row 0 is the sentence to disambiguate; rows 1 and 2 are the candidate
# definitions it is compared against.
sent_vector = corpus_tf_idf[0]
defnition_vectors = dict(def1=corpus_tf_idf[1], def2=corpus_tf_idf[2])

# Report the definition with the highest cosine similarity to the sentence.
defnition_id, score = find_sentence_defnition(sent_vector, defnition_vectors)
print(
    "The defnition of word {} is {} with similarity of {}".format(
        'bank', defnition_id, score
    )
)