In [None]:
# Sample document (notes on the bias-variance trade-off) that every
# keyword-extraction approach in this notebook is run against.
doc = """Bias is an error introduced in the model due to the oversimplification of the algorithm used (does
not fit the data properly). It can lead to under-fitting.Low bias machine learning algorithms — Decision Trees, k-NN and SVM High bias machine learning algorithms — Linear Regression, Logistic Regression
Variance: Variance is error introduced in the model due to a too complex algorithm, it performs very well in the training set but poorly in the test set. It can lead to high sensitivity and overfitting.
Possible high variance – polynomial regression Normally, as you increase the complexity of your model, you will see a reduction in error due to lower bias in the model. However, this only happens until a particular point. As you continue to make your model
more complex, you end up over-fitting your model and hence your model will start suffering from high
variance. Bias-Variance trade-off: The goal of any supervised machine learning algorithm is to have low bias and low variance to achieve good prediction performance.
1. The k-nearest neighbor algorithm has low bias and high variance, but the trade-off can be changed by increasing the value of k which increases the number of neighbors that contribute to the prediction and in turn increases the bias of the model.
2. The support vector machine algorithm has low bias and high variance, but the trade-off can be changed by increasing the C parameter that influences the number of violations of the margin allowed in the training data which increases the bias but decreases the variance.
3. The decision tree has low bias and high variance, you can decrease the depth of the tree or use fewer attributes.
4. The linear regression has low variance and high bias, you can increase the number of features or use another regression that better fits the data.
There is no escaping the relationship between bias and variance in machine learning. Increasing the bias will decrease the variance. Increasing the variance will decrease bias."""

# (min_n, max_n) n-gram window handed to CountVectorizer below: (3, 3) means
# candidate phrases are word trigrams only.
n_gram_range = (3, 3)


# Sentence Transformer

In [None]:
!pip install sentence-transformers --quiet


Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.5 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 11.6 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 32.6 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pretrained 'distilbert-base-nli-mean-tokens' sentence-embedding model;
# it is used below to embed both the document and the candidate phrases.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# "english" selects CountVectorizer's built-in English stop-word list.
stop_words = "english"
# Extract candidate words/phrases
# (the n-grams in the `n_gram_range` window found in the single document `doc`)
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
candidates = count.get_feature_names_out()


In [None]:
# Embed the whole document (as a one-element batch) and every candidate phrase
# with the same sentence-embedding model so they can be compared directly.
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Number of keyphrases to keep.
top_n = 10
# NOTE: despite its name, `distances` holds cosine *similarities*
# (one row for the document, one column per candidate; higher = closer).
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# argsort is ascending, so the last `top_n` indices are the most similar
# candidates; they end up ordered from least to most similar.
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

In [None]:
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Pick the ``top_n`` least mutually similar phrases among the best candidates.

    The ``nr_candidates`` phrases most similar to the document are shortlisted,
    then every ``top_n``-sized combination of that shortlist is scored by the
    sum of its pairwise cosine similarities; the combination with the smallest
    sum (i.e. the most diverse one) is returned.

    Arguments:
        doc_embedding -- 2-D array with the document embedding in row 0
        word_embeddings -- 2-D array with one embedding row per entry of ``words``
        words -- sequence of candidate phrases, aligned with ``word_embeddings``
        top_n -- number of phrases to return
        nr_candidates -- size of the shortlist taken from the most similar phrases

    Returns:
        list of ``top_n`` phrases taken from ``words``

    Raises:
        ValueError -- if ``top_n`` is larger than the shortlist, in which case
            no combination exists
    """
    # Calculate similarities from the *arguments*. The previous version read the
    # notebook globals `candidate_embeddings` / `candidates` here, so the
    # function silently ignored whatever the caller passed in.
    distances = cosine_similarity(doc_embedding, word_embeddings)
    distances_candidates = cosine_similarity(word_embeddings, word_embeddings)

    # Shortlist: indices of the nr_candidates phrases closest to the document
    # (argsort is ascending, so they are at the end).
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    if top_n > len(words_idx):
        raise ValueError(
            f"top_n ({top_n}) cannot exceed the number of shortlisted "
            f"candidates ({len(words_idx)})"
        )
    words_vals = [words[index] for index in words_idx]
    # Restrict the phrase-to-phrase matrix to the shortlist.
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Exhaustively search for the combination whose members are the least
    # similar to each other.
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(distances_candidates[i][j]
                  for i in combination for j in combination if i != j)
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [None]:
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    """Greedily select ``top_n`` phrases, trading relevance against redundancy.

    The phrase most similar to the document is taken first. Each following
    pick maximises
    ``(1 - diversity) * similarity_to_document - diversity * max_similarity_to_already_picked``.

    Arguments:
        doc_embedding -- 2-D array holding the document embedding
        word_embeddings -- 2-D array with one embedding row per entry of ``words``
        words -- sequence of candidate phrases, aligned with ``word_embeddings``
        top_n -- number of phrases to return
        diversity -- weight of the redundancy penalty in the score above

    Returns:
        list of the selected phrases, in the order they were picked
    """
    # Phrase-to-document relevance (one column) and phrase-to-phrase similarity.
    relevance = cosine_similarity(word_embeddings, doc_embedding)
    pairwise = cosine_similarity(word_embeddings)

    # Seed the selection with the single most relevant phrase.
    best = np.argmax(relevance)
    selected = [best]
    remaining = [position for position in range(len(words)) if position != best]

    for _ in range(1, top_n):
        # Relevance of each remaining phrase, and how close it is to its
        # nearest already-selected phrase.
        remaining_relevance = relevance[remaining, :]
        nearest_selected = np.max(pairwise[remaining][:, selected], axis=1)

        # Score every remaining phrase and take the best one.
        scores = (1 - diversity) * remaining_relevance - diversity * nearest_selected.reshape(-1, 1)
        winner = remaining[np.argmax(scores)]

        selected.append(winner)
        remaining.remove(winner)

    return [words[position] for position in selected]

In [None]:
# MMR selection of 5 phrases; diversity=0.2 weights relevance to the document
# (factor 0.8) well above the redundancy penalty (factor 0.2).
mmr(doc_embedding,candidate_embeddings,candidates,top_n=5,diversity=0.2)

['high bias machine learning algorithms',
 'polynomial regression normally increase complexity',
 'algorithm performs training set poorly',
 'variance machine learning increasing bias',
 'overfitting possible high variance polynomial']

In [None]:
# Max-sum selection: choose the 5 least mutually similar phrases out of a
# shortlist of 10 candidates.
max_sum_sim(doc_embedding,candidate_embeddings,candidates,top_n=5,nr_candidates=10)

['bias machine learning algorithms linear',
 'variance polynomial regression normally increase',
 'start suffering high variance bias',
 'bias variance machine learning increasing',
 'algorithm performs training set poorly']

# Keyword extraction with wordwise


In [None]:
!pip install wordwise --quiet
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg


Collecting wordwise
  Downloading wordwise-0.0.4-py3-none-any.whl (5.5 kB)
Collecting spacy>=2.3.2
  Downloading spacy-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 5.6 MB/s 
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 30.7 MB/s 
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (451 kB)
[K     |████████████████████████████████| 451 kB 38.9 MB/s 
[?25hCollecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 34.7 MB/s 
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.13-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (628 kB)
[K     |████████████████████████████████| 628 kB 41.9 MB/s 
Collecting spacy-loggers<2.0.0,>=

In [None]:
from wordwise import Extractor

# wordwise extractor restricted to trigrams, using the en_core_web_lg spaCy model.
extractor = Extractor(n_gram_range=(3,3),spacy_model="en_core_web_lg")
# Request the top 5 keyphrases for the sample document (top_k=5).
keywords = extractor.generate(doc,top_k=5)


Downloading:   0%|          | 0.00/555 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/253M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/505 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]



['possible high variance', 'good prediction performance']


In [None]:
print(keywords)

['possible high variance', 'good prediction performance']


# spaCy and PyTextRank

In [None]:
!pip install spacy-transformers --quiet
!python -m spacy download en_core_web_trf
!python -m spacy download en_core_web_lg


Collecting en-core-web-trf==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.0.0/en_core_web_trf-3.0.0-py3-none-any.whl (459.7 MB)
[K     |████████████████████████████████| 459.7 MB 17 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
Collecting en-core-web-lg==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.0.0/en_core_web_lg-3.0.0-py3-none-any.whl (778.8 MB)
[K     |████████████████████████████████| 778.8 MB 20 kB/s 
Installing collected packages: en-core-web-lg
  Attempting uninstall: en-core-web-lg
    Found existing installation: en-core-web-lg 3.2.0
    Uninstalling en-core-web-lg-3.2.0:
      Successfully uninstalled en-core-web-lg-3.2.0
Successfully installed en-core-web-lg-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
import spacy
print(spacy.__version__)
!python -m spacy validate


3.0.7
[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /usr/local/lib/python3.7/dist-packages/spacy[0m

NAME                  SPACY            VERSION                            
en_core_web_trf       >=3.0.0,<3.1.0   [38;5;2m3.0.0[0m   [38;5;2m✔[0m
en_core_sci_scibert   >=3.0.3,<3.1.0   [38;5;2m0.4.0[0m   [38;5;2m✔[0m
en_core_sci_lg        >=3.0.1,<3.1.0   [38;5;2m0.4.0[0m   [38;5;2m✔[0m
en_core_web_lg        >=3.0.0,<3.1.0   [38;5;2m3.0.0[0m   [38;5;2m✔[0m
en_core_web_sm        >=3.0.0,<3.1.0   [38;5;2m3.0.0[0m   [38;5;2m✔[0m



In [None]:
!pip install rapidfuzz --quiet
!pip freeze | grep rapidfuzz

rapidfuzz==1.8.3


In [None]:
from collections import Counter
from string import punctuation
from rapidfuzz import fuzz

nlp = spacy.load("en_core_web_lg")


**Single Keywords based on frequency**

In [None]:
def get_hotwords(text):
    """Return the proper nouns, adjectives and nouns of ``text``, in order.

    The text is lower-cased and parsed with the notebook-level ``nlp``
    pipeline. Tokens that are spaCy stop words, or whose text occurs inside
    ``string.punctuation``, are skipped. Repeated words are kept, so the
    result can be fed to a frequency counter.
    """
    wanted_pos = ['PROPN', 'ADJ', 'NOUN']
    hotwords = []
    for token in nlp(text.lower()):
        word = token.text
        # `punctuation` is a plain string, so this is a substring test.
        is_noise = word in nlp.Defaults.stop_words or word in punctuation
        if not is_noise and token.pos_ in wanted_pos:
            hotwords.append(word)
    return hotwords

In [None]:
# Count every occurrence of each hot word. Counting the *set* (as before) gives
# every word a frequency of 1, so most_common(10) returned an arbitrary ten.
hotwords = get_hotwords(doc)
output = set(hotwords)  # distinct hot words, kept under the original name
hashtags = [word for word, _ in Counter(hotwords).most_common(10)]

print(hashtags)

['logistic', 'high', 'number', 'performance', 'trade', 'complex', 'lower', 'value', 'prediction', 'support']


**Based on spaCy noun chunks and string similarity**

In [None]:

def extract_keywords(nlp, sequence, special_tags : list = None, unique_keywords=True):
    """Extract keyword candidates from a text with a spaCy pipeline.

    The lower-cased text is parsed with ``nlp``. Candidates are collected, in
    this order, from:

    1. tokens whose text matches one of ``special_tags`` (case-insensitive);
    2. noun chunks, reduced to their proper-noun / noun / adjective tokens;
    3. single proper-noun / noun / adjective tokens that are neither stop
       words nor punctuation.

    Exact duplicates are removed while keeping first-seen order, so the result
    is the same on every run.

    Arguments:
        nlp -- spaCy language pipeline used to parse ``sequence``
        sequence {str} -- text to extract keywords from

    Keyword Arguments:
        special_tags {list} -- words that are always added when they occur
            in the text (default: {None})
        unique_keywords {bool} -- if False, return every distinct candidate.
            If True, additionally drop every candidate for which
            ``fuzz.ratio(..., score_cutoff=90)`` reports a match with another
            candidate; both members of such a pair are dropped
            (default: {True})

    Returns:
        {list} -- keyword strings in first-seen order
    """
    # Part-of-speech tags we keep: proper nouns, nouns and adjectives.
    # Edit this list of POS tags according to your needs.
    pos_tag = ['PROPN', 'NOUN', 'ADJ']

    doc = nlp(sequence.lower())
    result = []

    # Special tags present in the text are added unconditionally.
    if special_tags:
        tags = {tag.lower() for tag in special_tags}
        result.extend(token.text for token in doc if token.text in tags)

    # Noun chunks, keeping only the tokens with a wanted POS tag.
    for chunk in doc.noun_chunks:
        kept = [token.text for token in chunk if token.pos_ in pos_tag]
        if kept:
            result.append(" ".join(kept).strip())

    # Individual content words.
    for token in doc:
        if token.text in nlp.Defaults.stop_words or token.text in punctuation:
            continue
        if token.pos_ in pos_tag:
            result.append(token.text)

    # Order-preserving de-duplication (a plain set() would shuffle the output
    # differently from run to run).
    elements = list(dict.fromkeys(result))
    if not unique_keywords:
        return elements

    # Flag every element that has a near-duplicate elsewhere in the list.
    # A non-zero ratio is treated as a match, exactly as before.
    has_near_duplicate = [False] * len(elements)
    for i, element in enumerate(elements):
        for j in range(i + 1, len(elements)):
            if fuzz.ratio(element, elements[j], score_cutoff=90):
                has_near_duplicate[i] = True
                has_near_duplicate[j] = True

    return [name for name, flagged in zip(elements, has_near_duplicate) if not flagged]


In [None]:
# First every distinct candidate, then (default unique_keywords=True) only the
# candidates that have no near-duplicate among the others.
print(extract_keywords(nlp,doc,unique_keywords=False))
print(extract_keywords(nlp,doc))


['trade', 'lower', 'complex algorithm', 'attributes', 'machine', 'training set', 'vector', 'nn', 'test', 'support vector machine algorithm', 'nearest', 'high variance', 'good', 'k nn', 'neighbors', 'overfitting', 'bias', 'supervised machine learning algorithm', 'k nearest neighbor algorithm', 'trees', 'bias variance trade off', 'decision trees', 'performance', 'prediction', 'c', 'goal', 'sensitivity', 'c parameter', 'training data', 'machine learning', 'complexity', 'training', 'violations', 'bias machine algorithms', 'margin', 'polynomial regression', 'algorithm', 'linear', 'low bias', 'variance', 'neighbor', 'learning', 'particular point', 'logistic', 'logistic regression variance', 'fewer attributes', 'value', 'complex', 'support', 'better', 'linear regression', 'tree', 'test set', 'point', 'turn', 'polynomial', 'low', 'good prediction performance', 'possible', 'data', 'decision tree', 'regression', 'fewer', 'high bias', 'depth', 'trade off', 'lower bias', 'high', 'number', 'set', '

In [None]:

'''You can use any spacy language or any size for vocab '''
nlp = spacy.load("en_core_web_sm")

def textProcessing(doc):
    '''Build "CHILD HEAD" phrases from the dependency parse of the text.

    The text is upper-cased and parsed with the notebook-level ``nlp``
    pipeline. For every token tagged NOUN or PROPN, each of its dependency
    children is paired with it as ``[child, head]``.

    Returns a tuple ``(vocab_dict, arr)``:
      * ``arr`` -- "CHILD HEAD" strings for the pairs whose child is tagged
        PROPN, NOUN or ADJ, after dropping pairs that compare equal to an
        earlier pair;
      * ``vocab_dict`` -- maps the "CHILD HEAD" string of *every* pair
        (whatever the child's tag) to the number of times it occurs in ``arr``.
    '''
    parsed = nlp(doc.upper())

    # [child, head] for each child of each noun / proper-noun head, in document order.
    child_head_pairs = [
        [child, head]
        for head in parsed
        if head.pos_ in ["NOUN", "PROPN"]
        for child in head.children
    ]

    # Keep only pairs whose child is itself a proper noun, noun or adjective.
    content_pairs = [pair for pair in child_head_pairs
                     if pair[0].pos_ in ['PROPN', 'NOUN', 'ADJ']]

    # Drop pairs equal to one already seen (token equality, not string equality).
    unique_pairs = []
    for pair in content_pairs:
        if pair not in unique_pairs:
            unique_pairs.append(pair)

    def as_phrase(pair):
        # "CHILD HEAD" with surrounding whitespace removed.
        return " ".join(str(token) for token in pair).strip()

    arr = [as_phrase(pair) for pair in unique_pairs]

    # The vocabulary covers every pair; only phrases present in `arr` get counted.
    vocab_dict = dict.fromkeys((as_phrase(pair) for pair in child_head_pairs), 0)
    for phrase in arr:
        vocab_dict[phrase] += 1

    return vocab_dict , arr

def computeTF(wordDict,bow):
    '''Compute the term frequency of every vocabulary entry.

    wordDict -- mapping of term -> raw count
    bow      -- sequence of the terms found in the document; only its length
                is used, as the normalising constant

    Returns a dict mapping each term to count / len(bow). When ``bow`` is
    empty every term gets 0.0 instead of raising ZeroDivisionError.
    '''
    bowCount = len(bow)
    if bowCount == 0:
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bowCount) for word, count in wordDict.items()}


def computeIDF(doclist):
    '''Compute an IDF-style weight for every term in a list of count dicts.

    doclist -- list of dicts mapping term -> count, one dict per document

    Returns a dict mapping every term that appears as a key in *any* document
    to ``log(N / df)``, where ``df`` is the number of documents in which the
    term has a positive count, or to 0.0 when ``df`` is 0. An empty
    ``doclist`` yields an empty dict.
    '''
    import math

    # NOTE(review): N is the total number of vocabulary entries summed over all
    # documents, not the number of documents as in textbook IDF. It is kept
    # unchanged because this notebook calls the function with a single
    # document, for which a document count would make every weight log(1) = 0.
    N = sum(len(element) for element in doclist)

    # Count the documents that contain each term. Seed the table from every
    # document (not just the first) so terms missing from doclist[0] no longer
    # raise KeyError.
    idfDict = {}
    for doc in doclist:
        for word in doc:
            idfDict.setdefault(word, 0)

    for doc in doclist:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1

    # Turn document frequencies into weights.
    for word, val in idfDict.items():
        if val == 0:
            idfDict[word] = 0.0
        else:
            idfDict[word] = math.log(N / float(val))

    return idfDict

def computeTfidf(tf,idf):
    '''Return the ten terms with the highest TF-IDF score.

    tf  -- dict mapping term -> term frequency
    idf -- dict mapping term -> IDF weight; must contain every key of ``tf``

    Terms are ordered by descending score; ties are broken by descending
    term. Only the terms are returned, not their scores.
    '''
    scores = {term: frequency * idf[term] for term, frequency in tf.items()}
    ranked = sorted(scores.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    return [term for term, _ in ranked[:10]]

# Run the hand-rolled pipeline on the sample document: build the phrase
# vocabulary, compute TF and IDF, and keep the ten highest-scoring phrases.
# Note that computeIDF receives a one-element list, i.e. a single document.
vocab_dict , arr = textProcessing(doc)
tf = computeTF(vocab_dict,arr)
idf = computeIDF([vocab_dict])
tfidf = computeTfidf(tf,idf)



**Keyword extraction using TF-IDF**

In [None]:
print(tfidf)

['VARIANCE BIAS', 'LOW BIAS', 'HIGH VARIANCE', 'TRADE OFF', 'OFF BIAS', 'MACHINE ALGORITHM', 'LOW VARIANCE', 'LINEAR REGRESSION', 'LEARNING ALGORITHMS', 'ERROR INTRODUCED']


# Keyword extraction with TF-IDF and count vectors

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer


stop_words = set(stopwords.words("english"))
# One "document" per sentence. Splitting on whitespace (doc.split()) made every
# single word its own document, so no genuine bi-/tri-gram could ever be formed.
corpus = [sentence for sentence in re.split(r"(?<=[.!?])\s+", doc.strip()) if sentence]
# CountVectorizer documents `stop_words` as a str, a list or None, so hand it a
# (sorted, hence reproducible) list and keep the set for membership tests.
cv = CountVectorizer(max_df=0.8, stop_words=sorted(stop_words), max_features=10000, ngram_range=(1,3))
X = cv.fit_transform(corpus)
list(cv.vocabulary_.keys())[:10]

def _top_ngrams(corpus, n=None, **vectorizer_kwargs):
    """Return the ``n`` most frequent n-grams of ``corpus`` as (term, count) pairs.

    ``vectorizer_kwargs`` are forwarded to ``CountVectorizer``; ``n=None``
    returns every term. Pairs are sorted by descending count.
    """
    vec = CountVectorizer(**vectorizer_kwargs).fit(corpus)
    # Column sums of the document-term matrix = corpus-wide counts per term.
    sum_words = vec.transform(corpus).sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(words_freq, key=lambda x: x[1], reverse=True)[:n]


#Most frequently occuring words
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent single words as (word, count) pairs."""
    return _top_ngrams(corpus, n)

# Top 20 most frequent words of the corpus
top_words = get_top_n_words(corpus, n=20)
print(top_words)


#Most frequently occuring Tri-grams
def get_top_n3_words(corpus, n=None):
    """Return the ``n`` most frequent word trigrams as (trigram, count) pairs.

    The vocabulary is capped at 2000 trigrams.
    """
    return _top_ngrams(corpus, n, ngram_range=(3,3), max_features=2000)

top3_words = get_top_n3_words(corpus, n=20)
print(top3_words)


[('the', 35), ('bias', 15), ('variance', 14), ('of', 10), ('and', 10), ('in', 9), ('to', 9), ('model', 8), ('high', 8), ('low', 7), ('can', 6), ('you', 6), ('algorithm', 5), ('machine', 5), ('regression', 5), ('is', 4), ('learning', 4), ('but', 4), ('your', 4), ('will', 4)]
[('under fitting low', 1)]


In [None]:
from scipy.sparse import coo_matrix


# Learn IDF weights from the count matrix built by `cv` above.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)
# get feature names
# (get_feature_names() has been removed from scikit-learn; use the same
# get_feature_names_out() accessor as the first CountVectorizer in this notebook)
feature_names = cv.get_feature_names_out()

# fetch document for which keywords needs to be extracted
doclist = corpus[-1]


#generate tf-idf for the given document
tf_idf_vector = tfidf_transformer.transform(cv.transform([doclist]))


def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, largest value first.

    Pairs are ordered by descending value; equal values are ordered by
    descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` items to ``{feature name: score}``.

    feature_names -- sequence indexable by the feature index
    sorted_items  -- (feature index, score) pairs, already sorted by the caller
    topn          -- how many leading pairs to keep

    Scores are rounded to 3 decimals. The dict keeps the order of
    ``sorted_items``.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

# Sort the (feature index, score) pairs of the document's tf-idf vector, best first.
sorted_items=sort_coo(tf_idf_vector.tocoo())
#extract only the top n; n here is 5
keywords=extract_topn_from_vector(feature_names,sorted_items,5)

# now print the results
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for k in keywords:
    print(k,keywords[k])

bias.

Abstract:
Bias is an error introduced in the model due to the oversimplification of the algorithm used (does
not fit the data properly). It can lead to under-fitting.Low bias machine learning algorithms — Decision Trees, k-NN and SVM High bias machine learning algorithms — Linear Regression, Logistic Regression
Variance: Variance is error introduced in the model due to a too complex algorithm, it performs very well in the training set but poorly in the test set. It can lead to high sensitivity and overfitting.
Possible high variance – polynomial regression Normally, as you increase the complexity of your model, you will see a reduction in error due to lower bias in the model. However, this only happens until a particular point. As you continue to make your model
more complex, you end up over-fitting your model and hence your model will start suffering from high
variance. Bias-Variance trade-off: The goal of any supervised machine learning algorithm is to have low bias and low va



In [None]:
from nltk import tokenize
from operator import itemgetter
import math
from nltk import tokenize
from operator import itemgetter
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
stop_words = set(stopwords.words('english'))


In [None]:
# Whitespace-separated tokens of the document (punctuation stays attached);
# their count is the denominator of the term-frequency scores below.
total_words = doc.split()
total_word_length = len(total_words)
print(total_word_length)


330


In [None]:
# Raw occurrence count of every non-stop-word token, with full stops removed.
tf_score = {}
for each_word in total_words:
    each_word = each_word.replace('.', '')
    if each_word in stop_words:
        continue
    tf_score[each_word] = tf_score.get(each_word, 0) + 1

# Normalise each count by the total number of tokens in the document.
tf_score = {word: hits / int(total_word_length) for word, hits in tf_score.items()}
print(tf_score)

{'Bias': 0.0030303030303030303, 'error': 0.00909090909090909, 'introduced': 0.006060606060606061, 'model': 0.021212121212121213, 'due': 0.00909090909090909, 'oversimplification': 0.0030303030303030303, 'algorithm': 0.012121212121212121, 'used': 0.0030303030303030303, '(does': 0.0030303030303030303, 'fit': 0.0030303030303030303, 'data': 0.00909090909090909, 'properly)': 0.0030303030303030303, 'It': 0.006060606060606061, 'lead': 0.006060606060606061, 'under-fittingLow': 0.0030303030303030303, 'bias': 0.03636363636363636, 'machine': 0.015151515151515152, 'learning': 0.012121212121212121, 'algorithms': 0.006060606060606061, '—': 0.006060606060606061, 'Decision': 0.0030303030303030303, 'Trees,': 0.0030303030303030303, 'k-NN': 0.0030303030303030303, 'SVM': 0.0030303030303030303, 'High': 0.0030303030303030303, 'Linear': 0.0030303030303030303, 'Regression,': 0.0030303030303030303, 'Logistic': 0.0030303030303030303, 'Regression': 0.0030303030303030303, 'Variance:': 0.0030303030303030303, 'Varia

In [None]:
def check_sent(word, sentences):
    """Count the sentences that contain every element of ``word``.

    ``word`` is iterated element by element, so for a plain string this tests
    that each *character* of the word occurs somewhere in the sentence.
    NOTE(review): a whole-word match was probably intended; behaviour is left
    unchanged here because the caller divides by the returned count.
    """
    return sum(
        1
        for sentence in sentences
        if all([piece in sentence for piece in word])
    )

In [None]:
# Sentences of the document and their number. Both names are used below but
# were never defined anywhere in the notebook, so this cell raised NameError
# on a fresh kernel.
# (sent_tokenize relies on NLTK's sentence-tokenizer data being installed.)
total_sentences = tokenize.sent_tokenize(doc)
total_sent_len = len(total_sentences)

idf_score = {}
for each_word in total_words:
    each_word = each_word.replace('.','')
    if each_word not in stop_words:
        # NOTE(review): the sentence count is only computed from the second
        # occurrence of a word onwards; words seen once keep a count of 1.
        if each_word in idf_score:
            idf_score[each_word] = check_sent(each_word, total_sentences)
        else:
            idf_score[each_word] = 1

# Performing a log and divide
idf_score.update((x, math.log(int(total_sent_len)/y)) for x, y in idf_score.items())

print(idf_score)
# Combine both scores; a word missing from idf_score contributes 0.
tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
print(tf_idf_score)

{'Bias': 2.8903717578961645, 'error': 0.4054651081081644, 'introduced': 0.6931471805599453, 'model': 0.6931471805599453, 'due': 0.6931471805599453, 'oversimplification': 2.8903717578961645, 'algorithm': 0.6931471805599453, 'used': 2.8903717578961645, '(does': 2.8903717578961645, 'fit': 2.8903717578961645, 'data': 0.32542240043462795, 'properly)': 2.8903717578961645, 'It': 1.5040773967762742, 'lead': 0.32542240043462795, 'under-fittingLow': 2.8903717578961645, 'bias': 0.5877866649021191, 'machine': 0.6931471805599453, 'learning': 0.32542240043462795, 'algorithms': 0.6931471805599453, '—': 2.8903717578961645, 'Decision': 2.8903717578961645, 'Trees,': 2.8903717578961645, 'k-NN': 2.8903717578961645, 'SVM': 2.8903717578961645, 'High': 2.8903717578961645, 'Linear': 2.8903717578961645, 'Regression,': 2.8903717578961645, 'Logistic': 2.8903717578961645, 'Regression': 2.8903717578961645, 'Variance:': 2.8903717578961645, 'Variance': 2.8903717578961645, 'complex': 2.8903717578961645, 'algorithm,':

In [None]:
def get_top_n(dict_elem, n):
    """Return a dict with the ``n`` highest-valued items of ``dict_elem``.

    Items are ordered by descending value; items with equal values keep their
    original relative order.
    """
    ranked = sorted(dict_elem.items(), key=itemgetter(1), reverse=True)
    return {key: value for key, value in ranked[:n]}

In [None]:
print(get_top_n(tf_idf_score, 10))

{'bias': 0.021374060541895237, '—': 0.01751740459331009, 'model': 0.014703122011877628, 'The': 0.014310024376376539, 'low': 0.010687030270947619, 'machine': 0.01050223000848402, 'number': 0.009987384442437361, 'It': 0.009115620586522873, 'Increasing': 0.009115620586522873, 'Bias': 0.008758702296655044}


{'bias': 0.021374060541895237, '—': 0.01751740459331009, 'model': 0.014703122011877628, 'The': 0.014310024376376539, 'low': 0.010687030270947619}


In [None]:
!pip install pytextrank --quiet
!pip freeze | grep pytextrank

pytextrank==3.2.2


In [None]:
import spacy
import pytextrank

# example text
# NOTE(review): `text` is not passed to the pipeline in this cell; the
# notebook's own `doc` is analysed instead (see `nlp(doc)` below).
text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")
nla = nlp(doc)

# examine the top-ranked phrases in the document
# (each line shows the phrase text, its rank score and its count)
for phrase in nla._.phrases:
    print(phrase.text, "-->", phrase.rank, phrase.count)

low bias --> 0.13502737571157575 4
lower bias --> 0.13502737571157575 1
low variance --> 0.13471824732751023 2
high bias --> 0.1312221452260158 1
high variance --> 0.13090403114174431 3
SVM High bias machine learning algorithms --> 0.12760797816537656 1
bias --> 0.11378941918354349 1
Variance --> 0.11337646420663995 4
variance --> 0.11337646420663995 1
algorithm --> 0.08782690144964946 2
algorithms --> 0.08782690144964946 1
machine learning --> 0.08547831300227005 1
high sensitivity --> 0.08108029276213435 1
the test set --> 0.0791164213631026 1
Logistic Regression --> 0.07647068396334362 1
good prediction performance --> 0.07499454333939176 1
polynomial regression --> 0.07431136984690367 1
the training set --> 0.07053512056036139 1
high
variance --> 0.0694223460787368 1
fewer attributes --> 0.06842893326123797 1
The support vector machine algorithm --> 0.061010813388273286 1
error --> 0.05650956963483114 2
Decision Trees --> 0.05461363100076441 2
Linear Regression --> 0.05461363100076

In [None]:
# Parse the document with the en_core_web_trf pipeline and print the named
# entities it finds. Note that `nlp` and `text` are rebound here.
nlp = spacy.load("en_core_web_trf")
text = nlp(doc)
print(text.ents)


(1, 2, 3, 4)


In [None]:
!pip install scispacy --quiet
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz --quiet
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_scibert-0.4.0.tar.gz --quiet


[K     |████████████████████████████████| 423.3 MB 17 kB/s 
[K     |████████████████████████████████| 50 kB 2.5 MB/s 
[K     |████████████████████████████████| 50 kB 4.4 MB/s 
[K     |████████████████████████████████| 42 kB 1.0 MB/s 
[K     |████████████████████████████████| 2.6 MB 9.0 MB/s 
[K     |████████████████████████████████| 42 kB 926 kB/s 
[K     |████████████████████████████████| 40 kB 4.8 MB/s 
[?25h  Building wheel for en-core-sci-scibert (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-trf 3.2.0 requires spacy<3.3.0,>=3.2.0, but you have spacy 3.0.7 which is incompatible.
en-core-web-trf 3.2.0 requires spacy-transformers<1.2.0,>=1.1.2, but you have spacy-transformers 1.0.4 which is incompatible.[0m


In [None]:
import spacy
import scispacy
print(scispacy.__version__)
print(spacy.__version__)


0.4.0
3.0.7


In [None]:
# Parse the lower-cased document with the scispaCy en_core_sci_lg pipeline.
nlp = spacy.load("en_core_sci_lg")
text = nlp(doc.lower())

In [None]:
# Use the distinct entity strings of the parsed document as keywords.
keywords = set(str(i) for i in text.ents)
print(keywords)

{'increases', 'increasing', 'attributes', 'training set', 'data', 'overfitting', 'supervised', 'decreases', 'test set', 'poorly', 'training', 'k-nearest neighbor algorithm', 'violations', 'regression', 'influences', 'variance', 'trade-off', 'low', 'depth', 'complex', 'complexity', 'reduction', 'error', 'complex algorithm', 'algorithms', 'increase', 'decrease', 'high variance', 'decision trees', 'tree', 'over-fitting', 'features', 'relationship', 'low variance', 'neighbors', 'bias', 'logistic regression', 'prediction', 'margin', 'sensitivity', 'model', 'decision tree', 'machine learning', 'k-nn', 'oversimplification', 'linear regression', 'goal', 'polynomial regression', 'support vector machine', 'bias-variance', 'parameter', 'suffering', 'performance', 'algorithm'}


In [None]:
# Same extraction again, this time with the en_core_sci_scibert pipeline.
nlp = spacy.load("en_core_sci_scibert")
text = nlp(doc.lower())

In [None]:
# Use the distinct entity strings of the parsed document as keywords.
keywords = set(str(i) for i in text.ents)
print(keywords)


{'increases', 'increasing', 'fewer', 'attributes', 'training set', 'data', 'overfitting', 'you', 'supervised', 'decreases', 'test set', 'training', 'machine', 'k-nearest neighbor algorithm', 'violations', 'regression', 'high\nvariance', 'influences', 'variance', 'low', 'trade-off', 'complex', 'fits', 'complexity', 'k', 'high bias', 'reduction', 'error', 'relationship', 'algorithms', 'increase', 'decrease', 'high variance', 'decision trees', 'tree', 'features', 'learning', 'low variance', 'neighbors', 'bias', 'prediction', 'margin', 'bias-variance trade-off', 'logistic regression\nvariance', 'sensitivity', 'model', 'decision tree', 'machine learning', 'k-nn', 'oversimplification', 'data properly', 'linear regression', 'goal', 'polynomial regression', 'support vector machine', 'happens', 'parameter', 'suffering', 'algorithm'}


# YAKE

In [None]:
!pip install git+https://github.com/LIAAD/yake --quiet

[?25l[K     |██▍                             | 10 kB 19.3 MB/s eta 0:00:01[K     |████▊                           | 20 kB 13.9 MB/s eta 0:00:01[K     |███████▏                        | 30 kB 9.3 MB/s eta 0:00:01[K     |█████████▌                      | 40 kB 5.6 MB/s eta 0:00:01[K     |████████████                    | 51 kB 6.7 MB/s eta 0:00:01[K     |██████████████▎                 | 61 kB 5.6 MB/s eta 0:00:01[K     |████████████████▊               | 71 kB 6.4 MB/s eta 0:00:01[K     |███████████████████             | 81 kB 6.2 MB/s eta 0:00:01[K     |█████████████████████▌          | 92 kB 5.8 MB/s eta 0:00:01[K     |███████████████████████▉        | 102 kB 6.4 MB/s eta 0:00:01[K     |██████████████████████████▎     | 112 kB 6.4 MB/s eta 0:00:01[K     |████████████████████████████▋   | 122 kB 6.4 MB/s eta 0:00:01[K     |███████████████████████████████ | 133 kB 6.4 MB/s eta 0:00:01[K     |████████████████████████████████| 137 kB 6.4 MB/s 
[?25h  Building 

In [None]:
import yake
# Print the installed YAKE version so the results below can be tied to it.
print(yake.__version__)
# Extractor created with no arguments, i.e. the library's default settings.
kw_extractor = yake.KeywordExtractor()


0.4.8


In [None]:

# YAKE settings: document language, longest n-gram considered, de-duplication
# limit passed as `dedupLim`, and how many keywords to return.
language = "en"
max_ngram_size = 4
deduplication_threshold = 0.3
numOfKeywords = 30

custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    top=numOfKeywords,
)

# The document is lower-cased before extraction; each result is a
# (phrase, score) tuple, printed one per line in the order returned.
# NOTE(review): YAKE documents lower scores as more relevant — confirm.
keywords = custom_kw_extractor.extract_keywords(doc.lower())
for kw in keywords:
    print(kw)

('bias', 0.023339925765478872)
('variance', 0.02833245158214071)
('bias machine learning algorithms', 0.03301993784210874)
('model', 0.03838264592744803)
('high', 0.04335052204665673)
('low bias and high', 0.05835887956064701)
('error introduced', 0.05842037996741017)
('low', 0.06924869858798068)
('algorithm', 0.08404608142795815)
('regression', 0.08465486479638626)
('due', 0.10483435044944522)
('due to lower bias', 0.12138678247559685)
('data properly', 0.17820215949359544)
('trade-off', 0.1819920137101941)
('lead', 0.22456886218227773)
('set', 0.22692258387588043)
('oversimplification', 0.27310770180668203)
('complex', 0.2820663449772991)
('high sensitivity', 0.2941031429834685)
('changed', 0.31535211765289606)
('increases the number', 0.3259845658115017)
('suffering from high variance', 0.4077320844299825)
('k-nn and svm', 0.45532921020819445)
('over-fitting your model', 0.5026159506314043)
('decision trees', 0.5071480064680455)
('vector machine', 0.5624896780896452)
('performs', 0.

# Keyword extraction using RAKE (rake-nltk)

In [None]:
!pip install rake-nltk --quiet


[K     |████████████████████████████████| 1.5 MB 5.3 MB/s 
[K     |████████████████████████████████| 749 kB 43.9 MB/s 
[?25h

In [None]:
# List installed packages whose `pip freeze` line contains "rake", with their pinned versions.
!pip freeze | grep rake

rake-nltk==1.0.6


In [None]:
# `string` is imported here because this cell uses `string.punctuation`; the
# notebook's other `import string` lives in a later cell, so a fresh
# Restart & Run All would otherwise fail with a NameError at this point.
import string

from rake_nltk import Rake

# Phrases of 1 to 10 words, with repeated phrases excluded from the ranking.
rake_nltk_var = Rake(
    punctuations=string.punctuation,
    min_length=1,
    max_length=10,
    include_repeated_phrases=False,
)
rake_nltk_var.extract_keywords_from_text(doc)

# Phrases only (no scores), in the ranked order returned by rake-nltk.
keywords = rake_nltk_var.get_ranked_phrases()
print(keywords)

['svm high bias machine learning algorithms — linear regression', 'low bias machine learning algorithms — decision trees', 'possible high variance – polynomial regression normally', 'supervised machine learning algorithm', 'support vector machine algorithm', 'achieve good prediction performance', 'machine learning', 'use another regression', 'logistic regression variance', 'linear regression', 'use fewer attributes', 'nearest neighbor algorithm', 'high bias', 'data properly ).', 'low bias', 'high variance', 'low variance', 'decision tree', 'high sensitivity', 'lower bias', 'decrease bias', 'algorithm used', 'complex algorithm', 'variance trade', 'training set', 'training data', 'test set', 'start suffering', 'particular point', 'margin allowed', 'error introduced', 'error due', 'c parameter', 'better fits', 'bias', 'turn increases', 'model due', 'variance', 'prediction', 'data', 'tree', 'trade', 'model', 'increases', 'decrease', 'complex', 'well', 'violations', 'value', 'see', 'relatio

[]


# Keyword extraction with Gensim

In [None]:
import gensim
# NOTE: this binds the name `keywords` to a function, while other cells in this
# notebook assign `keywords` as a result variable; the two uses overwrite each
# other depending on execution order.
# NOTE(review): `gensim.summarization` is only available in gensim 3.x
# (3.6.0 was used here); it was removed in gensim 4.0 — confirm before upgrading.
from gensim.summarization import keywords
print(gensim.__version__)

3.6.0


In [None]:
# Import the function under an alias inside this cell: other cells rebind the
# plain name `keywords` to result collections, which would make a bare
# `keywords(...)` call fail with "object is not callable" on re-run.
from gensim.summarization import keywords as gensim_keywords

# `pos_filter=None` disables part-of-speech filtering of candidate words.
print(gensim_keywords(doc, pos_filter=None))

algorithm
algorithms
variance
bias
machine
high
regression
fewer
good prediction
data


# KeyBERT

In [None]:
!pip install keybert --quiet


In [None]:
import keybert
from keybert import KeyBERT

# Print the installed KeyBERT version so the results below can be tied to it.
print(keybert.__version__)

# Model created with no arguments, i.e. whatever defaults the library ships.
kw_model = KeyBERT()

0.5.0


In [None]:
# Extract the top 20 keyphrases of 2 to 5 words, with English stop words
# removed and MMR re-ranking at diversity 0.7.
# NOTE(review): per the KeyBERT docs `nr_candidates` is only consulted when
# `use_maxsum=True`, so it likely has no effect here — confirm.
keywords = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(2, 5),
    stop_words='english',
    nr_candidates=20,
    top_n=20,
    use_mmr=True,
    diversity=0.7,
)
print(keywords)

[('machine learning algorithm low bias', 0.6407), ('variance increasing variance', 0.2238), ('point continue make model complex', 0.057), ('fits data escaping relationship', 0.141), ('number neighbors', -0.0056), ('used does', 0.0448), ('poorly test set lead high', 0.1691), ('polynomial regression normally increase complexity', 0.4003), ('performs training', 0.1248), ('decrease depth tree use', 0.1222), ('regression logistic regression', 0.2172), ('margin allowed', 0.0295), ('fewer attributes', 0.1551), ('error lower', 0.1342), ('high sensitivity overfitting possible', 0.3773), ('parameter influences', 0.1458), ('start suffering', -0.0935), ('relationship bias', 0.3579), ('support vector', 0.1497), ('particular point', 0.0989)]


# Matching extracted keywords against the tokens of a paragraph

In [None]:
import string

import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import MWETokenizer

# Fetch the 'punkt' resource (presumably required by the NLTK tokenizers used below).
nltk.download('punkt')


In [None]:

# `keywords` may hold plain strings (entity / RAKE cells) or (phrase, score)
# pairs (YAKE / KeyBERT cells). Reduce either form to lowercase phrases with
# collapsed whitespace, so the tokenizer and the membership test below work on
# exactly the same strings (some extracted phrases contain embedded newlines).
keyword_phrases = {
    ' '.join(str(kw[0] if isinstance(kw, (tuple, list)) else kw).lower().split())
    for kw in keywords
}

# Merge every multi-word keyword into one underscore-joined token.
mwe = MWETokenizer([phrase.split() for phrase in keyword_phrases], separator='_')

# Clean out the punctuation in the text, including the em and en dashes it uses.
puncts = list(string.punctuation + "—–")
cleaned_paragraph = ' '.join(
    tok for tok in word_tokenize(doc.lower()) if tok not in puncts
)

# Keep only the tokens (single words or merged phrases) that are keywords,
# in the order they occur in the paragraph.
tokenized_keyword_paragraph = [
    token
    for token in mwe.tokenize(word_tokenize(cleaned_paragraph))
    if token.replace('_', ' ') in keyword_phrases
]
