In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing

df = pd.read_csv('papers.csv')


In [2]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer



stop_words = set(stopwords.words('english'))
##Creating a list of custom stopwords
new_words = ["fig","figure","image","sample","using",
             "show", "result", "large",
             "also", "one", "two", "three",
             "four", "five", "seven","eight","nine"]
stop_words = list(stop_words.union(new_words))


def pre_process(text):
    # lowercase
    text = text.lower()

    #remove tags
    text = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", text)

    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)

    ##Convert to list from string
    text = text.split()

    # remove stopwords
    text = [word for word in text if word not in stop_words]

    # remove words less than three letters
    text = [word for word in text if len(word) >= 3]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    text = [lmtzr.lemmatize(word) for word in text]

    return ' '.join(text)


docs = df['paper_text'].apply(lambda x: pre_process(x))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
#docs = docs.tolist()
#create a vocabulary of words,
cv=CountVectorizer(max_df=0.95,         # ignore words that appear in 95% of documents
                   max_features=10000,  # the size of the vocabulary
                   ngram_range=(1,3)    # vocabulary contains single words, bigrams, trigrams
                  )
word_count_vector=cv.fit_transform(docs)


In [4]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [5]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""

    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]

        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        # feature_vals.append(feature_names[idx])
        feature_vals.append(fname)

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]

    return results

# get feature names
feature_names=cv.get_feature_names()

def get_keywords(idx, statement):

    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([statement[idx]]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)

    return keywords

def print_results(idx,keywords, df):
    # now print the results
    print("\n=====Title=====")
    print(df['title'][idx])
    print("\n=====Abstract=====")
    print(df['abstract'][idx])
    print("\n===Keywords===")
    for k in keywords:
        # print(k,keywords[k])
        print(k)
idx=941
keywords=get_keywords(idx, docs)
print_results(idx,keywords, df)


=====Title=====
Algorithms for Non-negative Matrix Factorization

=====Abstract=====
Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 

===Keywords===
update rule 0.344
update 0.285
auxiliary 0.212
non negative matrix 0.21
negative matrix 0.209
rule 0.192
nmf 0.183
multiplicative 0.175



In [6]:
import pickle
# pickle_out = open("predict.pkl","wb")
# pickle.dump(get_keywords, pickle_out)
# pickle_out.close()
keywords_prediction = get_keywords(941,"")

In [94]:
document = { "title":["Mine home"], "abstract":["Temi gjfhgdsghs"],"paper_text":["What is a Design Pattern? Design patterns are design level solutions for recurring problems that we software engineers come across often. It’s not code - I repeat, ❌CODE. It is like a description on how to tackle these problems and design a solution.Using these patterns is considered good practice, as the design of the solution is quite tried and tested, resulting in higher readability of the final code. Design patterns are quite often created for and used by OOP Languages, like Java, in which most of the examples from here on will be written.Types of design patternsThere are about 26 Patterns currently discovered (I hardly think I will do them all…).These 26 can be classified into 3 types:1. Creational: These patterns are designed for class instantiation. They can be either class-creation patterns or object-creational patterns.2. Structural: These patterns are designed with regard to a class's structure and composition. The main goal of most of these patterns is to increase the functionality of the class(es) involved, without changing much of its composition.3. Behavioral: These patterns are designed depending on how one class communicates with others.In this post, we will go through one basic design pattern for each classified type."]}
test = document['paper_text']
# pre_process(document[0])

# get_keywords(0,test)
print_results(0,get_keywords(0,test),document)


=====Title=====
Mine home

=====Abstract=====
Temi gjfhgdsghs

===Keywords===
design 0.549
composition 0.257
class 0.244
designed 0.234
code 0.221
classified 0.218
es 0.185
creation 0.159
quite 0.144
one 0.143


In [101]:


pickle.dump(document,open('generate.pkl','wb'))

text = document['paper_text']
get_keywords(0,text)

{'design': 0.549,
 'composition': 0.257,
 'class': 0.244,
 'designed': 0.234,
 'code': 0.221,
 'classified': 0.218,
 'es': 0.185,
 'creation': 0.159,
 'quite': 0.144,
 'one': 0.143}

In [96]:
document = pickle.load(open('generate.pkl', 'rb'))

In [97]:
import pickle
with open('testing.pkl','wb')as f:
    pickle.dump(testing,f)