In [14]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import joblib
import csv
import pickle
import PySimpleGUI as sg
import warnings
warnings.simplefilter('ignore')

In [15]:
# Load the paper metadata and replace every missing value with the literal
# string "None" so later string operations never encounter NaN.
df = pd.read_csv('Data_Big.csv').pipe(
    lambda frame: frame.where(frame.notna(), "None")
)
# Preview the two free-text columns.
df[["Paper", "Description"]]

In [16]:
# Shared English stemmer used by tokenize_and_stem().
stemmer = SnowballStemmer(language="english")
# NLTK's English stop-word list (requires the NLTK "stopwords" corpus to be downloaded).
stopwords = nltk.corpus.stopwords.words('english')

In [17]:
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stem of every word-like token.

    The text is split into sentences and then into word tokens with NLTK.
    Tokens that contain no ASCII letter (pure numbers, punctuation) are
    discarded; the remaining tokens are stemmed with the module-level
    ``stemmer``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of stems, in the order the tokens appear in ``text``.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # Keep only tokens with at least one ASCII letter.
            if re.search('[a-zA-Z]', word) is not None:
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Tokenize ``text`` into lower-cased word-like tokens without stemming.

    The text is split into sentences and then into word tokens with NLTK,
    each token is lower-cased, and tokens containing no ASCII letter are
    dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased tokens, in the order they appear in ``text``.
    """
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [word for word in lowered if re.search('[a-zA-Z]', word)]

In [18]:
# Accumulate, over every document, the stemmed tokens and the plain
# lower-cased tokens. The two lists are later paired up (stem -> word) to
# translate stems back into readable words.
# NOTE(review): this reads the column `Author`, but the frames displayed in
# this notebook show a column named `Authors` — confirm which column the
# vocabulary is meant to be built from.
totalvocab_stemmed = []
totalvocab_tokenized = []
for document in df.Author:
    totalvocab_stemmed += tokenize_and_stem(document)
    totalvocab_tokenized += tokenize_only(document)

In [19]:
# Lookup table indexed by stem, holding the un-stemmed token in column 'words'.
vocab_frame = pd.DataFrame(
    totalvocab_tokenized,
    index=totalvocab_stemmed,
    columns=['words'],
)

In [20]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.25, max_features=200000,
                                 min_df=0.05, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,5))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(df.Author)

print(tfidf_matrix.shape)

CPU times: user 181 ms, sys: 132 µs, total: 181 ms
Wall time: 179 ms
(258, 20)


In [21]:
# Vocabulary of the fitted vectorizer, positionally aligned with the columns
# of tfidf_matrix (and therefore with the k-means centroid dimensions).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    # Convert the returned array to a list so `terms` keeps its original type.
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = tfidf_vectorizer.get_feature_names()

In [605]:
num_clusters = 3

km = KMeans(n_clusters=num_clusters, random_state = 3)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

Wall time: 4.88 s


In [611]:
# Persist the fitted k-means model so it can be reloaded without refitting.
joblib.dump(value=km, filename='cluster_model.pkl')

['doc_cluster.pkl']

In [464]:
# Reload the persisted k-means model and recover its per-document labels.
# NOTE: joblib files are pickles — only load files you produced yourself.
km = joblib.load(filename='cluster_model.pkl')
clusters = km.labels_.tolist()

In [606]:
# NOTE: plain assignment binds a second name to the SAME DataFrame object —
# this is not a copy, so the column added below also appears on `df`.
df2 = df
# Attach each row's k-means label; `clusters` must have one entry per row.
df2["clusters"] = clusters
df2

Unnamed: 0,Authors,Paper,Publish Date,Description,clusters
0,"Timothy James Mason, J Phillip Lorimer",Sonochemistry,1988,,0
1,"Timothy J Mason, J Phillip Lorimer",Applied sonochemistry: the uses of power ultra...,06/05/2002,Power ultrasound has been used for many years ...,2
2,"Timothy J Mason, Larysa Paniwnyk, JP Lorimer",The uses of ultrasound in food technology,01/11/1996,The same physical and mechanical effects which...,2
3,"Timothy J Mason, Dietmar Peters",Practical sonochemistry,1991,This updated version of Practical Sonochemistr...,2
4,Timothy J Mason,Ultrasound in synthetic organic chemistry,01/01/1997,High-power ultrasound can generate cavitation ...,2
...,...,...,...,...,...
7763,"C Sabourin, J Merelo, K Madani, K Warwick",[BOOK] Computational Intelligence,2019,,0
7764,"C Sabourin, J Merelo, K Madani, K Warwick",computational Intelligence,2019,,0
7765,"Huma Shah, Kevin Warwick",Trust and Decision Making in Turing's Imitatio...,2019,Trust is an expected certainty in order to tra...,2
7766,Kevin Warwick,28 Smart Machines ARE,43308,Over the years numerous people have expressed ...,2


In [607]:
# Number of documents assigned to each cluster, largest first.
df2.clusters.value_counts()

2    4681
0    2461
1     626
Name: clusters, dtype: int64

In [608]:
# Same frame, re-indexed by cluster id.
df3 = df2.set_index("clusters")

print("Top terms per cluster:")
# For each centroid, term indices ordered from highest to lowest weight
# (ascending argsort with the column order reversed).
order_centroids = np.argsort(km.cluster_centers_)[:, ::-1]

for cluster_id in range(num_clusters):
    print("Cluster %d words:" % cluster_id, end='')

    # Show the eight heaviest terms of this centroid.
    for term_idx in order_centroids[cluster_id, :8]:
        # A term may be an n-gram; look up its stems in the stem -> word table
        # and report the first matching un-stemmed word.
        stems = terms[term_idx].split(' ')
        first_word = vocab_frame.loc[stems].values.tolist()[0][0]
        print(' %s' % first_word.encode('utf-8', 'ignore'), end=',')
    print()

Top terms per cluster:

Cluster 0 words: b'research', b'provides', b'available', b'make', b'presents', b'subject', b'diseased', b's',

Cluster 1 words: b'patients', b'cases', b'grouped', b'years', b'aged', b'diseased', b'presents', b'report',

Cluster 2 words: b'effects', b'results', b'different', b'control', b'development', b'methods', b'model', b'processing',





In [609]:
# Human-readable name for each k-means cluster id.
# NOTE(review): the output saved with this notebook shows the labels
# "Engineering" / "Unclassified" rather than 'Politics' / 'Sports' — confirm
# which label set is intended. Ids missing from this mapping become NaN.
cluster_labels = {1: "Health", 2: 'Politics', 0: 'Sports'}

# Work on a copy: `df4 = df2` would only alias df2 (and df), so the mapping
# would overwrite the integer ids in place and re-running this cell would
# turn every label into NaN.
df4 = df2.copy()
df4['clusters'] = df2['clusters'].map(cluster_labels)

df4

Unnamed: 0,Authors,Paper,Publish Date,Description,clusters
0,"Timothy James Mason, J Phillip Lorimer",Sonochemistry,1988,,Unclassified
1,"Timothy J Mason, J Phillip Lorimer",Applied sonochemistry: the uses of power ultra...,06/05/2002,Power ultrasound has been used for many years ...,Engineering
2,"Timothy J Mason, Larysa Paniwnyk, JP Lorimer",The uses of ultrasound in food technology,01/11/1996,The same physical and mechanical effects which...,Engineering
3,"Timothy J Mason, Dietmar Peters",Practical sonochemistry,1991,This updated version of Practical Sonochemistr...,Engineering
4,Timothy J Mason,Ultrasound in synthetic organic chemistry,01/01/1997,High-power ultrasound can generate cavitation ...,Engineering
...,...,...,...,...,...
7763,"C Sabourin, J Merelo, K Madani, K Warwick",[BOOK] Computational Intelligence,2019,,Unclassified
7764,"C Sabourin, J Merelo, K Madani, K Warwick",computational Intelligence,2019,,Unclassified
7765,"Huma Shah, Kevin Warwick",Trust and Decision Making in Turing's Imitatio...,2019,Trust is an expected certainty in order to tra...,Engineering
7766,Kevin Warwick,28 Smart Machines ARE,43308,Over the years numerous people have expressed ...,Engineering


In [610]:
def _column_contains(frame, column, needle):
    """Return a boolean Series marking rows whose ``column`` contains ``needle``.

    The comparison is case-insensitive on the column side (``needle`` is
    expected to be lower-cased already) and treats ``needle`` as literal text.
    """
    # astype(str) lets non-string columns (e.g. the integer "Unnamed: 0") and
    # missing values be searched without raising; regex=False keeps input such
    # as "[book]" or "c++" from being interpreted as a regular expression.
    return frame[column].astype(str).str.lower().str.contains(needle, regex=False)


def search_papers(frame, query):
    """Filter ``frame`` according to a search ``query``.

    If the query contains one of the field prefixes ("author:", "title:",
    "date:", "desc:", "class:") the remainder of the query is matched against
    the corresponding column only. Otherwise the query is split on whitespace
    and every row that contains any of the words in any column is returned.

    Args:
        frame: DataFrame with the columns "Authors", "Paper", "Publish Date",
            "Description" and "clusters".
        query: Raw text typed by the user.

    Returns:
        The matching rows of ``frame``. An empty query returns every row.
    """
    # Prefix -> column searched; insertion order sets precedence when a query
    # happens to contain more than one prefix.
    field_columns = {
        "author:": "Authors",
        "title:": "Paper",
        "date:": "Publish Date",
        "desc:": "Description",
        "class:": "clusters",
    }
    lowered = query.lower()

    for prefix, column in field_columns.items():
        if prefix in lowered:
            needle = lowered.replace(prefix, "").strip()
            return frame[_column_contains(frame, column, needle)]

    # No prefix: search every column for every word. split() with no argument
    # drops the empty tokens that repeated spaces would otherwise create
    # (an empty token matches every row).
    words = lowered.split()
    if not words:
        return frame.sort_index()

    # OR all the per-column matches into one mask instead of appending frames
    # in a loop (DataFrame.append was removed in pandas 2.0).
    matches = np.zeros(len(frame), dtype=bool)
    for word in words:
        for column in frame.columns:
            matches |= _column_contains(frame, column, word).to_numpy()
    return frame[matches].sort_index()


print('''Narrow your search using the following syntax, or use none to search all fields:
Search Authors - Author: Mr A
Search Titles - Title: How do Search Engines Work?
Search Publication Dates - Date: 2021
Search Descriptions - Desc: A paper detailing...
Search Class - Class: Health
''')

INP  = input("Enter Search Term: ")

display(search_papers(df4, INP))

Narrow your search using the following syntax, or use none to search all fields:
Search Authors - Author: Mr A
Search Titles - Title: How do Search Engines Work?
Search Publication Dates - Date: 2021
Search Descriptions - Desc: A paper detailing...
Search Class - Class: Health

Enter Search Term: Class: Health


Unnamed: 0,Authors,Paper,Publish Date,Description,clusters
508,"Theoklis E Zaoutis, Monika Goyal, Jaclyn H Chu...",Risk factors for and outcomes of bloodstream i...,01/04/2005,Objective. The increasing prevalence of infect...,Health
513,"Lewis L Judd, Pamela J Schettler, E Sherwood B...",Adverse consequences of glucocorticoid medicat...,2014/10,Glucocorticoids are the most commonly prescrib...,Health
514,"Namrata Sharma, Praful Maharana, Gurnarinder S...",Pseudomonas keratitis after collagen crosslink...,01/03/2010,A 19-year-old woman presented with a 3-day his...,Health
521,"Santanu Goswami, Surendra K Mattoo, Debasish B...",Substance-abusing schizophrenics: do they self...,01/01/2004,In spite of having been formulated nearly two ...,Health
546,"Gagandeep Singh, Jeremy H Rees, Josemir W Sander",Seizures and epilepsy in oncological practice:...,01/04/2007,There are few data available on the causes and...,Health
...,...,...,...,...,...
7255,"Carmen Camara, Kevin Warwick, Ricardo Bruña, T...",A fuzzy inference system for closed-loop deep ...,2015/11,Parkinsons disease is a complex neurodegenera...,Health
7264,"Eduard Bakstein, Jonathan Burgess, Kevin Warwi...",Parkinsonian tremor identification with multip...,41136,This paper explores the development of multi-f...,Health
7355,"Eduard Bakstein, Kevin Warwick, Jonathan Burge...",Features for detection of parkinson's disease ...,40422,Deep Brain Stimulation (DBS) is a treatment ro...,Health
7411,"James Geddes, Kevin Warwick",Cloud based global positioning system as a saf...,40422,The aim of using GPS for Alzheimer's Patients ...,Health
