In [207]:
import csv
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import contractions
import string
import collections
import operator

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
xml_file = ET.parse('mendeley_document_library_2020-03-25.xml')

In [121]:
#function to remove the HTML tag with abstract
import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [163]:
# Extracting the necessary fields from the xml file to a data frame

columns=['abstract', 'ref-type', 'title', 'secondary_title', 'full_title', 'year', 'label', 'keyword']
df = pd.DataFrame(columns=columns)
record_dict = dict()
for record in records[0].findall('record'):
    try:
        record_dict['abstract'] = remove_tags(record.findall('./abstract')[0].text)
    except:
        record_dict['abstract'] = None
    try:
        record_dict['ref-type'] = record.findall('./ref-type')[0].attrib.get('name')
    except:
        record_dict['ref-type'] = None
    try:
        record_dict['title'] = record.findall('./titles')[0].find('title').text
    except:
        record_dict['title'] = None
    try:
        record_dict['secondary_title'] = record.findall('./titles')[0].find('./secondary-title').text
    except:
        record_dict['secondary_title'] = None
    try:
        record_dict['full_title'] = record.findall('./periodical')[0].find('./full-title').text
    except:
        record_dict['full_title'] = None
    try:
        record_dict['year'] = record.findall('./dates')[0].find('./year').text
    except:
        record_dict['year'] = None
    try:
        record_dict['label'] = record.findall('label')[0].text.replace(";", " ")
    except:
        record_dict['label'] = None
    kywrd = ''
    for keyword in record.find('./keywords'):
        kywrd= kywrd + " "+keyword.text
    record_dict['keyword'] = kywrd
    #print(record_dict)
    df = df.append(record_dict, ignore_index=True)
    #print(df)

In [164]:
# checking the data frame
print(df.shape)

(1061, 8)


In [165]:
df.head()

Unnamed: 0,abstract,ref-type,title,secondary_title,full_title,year,label,keyword
0,,Journal Article,Calling all coronavirus researchers: keep shar...,Nature,Nature,2020,coronavirus reviewed,Genomics Health care Infection Virology
1,Background : The current novel coronavirus ou...,Journal Article,The transmissibility of novel Coronavirus in t...,Wellcome Open Research,Wellcome Open Research,2020,coronavirus reviewed,coronavirus modelling outbreak transmission w...
2,The outbreak of the coronavirus disease 2019 (...,Journal Article,Development of CRISPR as a prophylactic strate...,bioRxiv,bioRxiv,2020,biorxiv coronavirus not added reviewed,
3,"Background: on the late December 2019, a new e...",Journal Article,Design of multi epitope-based peptide vaccine ...,bioRxiv,bioRxiv,2020,biorxiv coronavirus reviewed,
4,,Journal Article,A Literature Review of 2019 Novel Coronavirus ...,,,2020,coronavirus reviewed,2019-nCoV causes epidemiology prevention and ...


In [166]:
# data frame information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1061 entries, 0 to 1060
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   abstract         851 non-null    object
 1   ref-type         1061 non-null   object
 2   title            1061 non-null   object
 3   secondary_title  1013 non-null   object
 4   full_title       1013 non-null   object
 5   year             1042 non-null   object
 6   label            1049 non-null   object
 7   keyword          1061 non-null   object
dtypes: object(8)
memory usage: 66.4+ KB


In [244]:
# Since dealing with documents with little informations is tricky, I'm
# leaving out documents with null abstract
df_nabs = df[df.abstract.notnull()]

In [226]:
df_nabs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 851 entries, 1 to 1060
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   abstract         851 non-null    object
 1   ref-type         851 non-null    object
 2   title            851 non-null    object
 3   secondary_title  819 non-null    object
 4   full_title       819 non-null    object
 5   year             843 non-null    object
 6   label            839 non-null    object
 7   keyword          851 non-null    object
dtypes: object(8)
memory usage: 59.8+ KB


In [245]:
# Expanding compressed english word combo
df_nabs['abstract'] = df_nabs['abstract'].apply(lambda x: [contractions.fix(word) for word in x.split()])

# Combining the list of words back 
df_nabs['abstract']= df_nabs['abstract'].apply(lambda x: ' '.join(map(str, x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [246]:
# tokenize each abstract
tokenizer = RegexpTokenizer(r'\w+')
df_nabs['abstract'] = df_nabs['abstract'].apply(lambda x: tokenizer.tokenize(x.lower()))
df_nabs.head(20).abstract

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


1     [background, the, current, novel, coronavirus,...
2     [the, outbreak, of, the, coronavirus, disease,...
3     [background, on, the, late, december, 2019, a,...
5     [global, airline, networks, play, a, key, role...
6     [the, beginning, of, 2020, has, seen, the, eme...
7     [the, beginning, of, 2020, has, seen, the, eme...
9     [covid, 19, caused, by, a, novel, coronavirus,...
10    [summary, objective, to, describe, the, epidem...
11    [as, of, 8am, 30th, january, beijing, time, 20...
12    [the, outbreak, of, pneumonia, caused, by, a, ...
14    [in, december, 2019, a, novel, coronavirus, ca...
16    [background, recent, epidemic, of, novel, coro...
17    [40, days, after, the, start, of, the, interna...
18    [two, months, after, it, was, firstly, reporte...
19    [our, society, is, currently, experiencing, an...
20    [as, the, coronavirus, covid, 19, expands, its...
22    [on, 31, december, 2019, a, cluster, of, 27, p...
24    [since, the, first, suspected, case, of, n

In [249]:
# punctuation and stop words are removed from the abstracts
df_nabs['abstract'] = df_nabs['abstract'].apply(lambda x: [word for word in x if word not in string.punctuation])
stop_words = set(stopwords.words('english'))
df_nabs['abstract'] = df_nabs['abstract'].apply(lambda x: [word for word in x if word not in stop_words])
df_nabs['abstract'] = df_nabs['abstract'].apply(lambda x: [word for word in x if word not in ['coronavirus','outbreak', 'background','novel', 'covid', 'cov','time', 'disease']])
df_nabs['abstract'].head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


1     [current, appears, originated, point, source, ...
2     [2019, 19, caused, severe, acute, respiratory,...
3     [late, december, 2019, new, endemic, spread, a...
5     [global, airline, networks, play, key, role, g...
6     [beginning, 2020, seen, emergence, 19, caused,...
7     [beginning, 2020, seen, emergence, 19, caused,...
9     [19, caused, sars, 2, emerged, wuhan, hubei, p...
10    [summary, objective, describe, epidemiological...
11    [8am, 30th, january, beijing, 2020, approximat...
12    [pneumonia, caused, 2019, ncov, wuhan, city, c...
14    [december, 2019, called, 19, discovered, wuhan...
16    [recent, epidemic, sars, 2, triggered, rising,...
17    [40, days, start, international, monitoring, 1...
18    [two, months, firstly, reported, 19, already, ...
19    [society, currently, experiencing, unprecedent...
20    [19, expands, impact, china, expanding, catchm...
22    [31, december, 2019, cluster, 27, pneumonia, c...
24    [since, first, suspected, case, 2019, ncov

In [250]:
# lemmatizing the words 
lemmatizer = WordNetLemmatizer() 
df_nabs['abstract'] = df_nabs['abstract'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x]).apply(lambda x: ' '.join(map(str, x)))
df_nabs['abstract'].head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


1     current appears originated point source exposu...
2     2019 19 caused severe acute respiratory syndro...
3     late december 2019 new endemic spread across w...
5     global airline network play key role global im...
6     beginning 2020 seen emergence 19 caused severe...
7     beginning 2020 seen emergence 19 caused severe...
9     19 caused sars 2 emerged wuhan hubei province ...
10    summary objective describe epidemiological cli...
11    8am 30th january beijing 2020 approximate 8000...
12    pneumonia caused 2019 ncov wuhan city china ob...
14    december 2019 called 19 discovered wuhan china...
16    recent epidemic sars 2 triggered rising global...
17    40 day start international monitoring 19 searc...
18    two month firstly reported 19 already spread w...
19    society currently experiencing unprecedented c...
20    19 expands impact china expanding catchment su...
22    31 december 2019 cluster 27 pneumonia case unk...
24    since first suspected case 2019 ncov infec

In [251]:
# converting abstracts to matrix of TF-IDF features
vectorizer = TfidfVectorizer(max_df=0.5, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, ngram_range=(1,3))
X = vectorizer.fit_transform(df_nabs.abstract)

In [255]:
# classification using KMeans
number_of_clusters = 5
model = KMeans(n_clusters=number_of_clusters, init='k-means++', max_iter=100, n_init=1)

model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=5, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [256]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

In [257]:
#Printing out the clusters/ classifications
for i in range(number_of_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Cluster 0:
 number
 transmission
 health
 province
 science
 epidemic
 wuhan
 statement author
 2020
 competing funding
Cluster 1:
 ncov
 2019 ncov
 wuhan
 virus
 sars
 infection
 pneumonia
 transmission
 health
 respiratory
Cluster 2:
 sars
 virus
 respiratory
 infection
 severe
 result
 method
 analysis
 based
 infected
Cluster 3:
 severe
 pneumonia
 wuhan
 day
 result
 finding
 method
 confirmed
 2020
 science
Cluster 4:
 model
 epidemic
 number
 based
 infected
 2020
 infection
 spread
 rate
 virus
