In [None]:
!pip install sumy

In [5]:
import pandas as pd
import numpy as np
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

# Pre-processing

In [6]:
# Load the raw CORD-19 CSV into `dataset` and print its column / non-null / dtype summary.
dataset = pd.read_csv('/content/drive/MyDrive/ir_datasets/cord19.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192509 entries, 0 to 192508
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  192509 non-null  int64 
 1   docno       192509 non-null  object
 2   title       192459 non-null  object
 3   abstract    137644 non-null  object
dtypes: int64(1), object(3)
memory usage: 5.9+ MB


In [7]:
# Discard the unnamed index column carried over from the CSV, then re-inspect the schema.
dataset = dataset.drop(columns=['Unnamed: 0'])
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192509 entries, 0 to 192508
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   docno     192509 non-null  object
 1   title     192459 non-null  object
 2   abstract  137644 non-null  object
dtypes: object(3)
memory usage: 4.4+ MB


In [8]:
# Keep only the first occurrence of each fully identical row, then re-inspect.
dataset = dataset.drop_duplicates()
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192100 entries, 0 to 192508
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   docno     192100 non-null  object
 1   title     192050 non-null  object
 2   abstract  137485 non-null  object
dtypes: object(3)
memory usage: 5.9+ MB


In [9]:
# Remove every row that has a missing value in any column, then re-inspect.
dataset = dataset.dropna()
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137479 entries, 0 to 192506
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   docno     137479 non-null  object
 1   title     137479 non-null  object
 2   abstract  137479 non-null  object
dtypes: object(3)
memory usage: 4.2+ MB


In [None]:
# Persist the cleaned table.
# NOTE(review): the DataFrame index is written as well (no index=False), so
# reading this file back produces an extra 'Unnamed: 0' column.
dataset.to_csv('/content/drive/MyDrive/ir_datasets/cord19_final/cord19.csv')

# Summary

In [None]:
# One-time setup: download the NLTK 'stopwords' and 'punkt' resources.
# NOTE(review): presumably required by the tokenizers used later in this
# notebook (sumy's Tokenizer, NLTK's sent_tokenize/word_tokenize) — confirm.
import nltk
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# Reload the cleaned table written at the end of the pre-processing section
# and print its schema.
dataset = pd.read_csv('/content/drive/MyDrive/ir_datasets/cord19_final/cord19.csv')
dataset.info()

In [14]:
def get_summary(text, summarizer_lsa, sentences_count=2):
    """Summarise ``text`` into a few sentences joined by single spaces.

    Args:
        text: Plain-text document to summarise.
        summarizer_lsa: Callable invoked as ``summarizer_lsa(document, count)``
            that returns an iterable of the selected sentences (this notebook
            passes an ``LsaSummarizer`` instance).
        sentences_count: Number of sentences requested from the summarizer.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        The selected sentences, each converted with ``str`` and joined by a
        single space. Returns ``''`` when ``text`` is not a string or is
        blank.
    """
    # Cells read back from CSV can be float NaN instead of str; the tokenizer
    # cannot handle those, so treat them (and blank text) as "nothing to summarise".
    if not isinstance(text, str) or not text.strip():
        return ''

    # Build a sumy document from the raw string using the English tokenizer.
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    selected = summarizer_lsa(parser.document, sentences_count)
    return ' '.join(str(sentence) for sentence in selected)

In [None]:
# Summarise every abstract with one shared LSA summarizer, store the result in
# the 'summary' column, and persist the augmented table.
summarizer_lsa = LsaSummarizer()
dataset['summary'] = dataset['abstract'].apply(
    lambda abstract: get_summary(abstract, summarizer_lsa)
)
dataset.to_csv('/content/drive/MyDrive/ir_datasets/cord19_final/cord19_sum.csv')

In [None]:
# Print the schema again to check the columns present after summarisation.
dataset.info()

In [None]:
# Remove leftover index columns created by CSV round-trips. Which of them exist
# depends on how many times the table was saved and reloaded, so
# errors='ignore' skips any listed column that is absent instead of raising
# KeyError.
dataset.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True, errors='ignore')

In [None]:
# Remove fully duplicated rows from the summarised table and re-inspect it.
dataset = dataset.drop_duplicates()
dataset.info()

In [None]:
# Drop rows with any missing value, then write the summarised table to disk.
dataset = dataset.dropna()
dataset.to_csv('/content/drive/MyDrive/ir_datasets/cord19_final/cord19_sumy.csv')

# Keyphrase Extraction

In [None]:
# Reload the summarised table written at the end of the Summary section and
# print its schema.
dataset = pd.read_csv('/content/drive/MyDrive/ir_datasets/cord19_final/cord19_sumy.csv')
dataset.info()

In [18]:
def getKeys(extractor, text, n=5):
    """Return the top-ranked keyphrases of ``text`` as a list of strings.

    Args:
        extractor: Keyphrase extractor exposing ``load_document``,
            ``candidate_selection``, ``candidate_weighting`` and
            ``get_n_best`` (this notebook passes a pke ``TopicRank`` instance).
        text: Document to extract keyphrases from.
        n: Maximum number of keyphrases to return. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        The candidate strings of the ``n`` best keyphrases, best first, without
        their scores. Returns ``[]`` when ``text`` is not a string or is blank.
    """
    # Cells read back from CSV can be float NaN instead of str; there is
    # nothing to extract from those or from blank text.
    if not isinstance(text, str) or not text.strip():
        return []

    extractor.load_document(input=text, language='en')

    # Identify keyphrase candidates using the extractor's default strategy
    # (for TopicRank: the longest sequences of nouns and adjectives).
    extractor.candidate_selection()

    # Score the candidates. In TopicRank this clusters candidates into topics,
    # builds a complete weighted graph of topics, and ranks the topics with a
    # random-walk algorithm.
    extractor.candidate_weighting()

    # get_n_best yields (candidate, score) pairs; only the candidate is kept.
    keyphrases = extractor.get_n_best(n=n, stemming=False)
    return [candidate for candidate, _score in keyphrases]

In [None]:
!pip install git+https://github.com/boudinfl/pke.git

In [20]:
import pke

In [None]:
# Run one shared TopicRank extractor over every abstract, store each keyphrase
# list in the 'KeyList' column, and persist the augmented table.
extractor = pke.unsupervised.TopicRank()
dataset['KeyList'] = dataset['abstract'].apply(
    lambda abstract: getKeys(extractor, abstract)
)
dataset.to_csv('/content/drive/MyDrive/ir_datasets/cord19_final/cord19_key.csv')

# Context Extraction

In [None]:
import pandas as pd
import re
# Load the keyphrase table used for context extraction and print its schema.
# NOTE(review): this reads 'cord19_key_clean.csv', whereas the keyphrase cell in
# this notebook writes 'cord19_key.csv'; the cleaning step that produces the
# '_clean' file is not visible here — confirm where it comes from.
dataset = pd.read_csv('/content/drive/MyDrive/ir_datasets/cord19_final/cord19_key_clean.csv')
dataset.info()

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm

In [None]:
def extract_context(text, keyword, window=5):
    """Collect the words surrounding ``keyword`` in each sentence of ``text``.

    The keyword may be a single word or a multi-word phrase. It is tokenised
    with the same word tokenizer as the sentences and matched
    case-insensitively against consecutive tokens, so a lowercased phrase such
    as ``'acute respiratory syndrome'`` still matches
    ``'Acute Respiratory Syndrome'``.

    Args:
        text: Document to search.
        keyword: Word or phrase to locate.
        window: Number of tokens to keep on each side of the match. Defaults
            to 5, the value that used to be hard-coded.

    Returns:
        For every sentence containing the keyword, the tokens from ``window``
        before the first match to ``window`` after it (clipped to the sentence
        bounds), joined by spaces; the per-sentence snippets are joined by a
        single space. Returns ``''`` when nothing matches or when ``text`` or
        ``keyword`` is not a string.
    """
    # Cells read back from CSV can be float NaN instead of str.
    if not isinstance(text, str) or not isinstance(keyword, str):
        return ''

    key_tokens = [token.lower() for token in word_tokenize(keyword)]
    if not key_tokens:
        return ''
    span = len(key_tokens)

    context = []
    for sentence in sent_tokenize(text):
        words = word_tokenize(sentence)
        lowered = [word.lower() for word in words]
        # Slide over the sentence looking for the keyword's token sequence.
        for start in range(len(words) - span + 1):
            if lowered[start:start + span] == key_tokens:
                first = max(0, start - window)
                last = min(len(words), start + span + window)
                context.append(' '.join(words[first:last]))
                # Only the first occurrence per sentence is reported.
                break
    return ' '.join(context)

In [None]:
import ast
# Parse each row's stringified keyphrase list and store one context snippet per
# keyphrase in numbered columns (context_key_1, context_key_2, ...).
for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
    phrases = ast.literal_eval(row['KeyList'])
    for position, phrase in enumerate(phrases, start=1):
        column = 'context_key_{}'.format(position)
        dataset.at[index, column] = extract_context(row['abstract'], phrase)

In [None]:
# Remove rows whose stored keyphrase list is the literal string '[]'.
dataset = dataset.drop(index=dataset.index[dataset['KeyList'] == '[]'])

In [None]:
# Spot-check the row(s) for one specific document id.
dataset.loc[dataset['docno'] == 'vw8xjo9t']

In [None]:
# Discard the unnamed index column that came in with the loaded CSV.
dataset = dataset.drop(columns=['Unnamed: 0'])

In [None]:
# Replace every missing value (e.g. unused context_key_* cells) with an empty string.
dataset = dataset.fillna('')

In [None]:
# Write the final table with context columns; index=False keeps the DataFrame
# index out of the file, so no extra unnamed column appears on reload.
dataset.to_csv('/content/drive/MyDrive/ir_datasets/cord19_final/cord19_context.csv', index=False)

In [None]:
# Show the abstract of the first remaining row. Positional access is used
# because rows have been dropped without resetting the index, so the label 0
# is not guaranteed to exist any more.
dataset['abstract'].iloc[0]