# From abstract to dataframe of sentences

## Import libraries

In [1]:
import pandas as pd
import spacy
# Load the spaCy pipeline "en_core_web_sm" once; the resulting `nlp` object
# is used further down to turn each abstract into a Doc and split it into sentences.
nlp = spacy.load("en_core_web_sm")

## Loading dataframe with abstracts

In [2]:
# Read 'df_HCQ.json': df_HCQ
# The path is relative, so the file is looked up in the current working directory.
# One row per publication, with the columns 'Publication ID', 'title' and 'abstract'.
df_HCQ = pd.read_json("df_HCQ.json")
# Preview the first rows
df_HCQ.head()

Unnamed: 0,Publication ID,title,abstract
0,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"'On March 11th, 2020 the World Health Organiza..."
1,pub.1127834352,Hydroxychloroquine or chloroquine with or with...,'BACKGROUND: Hydroxychloroquine or chloroquine...
2,pub.1126667578,Hydroxychloroquine in patients mainly with mil...,'Abstract Objectives To assess the efficacy an...
3,pub.1125404383,Of chloroquine and COVID-19,'Recent publications have brought attention to...
4,pub.1127182972,An independent appraisal and re-analysis of hy...,'A recent open-label study claimed that hydrox...


In [3]:
# Summarise the dataframe: number of entries, column names, non-null counts and dtypes
df_HCQ.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17 entries, 0 to 16
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Publication ID  17 non-null     object
 1   title           17 non-null     object
 2   abstract        17 non-null     object
dtypes: object(3)
memory usage: 544.0+ bytes


## Abstract

In [4]:
# Pick a single publication by its positional index to inspect it: item
item = 3

# Fetch that row once (restricted to the two columns of interest) and unpack
# its values into 'title' and 'abstract'
title, abstract = df_HCQ[["title", "abstract"]].iloc[item]

title

'Of chloroquine and COVID-19'

In [5]:
# Display the full text of the selected abstract
abstract

"'Recent publications have brought attention to the possible benefit of chloroquine, a broadly used antimalarial drug, in the treatment of patients infected by the novel emerged coronavirus (SARS-CoV-2). The scientific community should consider this information in light of previous experiments with chloroquine in the field of antiviral research.'"

## Dataframe of sentences

In [6]:
# Select dataframe
dframe = df_HCQ

# Column order of the sentence-level dataframe built below: columns
columns = ["Sentence ID", "sentence", "Publication ID", "title", "abstract"]

# Make a list that contains all rows for the later dataframe: row_list
# One row (a list of entries in the order of 'columns') for each sentence
row_list = []

# nlp.pipe processes the abstracts as a stream and yields the Doc objects in
# input order, so they can be zipped with the matching metadata columns.
# The loop names carry a 'pub_' prefix on purpose: the notebook-level names
# 'item', 'title' and 'abstract' set in the single-abstract cells above must
# not be overwritten, otherwise re-running those cells shows the wrong text.
for pub_id, pub_title, pub_abstract, doc_abstract in zip(
    dframe["Publication ID"],
    dframe["title"],
    dframe["abstract"],
    nlp.pipe(dframe["abstract"]),
):
    # Every row contains: a unique identifier for the sentence ('Sentence ID':
    # the publication ID followed by '-s' and the 0-based position of the
    # sentence within its abstract), the sentence itself, the publication ID,
    # the title of the publication and the abstract the sentence is taken from.
    for number, sentence in enumerate(doc_abstract.sents):
        sentence_ID = pub_id + "-s" + str(number)
        row_list.append([sentence_ID, sentence.text, pub_id, pub_title, pub_abstract])

# From 'row_list' make dataframe: sentencesDframe
sentencesDframe = pd.DataFrame(row_list, columns=columns)

In [7]:
# Preview the first sentence-level rows
sentencesDframe.head()

Unnamed: 0,Sentence ID,sentence,Publication ID,title,abstract
0,pub.1126880632-s0,"'On March 11th, 2020 the World Health Organiza...",pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"'On March 11th, 2020 the World Health Organiza..."
1,pub.1126880632-s1,"The infection, transmitted by 2019 novel coron...",pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"'On March 11th, 2020 the World Health Organiza..."
2,pub.1126880632-s2,"Italy was early and severely involved, with a ...",pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"'On March 11th, 2020 the World Health Organiza..."
3,pub.1126880632-s3,Person-to-person spread mainly occurs via resp...,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"'On March 11th, 2020 the World Health Organiza..."
4,pub.1126880632-s4,The median incubation period is 5 days.,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"'On March 11th, 2020 the World Health Organiza..."


In [8]:
# Export 'sentencesDframe' as .json-file: sentences_dframe.json
# The path is relative, so the file is written to the current working directory.
sentencesDframe.to_json("sentences_dframe.json")