# 2 Sentences

## 0 Import Libraries

In [1]:
import pandas as pd
import spacy

In [2]:
# Load spaCy's large English pipeline ("en_core_web_lg").
# The named-entity recognizer is disabled; this notebook only uses the
# pipeline's sentence segmentation (the 'sents' attribute).
nlp = spacy.load("en_core_web_lg", disable=["ner"])

## 1 Load Dataframe

In [3]:
# Load the cleaned abstracts from JSON and preview the first three rows
abstract_df = pd.read_json("../data/HCQ_clean_abstracts.json")
abstract_df.head(3)

Unnamed: 0,Publication ID,title,abstract_clean
0,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."
1,pub.1127834352,Hydroxychloroquine or chloroquine with or with...,"BACKGROUND: Hydroxychloroquine or chloroquine,..."
2,pub.1126667578,Hydroxychloroquine in patients mainly with mil...,Abstract Objectives To assess the efficacy and...


## 2 New Dataframe: From Abstracts to Sentences

In [4]:
# Show the index of 'abstract_df'. These are the row *labels*; for this
# freshly loaded frame they coincide with the row positions (0, 1, 2, ...)
abstract_df.index

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], dtype='int64')

In [5]:
# Make a dataframe that
## has a row for each sentence
## assigns a unique id to each sentence
## assigns the title of publication to each sentence

def single_sentences(dataframe, nlp_model=None):
    """Split every abstract into sentences and return one row per sentence.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns "Publication ID", "title" and
        "abstract_clean". Rows are read in positional order, so the frame
        may carry any index (e.g. after filtering), not only 0..n-1.
    nlp_model : callable, optional
        Callable that takes a text and returns an object with a ``sents``
        iterable whose items have a ``text`` attribute (such as a spaCy
        pipeline). Defaults to the notebook-level pipeline ``nlp``.

    Returns
    -------
    pandas.DataFrame
        Columns "sentence_id", "title" and "sentence". The sentence ID is
        "<Publication ID>-<n>", where n counts the sentences of one
        abstract starting at 0.
    """
    # Only fall back to the global pipeline when none was passed in
    pipeline = nlp if nlp_model is None else nlp_model

    data_list = []

    # Zipping the columns walks the rows by position. The previous version
    # looped over index *labels* but looked rows up with positional .iloc,
    # which breaks as soon as the index is not the default 0..n-1.
    rows = zip(
        dataframe["Publication ID"],
        dataframe["title"],
        dataframe["abstract_clean"],
    )
    for publication_id, title, abstract in rows:
        # enumerate restarts the sentence counter at 0 for every abstract
        for sentence_number, sentence in enumerate(pipeline(abstract).sents):
            sentence_id = f"{publication_id}-{sentence_number}"
            data_list.append([sentence_id, title, sentence.text])

    # Build the frame once from the collected records
    return pd.DataFrame(data_list, columns=["sentence_id", "title", "sentence"])

In [6]:
# Create 'sentences_df': one row per sentence, together with the title of
# its publication and a sentence ID that serves as its unique identifier
sentences_df = single_sentences(abstract_df)

In [7]:
# Check the number of rows and the non-null count of every column
sentences_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sentence_id  216 non-null    object
 1   title        216 non-null    object
 2   sentence     216 non-null    object
dtypes: object(3)
memory usage: 5.2+ KB


In [8]:
# Preview the first rows: sentence ID, publication title and sentence text
sentences_df.head()

Unnamed: 0,sentence_id,title,sentence
0,pub.1126880632-0,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."
1,pub.1126880632-1,COVID-19 and what pediatric rheumatologists sh...,"The infection, transmitted by 2019 novel coron..."
2,pub.1126880632-2,COVID-19 and what pediatric rheumatologists sh...,"Italy was early and severely involved, with a ..."
3,pub.1126880632-3,COVID-19 and what pediatric rheumatologists sh...,Person-to-person spread mainly occurs via resp...
4,pub.1126880632-4,COVID-19 and what pediatric rheumatologists sh...,The median incubation period is 5 days.


In [9]:
# Save 'sentences_df' as a JSON file: 'HCQ_sentences.json'
sentences_df.to_json("../data/HCQ_sentences.json")

## 3 Labeling Sentences Manually

In [10]:
# Reload 'sentences_df' from the JSON file 'HCQ_sentences.json'
sentences_df = pd.read_json("../data/HCQ_sentences.json")

In [11]:
# Preview the reloaded dataframe
sentences_df.head()

Unnamed: 0,sentence_id,title,sentence
0,pub.1126880632-0,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."
1,pub.1126880632-1,COVID-19 and what pediatric rheumatologists sh...,"The infection, transmitted by 2019 novel coron..."
2,pub.1126880632-2,COVID-19 and what pediatric rheumatologists sh...,"Italy was early and severely involved, with a ..."
3,pub.1126880632-3,COVID-19 and what pediatric rheumatologists sh...,Person-to-person spread mainly occurs via resp...
4,pub.1126880632-4,COVID-19 and what pediatric rheumatologists sh...,The median incubation period is 5 days.


**For the following steps, see:** 
YouTube: "Intro to NLP with spaCy (3): Detecting programming languages | Episode 3: Evaluation"

[https://youtu.be/4V0JDdohxAk](https://youtu.be/4V0JDdohxAk)

In [12]:
# Export the column 'sentence' of 'sentences_df' (together with its row
# index) as an Excel spreadsheet for manual labeling.
# NOTE(review): 'sentence_id' is not exported, so the row index is the only
# link from a labeled row back to its sentence ID - keep it intact.
sentences_df["sentence"].to_excel("../labeling/sentences_to_be_labeled.xlsx")