## Import Libraries

In [1]:
# Standard library
from collections import Counter

# Third-party
import pandas as pd
import spacy
from spacy.matcher import Matcher

# Load the small English spaCy pipeline once; the cells below reuse `nlp`.
nlp = spacy.load("en_core_web_sm")

## DataFrame from JSON

In [2]:
# Read the sentence table from a JSON file (path is relative to the working
# directory), then preview its first rows.
df_sentences = pd.read_json("sentences_dframe.json")
df_sentences.head()

Unnamed: 0,Sentence ID,sentence,Publication ID,title,abstract
0,pub.1126880632-s0,"On March 11th, 2020 the World Health Organizat...",pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."
1,pub.1126880632-s1,"The infection, transmitted by 2019 novel coron...",pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."
2,pub.1126880632-s2,"Italy was early and severely involved, with a ...",pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."
3,pub.1126880632-s3,Person-to-person spread mainly occurs via resp...,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."
4,pub.1126880632-s4,The median incubation period is 5 days.,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."


In [3]:
# Inspect column names, dtypes and non-null counts of the sentence table.
df_sentences.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211 entries, 0 to 210
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Sentence ID     211 non-null    object
 1   sentence        211 non-null    object
 2   Publication ID  211 non-null    object
 3   title           211 non-null    object
 4   abstract        211 non-null    object
dtypes: object(5)
memory usage: 9.9+ KB


## List of verb lemmas

In [4]:
# Single-token pattern: match every token whose coarse part-of-speech tag is VERB.
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "VERB"}]
# Patterns go in a list as the second argument. This form is accepted by
# spaCy >= 2.2.2 and is the only one spaCy 3 accepts; the legacy
# `add(key, None, pattern)` call is rejected by spaCy 3.
matcher.add("verb_id", [pattern])

# One lemma per matched verb token, in corpus order (duplicates are kept so
# they can be counted later).
verbs_lemma_list = []

for sent in df_sentences["sentence"]:
    doc = nlp(sent)
    matches = matcher(doc)

    # Each match is (match_id, start, end); only the token span is needed here.
    for _match_id, start, end in matches:
        verbs_lemma_list.append(doc[start:end].lemma_)

In [5]:
# Total number of verb matches collected across all sentences.
len(verbs_lemma_list)

414

In [6]:
# Count how often each distinct lemma occurs in the collected list.
cnt = Counter(verbs_lemma_list)
cnt

Counter({'declare': 2,
         'transmit': 1,
         'discover': 2,
         'spread': 3,
         'involve': 2,
         'occur': 4,
         'may': 13,
         'range': 1,
         'depend': 2,
         'underlie': 4,
         'record': 2,
         'affect': 2,
         'relate': 1,
         'understand': 3,
         'issue': 2,
         'help': 2,
         'care': 1,
         'use': 19,
         'appear': 2,
         'promise': 2,
         'seem': 1,
         'drive': 2,
         'expect': 1,
         'play': 2,
         'support': 5,
         'treat': 8,
         'take': 3,
         'offer': 1,
         'evaluate': 7,
         'comprise': 1,
         'include': 12,
         'hospitalise': 3,
         'find': 6,
         'receive': 16,
         'hydroxychloroquine': 2,
         'form': 1,
         'initiate': 1,
         'exclude': 1,
         'sustain': 1,
         'mean': 1,
         'meet': 1,
         'die': 3,
         'control': 4,
         'compare': 5,
         'chloroqu

In [7]:
# Number of distinct lemmas.
len(cnt)

184

In [8]:
# Distinct lemmas as a plain list, in the order the counter first saw them.
verbs = list(cnt)

In [9]:
# Show the full list of distinct lemmas.
print(verbs)

['declare', 'transmit', 'discover', 'spread', 'involve', 'occur', 'may', 'range', 'depend', 'underlie', 'record', 'affect', 'relate', 'understand', 'issue', 'help', 'care', 'use', 'appear', 'promise', 'seem', 'drive', 'expect', 'play', 'support', 'treat', 'take', 'offer', 'evaluate', 'comprise', 'include', 'hospitalise', 'find', 'receive', 'hydroxychloroquine', 'form', 'initiate', 'exclude', 'sustain', 'mean', 'meet', 'die', 'control', 'compare', 'chloroquine', 'associate', 'increase', 'confirm', 'decrease', 'assess', 'set', 'designate', 'hospitalize', 'assign', 'administrate', 'follow', 'remain', 'analyze', 'accord', 'manage', 'onset', 'recipient', 'report', 'result', 'moderate', 'compete', 'submit', 'declaration', 'obtain', 'archive', 'must', 'register', 'approve', 'provide', 'post', 'explain', 'upload', 'can', 'make', 'sign', 'should', 'direct', 'bring', 'infect', 'emerge', 'consider', 'claim', 'represent', 'reveal', 'yield', 'indicate', 'regard', 'know', 'observe', 'overcome', 'per

## Lemma DataFrame

In [10]:
# Build one row per matched verb: the identifiers of the sentence it came from
# plus the verb's lemma. The "abstract" column is not carried over.
columns = ["Sentence ID", "sentence", "lemma", "Publication ID", "title"]
df_lemma_list = []

# Walk the needed columns in parallel, in row order.
sentence_fields = zip(
    df_sentences["Sentence ID"],
    df_sentences["sentence"],
    df_sentences["Publication ID"],
    df_sentences["title"],
)
for sentence_id, sentence_text, publication_id, pub_title in sentence_fields:
    doc = nlp(sentence_text)
    # Field order inside each row must match `columns`.
    df_lemma_list.extend(
        [sentence_id, sentence_text, doc[start:end].lemma_, publication_id, pub_title]
        for _match_id, start, end in matcher(doc)
    )

df_lemma = pd.DataFrame(df_lemma_list, columns=columns)

In [11]:
# Preview the first rows of the per-verb table.
df_lemma.head()

Unnamed: 0,Sentence ID,sentence,lemma,Publication ID,title
0,pub.1126880632-s0,"On March 11th, 2020 the World Health Organizat...",declare,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...
1,pub.1126880632-s1,"The infection, transmitted by 2019 novel coron...",transmit,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...
2,pub.1126880632-s1,"The infection, transmitted by 2019 novel coron...",discover,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...
3,pub.1126880632-s1,"The infection, transmitted by 2019 novel coron...",spread,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...
4,pub.1126880632-s2,"Italy was early and severely involved, with a ...",involve,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...


In [12]:
# Inspect column names, dtypes and non-null counts of the per-verb table.
df_lemma.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Sentence ID     414 non-null    object
 1   sentence        414 non-null    object
 2   lemma           414 non-null    object
 3   Publication ID  414 non-null    object
 4   title           414 non-null    object
dtypes: object(5)
memory usage: 16.3+ KB


In [13]:
# Occurrences of each lemma, most frequent first.
df_lemma["lemma"].value_counts()

use          19
receive      16
may          13
include      12
treat         8
             ..
bring         1
base          1
establish     1
range         1
explain       1
Name: lemma, Length: 184, dtype: int64

### Example: rows with lemma "support"

In [14]:
# Keep only the rows whose lemma is "support", showing the lemma column first.
is_support = df_lemma["lemma"] == "support"
df_support = df_lemma.loc[is_support, ["lemma", "Sentence ID", "sentence"]]
df_support

Unnamed: 0,lemma,Sentence ID,sentence
25,support,pub.1126880632-s11,Pediatric rheumatologists are expected to play...
101,support,pub.1126667578-s27,This work was supported by the Emergent Projec...
217,support,pub.1126655433-s11,These results do not support the use of HCQ in...
313,support,pub.1127408847-s8,We sought evidence to support or refute the hy...
329,support,pub.1127408847-s19,The findings support the hypothesis that these...


In [15]:
# Full text of the "support" row at position 2 (the table view truncates it).
df_support["sentence"].iloc[2]

'These results do not support the use of HCQ in patients hospitalised for documented SARS-CoV-2-positive hypoxic pneumonia.'

In [16]:
# Full text of the "support" row at position 4.
df_support["sentence"].iloc[4]

'The findings support the hypothesis that these drugs have efficacy in the treatment of COVID-19.'

In [17]:
# Full text of the "support" row at position 3.
df_support["sentence"].iloc[3]

'We sought evidence to support or refute the hypothesis that these drugs could show efficacy in the treatment of COVID-19.\nMATERIALS AND METHODS: We reviewed in vitro studies, in vivo studies, original studies, clinical trials, and consensus reports, that were conducted to evaluate the antiviral activities of chloroquine and hydroxychloroquine.'

#### For today's meeting

In [18]:
# Pick the "support" sentence at position 4 as the worked example.
exmpl_sent = df_support["sentence"].iloc[4]
exmpl_sent

'The findings support the hypothesis that these drugs have efficacy in the treatment of COVID-19.'

In [19]:
# Run the example sentence through the spaCy pipeline.
exmpl_doc = nlp(exmpl_sent)

# One line per token: surface text, lemma, coarse POS tag and shape.
for tok in exmpl_doc:
    fields = (tok.text, tok.lemma_, tok.pos_, tok.shape_)
    print(*fields)

The the DET Xxx
findings finding NOUN xxxx
support support VERB xxxx
the the DET xxx
hypothesis hypothesis NOUN xxxx
that that SCONJ xxxx
these these DET xxxx
drugs drug NOUN xxxx
have have AUX xxxx
efficacy efficacy NOUN xxxx
in in ADP xx
the the DET xxx
treatment treatment NOUN xxxx
of of ADP xx
COVID-19 covid-19 NOUN XXXX-dd
. . PUNCT .
