# Set up

In [None]:
!pip install transformers[torch]

Collecting transformers[torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m34.1 MB/s

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import pandas as pd
import json
import torch
from transformers import BertTokenizer, BertForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset, random_split
# from tqdm import tqdm
import re
from nltk.tokenize import sent_tokenize
import spacy
from sklearn.metrics import mean_squared_error

# Data Processing - csv

In [180]:
train = pd.read_csv('drive/My Drive/train.csv')
test = pd.read_csv('drive/My Drive/test.csv')
val = pd.read_csv('drive/My Drive/val.csv')

In [181]:
# Fill in missing pmid in csv
train.at[622, 'pmid'] = '19957776'

In [182]:
# Cannot set as int for 'PMC3872591'
train['pmid'] = train['pmid'].astype(str)
test['pmid'] = test['pmid'].astype(str)
val['pmid'] = val['pmid'].astype(str)

In [183]:
data = pd.concat([train, test, val], axis=0)
data = data.reset_index(drop=True)

Unnamed: 0,question,pmid,input_text,target_text,Adaptation_Version,Question_Type


In [184]:
data

Unnamed: 0,question,pmid,input_text,target_text,Adaptation_Version,Question_Type
0,1,29857264,Exercise-Associated Muscle Cramps (EAMC) are a...,Exercise-Associated Muscle Cramps (EAMC) are a...,2,C
1,1,33722257,"Background: Muscle cramp is a painful, involun...",Muscle cramps are unconscious contractions of ...,2,C
2,1,31696455,Muscle cramp is a temporary but intense and pa...,"Muscle cramp is a temporary but intense, painf...",2,C
3,1,30168894,Muscular cramp is a common symptom in healthy ...,"Muscle cramps are common in healthy people, es...",2,C
4,1,29763070,"Muscle cramps result in continuous, involuntar...",Muscle cramps cause constant and unintended co...,2,C
...,...,...,...,...,...,...
916,66,35102405,Nephrotic syndrome (NS) encompasses a variety ...,Nephrotic syndrome (NS) includes a variety of ...,2,B
917,66,35017338,Children with nephrotic syndrome (NS) have a n...,Nephrotic Syndrome (NS) is a combination of ki...,2,B
918,66,34979093,Background: Although venous thromboembolism is...,Blood clots are a well-known problem associate...,2,B
919,66,34839817,Background: Steroid resistant nephrotic syndro...,Steroid Resistant Nephrotic Syndrome (SRNS) is...,2,B


In [185]:
data[data['pmid'] == '33347023']

# dup pmid - 2 adaptations
# !!!version_id unable to map

Unnamed: 0,question,pmid,input_text,target_text,Adaptation_Version,Question_Type
791,2,33347023,Purpose: Several studies have previously repor...,"Dry eye, depression, and treatent of depressio...",2,B
792,2,33347023,Purpose: Several studies have previously repor...,Studies show that there is a link between depr...,2,B


# Data Processing - json

In [186]:
file_path = 'drive/My Drive/data.json'
with open(file_path, 'r') as file:
    dj = json.load(file)

In [187]:
dj

{'1': {'15902691': {'Title': 'Muscle cramps',
   'abstract': {'1': 'Muscle cramps are a common problem characterized by a sudden, painful, involuntary contraction of muscle.',
    '2': 'These true cramps, which originate from peripheral nerves, may be distinguished from other muscle pain or spasm.',
    '3': 'Medical history, physical examination, and a limited laboratory screen help to determine the various causes of muscle cramps.',
    '4': 'Despite the "benign" nature of cramps, many patients find the symptom very uncomfortable.',
    '5': 'Treatment options are guided both by experience and by a limited number of therapeutic trials.',
    '6': 'Quinine sulfate is an effective medication, but the side-effect profile is worrisome, and other membrane-stabilizing drugs are probably just as effective.',
    '7': 'Patients will benefit from further studies to better define the pathophysiology of muscle cramps and to find more effective medications with fewer side-effects.'},
   'adaptat

In [188]:
question_list = []

for question_id, question in dj.items():
    row = {
        'QID': question_id,
        'Question': question['question'],
        'Question_Type': question['question_type']
    }

    question_list.append(row)

df_q = pd.DataFrame(question_list)

In [189]:
df_q

# 75 questions

Unnamed: 0,QID,Question,Question_Type
0,1,What causes muscle spasm?,C
1,2,What does duloxetine do?,B
2,3,How can i reduce my potassium levels?,C
3,4,How is diabetes diagnosed?,C
4,5,How to treat a bakers cyst?,C
...,...,...,...
70,71,What happens in the body to cause achondroplasia?,B
71,72,What is newborn metabolic screening? what can ...,C
72,73,In adult patients with total hip replacements ...,C
73,74,Why do i have to constantly clear my throat an...,C


In [None]:
# dj_list = []

# for question_id, question in dj.items():
#   for pmid, text_data in question.items():
#     if isinstance(text_data, dict):
#       row = {
#         'QID': question_id,
#         'Question': question['question'],
#         'Question_Type': question['question_type'],
#         'pmid': pmid,
#         'Title': text_data.get('Title', None),
#       }

#       abstract = text_data.get('abstract', {})
#       adaptations = text_data.get('adaptations', {})

#       for sentence, text in abstract.items():
#         row[f'Abstract_{sentence}'] = text

#       for adaptation_id, adaptation_text in adaptations.items():
#         row[f'Adaptation_{adaptation_id}'] = adaptation_text

#       dj_list.append(row)

# df_a = pd.DataFrame(dj_list)

In [None]:
# df_a

# 749 abstracts ??? vs 750

Unnamed: 0,QID,Question,Question_Type,pmid,Title,Abstract_1,Abstract_2,Abstract_3,Abstract_4,Abstract_5,...,Abstract_25,Abstract_26,Abstract_27,Abstract_28,Abstract_29,Abstract_30,Abstract_31,Abstract_32,Abstract_33,Abstract_34
0,1,What causes muscle spasm?,C,15902691,Muscle cramps,Muscle cramps are a common problem characteriz...,"These true cramps, which originate from periph...","Medical history, physical examination, and a l...","Despite the ""benign"" nature of cramps, many pa...",Treatment options are guided both by experienc...,...,,,,,,,,,,
1,1,What causes muscle spasm?,C,25432724,Diagnosis and treatment of dystonia,The dystonias are a group of disorders charact...,A careful assessment of the clinical manifesta...,"If a cause is identified, specific etiology-ba...","In most cases, a specific cause cannot be iden...","Treatment options include counseling, educatio...",...,,,,,,,,,,
2,1,What causes muscle spasm?,C,29763070,Muscle Cramps,"Muscle cramps result in continuous, involuntar...","Generally, the cramp can last from minutes to ...",Palpating the muscle area of the cramp will pr...,Exercise-associated muscle cramps are the most...,The specific etiology is not well understood a...,...,,,,,,,,,,
3,1,What causes muscle spasm?,C,29857264,Muscle cramps: A comparison of the two-leading...,Exercise-Associated Muscle Cramps (EAMC) are a...,Despite scientists tried to understand the phy...,"From 1900 to nowadays, the scientific world re...","However, recent literature seems to focus on t...",The aim of this review is to examine the recen...,...,,,,,,,,,,
4,1,What causes muscle spasm?,C,30168894,Muscular cramp: causes and management,Muscular cramp is a common symptom in healthy ...,It is prominent in a number of benign neurolog...,It is a particular feature of chronic neurogen...,A literature review was undertaken to understa...,Many aspects of cramping remain incompletely u...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
744,75,What is a gene affected by sickle cell anemia?,B,31322815,Combined and differential effects of alpha-tha...,Background: Our objective was to investigate t...,Procedure: Steady-state biological parameters ...,The age of the first hospitalized VOC was also...,These data were correlated with the alpha-glob...,"For the latter, three different genetic loci w...",...,,,,,,,,,,
745,75,What is a gene affected by sickle cell anemia?,B,32447424,Functional polymorphisms of BCL11A and HBS1L-M...,Fetal hemoglobin (HbF) ameliorates clinical se...,The major loci regulating HbF levels are HBB c...,"However, the impact of noncoding single-nucleo...","Therefore, we performed comprehensive associat...",We found SNPs independently associated with Hb...,...,,,,,,,,,,
746,75,What is a gene affected by sickle cell anemia?,B,32772141,"Association between BCL11A, HSB1L-MYB, and Xmn...",Sickle cell disease (SCD) is a monogenic disea...,Inter-individual variability in hemoglobin F (...,"HbF levels are affected by, among other factor...",Our aim was to investigate HbF-enhancer haplot...,The study included 100 SCD patients and 100 ma...,...,,,,,,,,,,
747,75,What is a gene affected by sickle cell anemia?,B,33072979,"Sickle Cell Disease-Genetics, Pathophysiology,...",Sickle cell disease (SCD) is a monogenetic dis...,Phenotypic variation in the clinical presentat...,Understanding the pathogenesis and pathophysio...,In this special edition for newborn screening ...,Through a systematic review of the literature ...,...,,,,,,,,,,


In [None]:
# df_a.columns

Index(['QID', 'Question', 'Question_Type', 'pmid', 'Title', 'Abstract_1',
       'Abstract_2', 'Abstract_3', 'Abstract_4', 'Abstract_5', 'Abstract_6',
       'Abstract_7', 'Adaptation_adaptation2', 'Abstract_8', 'Abstract_9',
       'Abstract_10', 'Abstract_11', 'Abstract_12', 'Abstract_13',
       'Abstract_14', 'Abstract_15', 'Abstract_16', 'Abstract_17',
       'Abstract_18', 'Adaptation_adaptation3', 'Abstract_19', 'Abstract_20',
       'Abstract_21', 'Adaptation_adaptation1', 'Abstract_22', 'Abstract_23',
       'Abstract_24', 'Abstract_25', 'Abstract_26', 'Abstract_27',
       'Abstract_28', 'Abstract_29', 'Abstract_30', 'Abstract_31',
       'Abstract_32', 'Abstract_33', 'Abstract_34'],
      dtype='object')

In [None]:
# df_j
# df_t

Unnamed: 0,Abstract_sentence_id,Abstract,Adaptation_sentence_id,Adaptation
0,7,Patients will benefit from further studies to ...,7,More studies are needed to better define the e...
1,6,A substantial reduction in symptoms and improv...,6,A noticeable decrease in symptoms and improved...
2,7,A cramp is almost never a local effect but inv...,7,A cramp can almost never be explained just by ...
3,11,"In conclusion, from the latest investigations ...",11,"In summary, the signal causing muscles to cont..."
4,6,Current treatment options are correspondingly ...,6,Current treatment is limited.
...,...,...,...,...
744,10,They should be studied and interpreted togethe...,10,These genes and their mutations should be stud...
745,13,Higher HbF concentration may underlie this eff...,13,Higher HbF levels may be the cause of this eff...
746,11,Discovery of the molecular mechanisms controll...,11,If researchers could discover the underlying c...
747,8,We also provide an overview of emerging therap...,8,This paper also discusses new therapies for Si...


In [None]:
# df_merge = pd.concat([df_j, df_t], axis=1)
# df_merge

Unnamed: 0,QID,Question,Question_Type,pmid,Title,adaptation_id,Abstract_sentence_id,Abstract,Adaptation_sentence_id,Adaptation
0,1,What causes muscle spasm?,C,15902691,Muscle cramps,adaptation2,7,Patients will benefit from further studies to ...,7,More studies are needed to better define the e...
1,1,What causes muscle spasm?,C,25432724,Diagnosis and treatment of dystonia,adaptation2,6,A substantial reduction in symptoms and improv...,6,A noticeable decrease in symptoms and improved...
2,1,What causes muscle spasm?,C,29763070,Muscle Cramps,adaptation2,7,A cramp is almost never a local effect but inv...,7,A cramp can almost never be explained just by ...
3,1,What causes muscle spasm?,C,29857264,Muscle cramps: A comparison of the two-leading...,adaptation2,11,"In conclusion, from the latest investigations ...",11,"In summary, the signal causing muscles to cont..."
4,1,What causes muscle spasm?,C,30168894,Muscular cramp: causes and management,adaptation2,6,Current treatment options are correspondingly ...,6,Current treatment is limited.
...,...,...,...,...,...,...,...,...,...,...
744,75,What is a gene affected by sickle cell anemia?,B,31322815,Combined and differential effects of alpha-tha...,adaptation3,10,They should be studied and interpreted togethe...,10,These genes and their mutations should be stud...
745,75,What is a gene affected by sickle cell anemia?,B,32447424,Functional polymorphisms of BCL11A and HBS1L-M...,adaptation3,13,Higher HbF concentration may underlie this eff...,13,Higher HbF levels may be the cause of this eff...
746,75,What is a gene affected by sickle cell anemia?,B,32772141,"Association between BCL11A, HSB1L-MYB, and Xmn...",adaptation3,11,Discovery of the molecular mechanisms controll...,11,If researchers could discover the underlying c...
747,75,What is a gene affected by sickle cell anemia?,B,33072979,"Sickle Cell Disease-Genetics, Pathophysiology,...",adaptation3,8,We also provide an overview of emerging therap...,8,This paper also discusses new therapies for Si...


In [190]:
dj_list = []
io_list = []

a_list = []
p_list = []

for question_id, question in dj.items():
  for pmid, text_data in question.items():
    if isinstance(text_data, dict):


      abstract = text_data.get('abstract', {})
      adaptations = text_data.get('adaptations', {})

      for adaptation_id, adaptation in adaptations.items():
        summary = {
          'QID': question_id,
          'Question': question['question'],
          'Question_Type': question['question_type'],
          'pmid': pmid,
          'Title': text_data.get('Title', None),
          'adaptation_id': int(adaptation_id[10:]),
          'abstract_len': len(abstract)
        }
        pi = ''
        po = ''

        for  text_input, text_output in zip(abstract.items(), adaptation.items()):

          text = {
            'Abstract_sentence_id': text_input[0],
            'Abstract': text_input[1],
            'Adaptation_sentence_id': text_output[0],
            'Adaptation': text_output[1]
          }

          dj_list.append(summary)
          io_list.append(text)

          pi += text_input[1]
          po += text_output[1]


        paragraph = {
            'input_paragraph': pi,
            'output_paragraph': po
        }
        a_list.append(summary)
        p_list.append(paragraph)

df_j = pd.DataFrame(dj_list)
df_t = pd.DataFrame(io_list)


df_s = pd.DataFrame(a_list)
df_p = pd.DataFrame(p_list)

In [191]:
df_t

Unnamed: 0,Abstract_sentence_id,Abstract,Adaptation_sentence_id,Adaptation
0,1,Muscle cramps are a common problem characteriz...,1,Muscle cramps are a common problem represented...
1,2,"These true cramps, which originate from periph...",2,"These true cramps, coming from nerves outside ..."
2,3,"Medical history, physical examination, and a l...",3,"Medical history, physical check-up, and lab sc..."
3,4,"Despite the ""benign"" nature of cramps, many pa...",4,"Despite their harmless nature, cramps are unco..."
4,5,Treatment options are guided both by experienc...,5,Experience and limited medical studies guide t...
...,...,...,...,...
9313,1,Sickle cell anemia (SCA) is a disease characte...,1,Sickle Cell Anemia (SCA) is a genetic blood di...
9314,2,Because of their effects on HbS polymerization...,2,Due to their effects on sickle hemoglobin (HbS...
9315,3,The aim of our study was to determine if the n...,3,The aim of our study was to find out if the nu...
9316,4,Our results confirmed that alpha-thalassemia p...,4,Results showed that alpha-thalassemia protecte...


In [192]:
df_jsent = pd.concat([df_j, df_t], axis=1)
df_jsent

Unnamed: 0,QID,Question,Question_Type,pmid,Title,adaptation_id,abstract_len,Abstract_sentence_id,Abstract,Adaptation_sentence_id,Adaptation
0,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,1,Muscle cramps are a common problem characteriz...,1,Muscle cramps are a common problem represented...
1,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,2,"These true cramps, which originate from periph...",2,"These true cramps, coming from nerves outside ..."
2,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,3,"Medical history, physical examination, and a l...",3,"Medical history, physical check-up, and lab sc..."
3,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,4,"Despite the ""benign"" nature of cramps, many pa...",4,"Despite their harmless nature, cramps are unco..."
4,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,5,Treatment options are guided both by experienc...,5,Experience and limited medical studies guide t...
...,...,...,...,...,...,...,...,...,...,...,...
9313,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,1,Sickle cell anemia (SCA) is a disease characte...,1,Sickle Cell Anemia (SCA) is a genetic blood di...
9314,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,2,Because of their effects on HbS polymerization...,2,Due to their effects on sickle hemoglobin (HbS...
9315,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,3,The aim of our study was to determine if the n...,3,The aim of our study was to find out if the nu...
9316,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,4,Our results confirmed that alpha-thalassemia p...,4,Results showed that alpha-thalassemia protecte...


In [193]:
df_jsent.to_csv('json_sent.csv', index=False)

In [194]:
df_p

Unnamed: 0,input_paragraph,output_paragraph
0,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...
1,The dystonias are a group of disorders charact...,Dystonias are disorders with a lot of uncontro...
2,"Muscle cramps result in continuous, involuntar...",Muscle cramps cause constant and unintended co...
3,Exercise-Associated Muscle Cramps (EAMC) are a...,Exercise-Associated Muscle Cramps (EAMC) are a...
4,Muscular cramp is a common symptom in healthy ...,"Muscle cramps are common in healthy people, es..."
...,...,...
914,Background: Our objective was to investigate t...,Researchers analyzed the effects of an alpha-t...
915,Fetal hemoglobin (HbF) ameliorates clinical se...,Fetal hemoglobin (HbF) makes the severity of S...
916,Sickle cell disease (SCD) is a monogenic disea...,Sickle Cell Disease (SCD) is a genetic blood d...
917,Sickle cell disease (SCD) is a monogenetic dis...,Sickle Cell Disease (SCD) is a genetic disorde...


In [195]:
df_jpara = pd.concat([df_s, df_p], axis=1)
df_jpara

Unnamed: 0,QID,Question,Question_Type,pmid,Title,adaptation_id,abstract_len,input_paragraph,output_paragraph
0,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...
1,1,What causes muscle spasm?,C,25432724,Diagnosis and treatment of dystonia,2,6,The dystonias are a group of disorders charact...,Dystonias are disorders with a lot of uncontro...
2,1,What causes muscle spasm?,C,29763070,Muscle Cramps,2,7,"Muscle cramps result in continuous, involuntar...",Muscle cramps cause constant and unintended co...
3,1,What causes muscle spasm?,C,29857264,Muscle cramps: A comparison of the two-leading...,2,11,Exercise-Associated Muscle Cramps (EAMC) are a...,Exercise-Associated Muscle Cramps (EAMC) are a...
4,1,What causes muscle spasm?,C,30168894,Muscular cramp: causes and management,2,6,Muscular cramp is a common symptom in healthy ...,"Muscle cramps are common in healthy people, es..."
...,...,...,...,...,...,...,...,...,...
914,75,What is a gene affected by sickle cell anemia?,B,31322815,Combined and differential effects of alpha-tha...,3,10,Background: Our objective was to investigate t...,Researchers analyzed the effects of an alpha-t...
915,75,What is a gene affected by sickle cell anemia?,B,32447424,Functional polymorphisms of BCL11A and HBS1L-M...,3,13,Fetal hemoglobin (HbF) ameliorates clinical se...,Fetal hemoglobin (HbF) makes the severity of S...
916,75,What is a gene affected by sickle cell anemia?,B,32772141,"Association between BCL11A, HSB1L-MYB, and Xmn...",3,11,Sickle cell disease (SCD) is a monogenic disea...,Sickle Cell Disease (SCD) is a genetic blood d...
917,75,What is a gene affected by sickle cell anemia?,B,33072979,"Sickle Cell Disease-Genetics, Pathophysiology,...",3,8,Sickle cell disease (SCD) is a monogenetic dis...,Sickle Cell Disease (SCD) is a genetic disorde...


# Comparing & Mapping & Joining - csv vs json

In [196]:
data[~data['pmid'].isin(df_jpara['pmid'])]
# All pmid in csv are in json

Unnamed: 0,question,pmid,input_text,target_text,Adaptation_Version,Question_Type


In [197]:
df_jpara[~df_jpara['pmid'].isin(data['pmid'])]

Unnamed: 0,QID,Question,Question_Type,pmid,Title,adaptation_id,abstract_len,input_paragraph,output_paragraph


### Mapping & Joining - csv & json

In [198]:
data[data['pmid'] == '33347023']

# dup pmid - 2 adaptations
# no uid
# !!!version_id unable to map

Unnamed: 0,question,pmid,input_text,target_text,Adaptation_Version,Question_Type
791,2,33347023,Purpose: Several studies have previously repor...,"Dry eye, depression, and treatent of depressio...",2,B
792,2,33347023,Purpose: Several studies have previously repor...,Studies show that there is a link between depr...,2,B


In [199]:
# csv dup adaptation version

dup_list = [
    #train
    '33160639','32775988','26093176','27600582',

    #test
    '21462113','30377698','24259556','3630857','20200254',
    '12090459','21636864','16372518','24619814','17916952',
    '32631287','19306923','28665969',

    #val
    '33347023', '30249798','34397423','15316838','30838456'
]

In [200]:
dj['2']['33347023']['adaptations']

{'adaptation3': {'1': 'Studies show that there is a link between depression treatment and dry eyes.',
  '2': 'This study looks at the tears of depressed people when given different types of drugs that treat depression.',
  '3': 'This study includes 132 people taking antidepressant drugs and 58 people not taking antidepressant drugs for comparison.',
  '4': 'Patients were taking the antidepressant drugs called venlafaxine, duloxetine, escitalopram, and sertraline.',
  '5': 'Patients filled our a form to measure their level of depression.',
  '6': 'The researchers looked at the eyes and tears of the patients.',
  '7': 'Patients filled out a form to measure the dryness of their eyes.',
  '8': '',
  '9': 'Those people taking antidepressant drugs had drier eyes than those that were not taking antidepressant drugs.',
  '10': 'The people taking duloxetine had the driest eyes.',
  '11': 'The people taking antidepressant drugs scored much higher on a test to measure dry eyes.',
  '12': 'This st

### Json - df_jsent vs df_jpara

In [201]:
df_jpara[df_jpara['pmid'] == '33160639']

Unnamed: 0,QID,Question,Question_Type,pmid,Title,adaptation_id,abstract_len,input_paragraph,output_paragraph
37,3,How can i reduce my potassium levels?,C,33160639,Clinical Management of Hyperkalemia,3,10,Hyperkalemia is an electrolyte abnormality wit...,Hyperkalemia is a condition where the potassiu...
38,3,How can i reduce my potassium levels?,C,33160639,Clinical Management of Hyperkalemia,2,10,Hyperkalemia is an electrolyte abnormality wit...,Hyperkalemia (high blood potassium) is an elec...


In [202]:
df_jsent[df_jsent['pmid'] == '33160639']

Unnamed: 0,QID,Question,Question_Type,pmid,Title,adaptation_id,abstract_len,Abstract_sentence_id,Abstract,Adaptation_sentence_id,Adaptation
342,3,How can i reduce my potassium levels?,C,33160639,Clinical Management of Hyperkalemia,3,10,1,Hyperkalemia is an electrolyte abnormality wit...,1,Hyperkalemia is a condition where the potassiu...
343,3,How can i reduce my potassium levels?,C,33160639,Clinical Management of Hyperkalemia,3,10,2,"Despite various guidelines, no universally acc...",2,"Although there are many guidelines, all doctor..."
344,3,How can i reduce my potassium levels?,C,33160639,Clinical Management of Hyperkalemia,3,10,3,"Based on the available evidence, this review i...",3,This study discusses the serious issues and ne...
345,3,How can i reduce my potassium levels?,C,33160639,Clinical Management of Hyperkalemia,3,10,4,Real-world studies are needed for a better und...,4,Real-world studies are needed to understand ho...
346,3,How can i reduce my potassium levels?,C,33160639,Clinical Management of Hyperkalemia,3,10,5,There is a need to improve effective managemen...,5,Doctors need to improve the overall care of pa...
347,3,How can i reduce my potassium levels?,C,33160639,Clinical Management of Hyperkalemia,3,10,6,Monitoring serum K+ should be individualized; ...,6,How often blood potassium levels are checked d...
348,3,How can i reduce my potassium levels?,C,33160639,Clinical Management of Hyperkalemia,3,10,7,Recent clinical studies suggest that the newer...,7,Recent patient studies suggest that the newer ...
349,3,How can i reduce my potassium levels?,C,33160639,Clinical Management of Hyperkalemia,3,10,8,Enhancing the knowledge of primary care physic...,8,Confidence in caring for patients with hyperka...
350,3,How can i reduce my potassium levels?,C,33160639,Clinical Management of Hyperkalemia,3,10,9,"Lastly, the availability of newer K+-binding a...",9,More studies are required in order to know if ...
351,3,How can i reduce my potassium levels?,C,33160639,Clinical Management of Hyperkalemia,3,10,10,Individualized monitoring of serum K+ among pa...,10,Each patient's blood potassium levels should b...


In [203]:
df_jpara['QID'] = df_jpara['QID'].astype(int)
df_jsent['QID'] = df_jsent['QID'].astype(int)

In [204]:
df_jmerged = df_jsent.merge(df_jpara, on=['QID', 'pmid', 'adaptation_id'], how='left')

In [205]:
df_jmerged

Unnamed: 0,QID,Question_x,Question_Type_x,pmid,Title_x,adaptation_id,abstract_len_x,Abstract_sentence_id,Abstract,Adaptation_sentence_id,Adaptation,Question_y,Question_Type_y,Title_y,abstract_len_y,input_paragraph,output_paragraph
0,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,1,Muscle cramps are a common problem characteriz...,1,Muscle cramps are a common problem represented...,What causes muscle spasm?,C,Muscle cramps,7,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...
1,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,2,"These true cramps, which originate from periph...",2,"These true cramps, coming from nerves outside ...",What causes muscle spasm?,C,Muscle cramps,7,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...
2,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,3,"Medical history, physical examination, and a l...",3,"Medical history, physical check-up, and lab sc...",What causes muscle spasm?,C,Muscle cramps,7,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...
3,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,4,"Despite the ""benign"" nature of cramps, many pa...",4,"Despite their harmless nature, cramps are unco...",What causes muscle spasm?,C,Muscle cramps,7,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...
4,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,5,Treatment options are guided both by experienc...,5,Experience and limited medical studies guide t...,What causes muscle spasm?,C,Muscle cramps,7,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9313,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,1,Sickle cell anemia (SCA) is a disease characte...,1,Sickle Cell Anemia (SCA) is a genetic blood di...,What is a gene affected by sickle cell anemia?,B,HbF-promoting polymorphisms may specifically r...,5,Sickle cell anemia (SCA) is a disease characte...,Sickle Cell Anemia (SCA) is a genetic blood di...
9314,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,2,Because of their effects on HbS polymerization...,2,Due to their effects on sickle hemoglobin (HbS...,What is a gene affected by sickle cell anemia?,B,HbF-promoting polymorphisms may specifically r...,5,Sickle cell anemia (SCA) is a disease characte...,Sickle Cell Anemia (SCA) is a genetic blood di...
9315,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,3,The aim of our study was to determine if the n...,3,The aim of our study was to find out if the nu...,What is a gene affected by sickle cell anemia?,B,HbF-promoting polymorphisms may specifically r...,5,Sickle cell anemia (SCA) is a disease characte...,Sickle Cell Anemia (SCA) is a genetic blood di...
9316,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,4,Our results confirmed that alpha-thalassemia p...,4,Results showed that alpha-thalassemia protecte...,What is a gene affected by sickle cell anemia?,B,HbF-promoting polymorphisms may specifically r...,5,Sickle cell anemia (SCA) is a disease characte...,Sickle Cell Anemia (SCA) is a genetic blood di...


# Paragraph Segmentation

In [206]:
df_jpara['input_paragraph']

0      Muscle cramps are a common problem characteriz...
1      The dystonias are a group of disorders charact...
2      Muscle cramps result in continuous, involuntar...
3      Exercise-Associated Muscle Cramps (EAMC) are a...
4      Muscular cramp is a common symptom in healthy ...
                             ...                        
914    Background: Our objective was to investigate t...
915    Fetal hemoglobin (HbF) ameliorates clinical se...
916    Sickle cell disease (SCD) is a monogenic disea...
917    Sickle cell disease (SCD) is a monogenetic dis...
918    Sickle cell anemia (SCA) is a disease characte...
Name: input_paragraph, Length: 919, dtype: object

In [None]:
# 'input_paragraph' full text , 'Abstract' sentences

### Rule based

In [None]:
# # Split text based on custom delimiter
# sentences = text.split("|||")
# # !!! No | \t \n ". "

# t0 = df_jpara['input_paragraph'][0]
# s0 = t0.split(". ")
# for i, s in enumerate(s0, 1):
#     print(f"S {i}: {s}")

In [None]:
# import re

# t0 = df_jpara['input_paragraph'][0]
# # Split text into sentences based on punctuation
# s0 = re.split(r'[.!?]', t0)

# for i, s in enumerate(s0, 1):
#   print(f"S {i}: {s}")

S 1: Muscle cramps are a common problem characterized by a sudden, painful, involuntary contraction of muscle
S 2: These true cramps, which originate from peripheral nerves, may be distinguished from other muscle pain or spasm
S 3: Medical history, physical examination, and a limited laboratory screen help to determine the various causes of muscle cramps
S 4: Despite the "benign" nature of cramps, many patients find the symptom very uncomfortable
S 5: Treatment options are guided both by experience and by a limited number of therapeutic trials
S 6: Quinine sulfate is an effective medication, but the side-effect profile is worrisome, and other membrane-stabilizing drugs are probably just as effective
S 7: Patients will benefit from further studies to better define the pathophysiology of muscle cramps and to find more effective medications with fewer side-effects
S 8: 


In [207]:
df_jpara[df_jpara['pmid']=='34408570']
# example with decimal points

Unnamed: 0,QID,Question,Question_Type,pmid,Title,adaptation_id,abstract_len,input_paragraph,output_paragraph
590,45,Do i keep a pressure sore covered and moist an...,C,34408570,Evaluation of the therapeutic efficacy of acti...,1,10,Introduction: Treatment of decubitus ulcers is...,Treatment of bedsores is a serious medical pro...
591,45,Do i keep a pressure sore covered and moist an...,C,34408570,Evaluation of the therapeutic efficacy of acti...,2,10,Introduction: Treatment of decubitus ulcers is...,Treating decubitus ulcers (bedsores) is a seri...


In [208]:
def t2s(text):
    sentence_pattern = r'(?<=[.!?])(?=[^0-9])'
    sentences = re.split(sentence_pattern, text)

    return sentences

t0 = df_jpara['input_paragraph'][590]
s0 = t2s(t0)

for i, s in enumerate(s0, 0):
  print(f"S {i}: {s}")


S 0: Introduction: Treatment of decubitus ulcers is a grave medical problem.
S 1: In many cases, it is difficult to cure a pressure ulcer, especially when it is deep and extensive, and prognosis is usually unfavourable.
S 2: Treatment of decubitus ulcers requires new specialist dressings, which play an important role in the healing process.
S 3: Aim: To evaluate therapeutic efficacy of active specialist medical dressings in the treatment of decubitus.
S 4: Material and methods: Research involved 40 patients - 18 (45%) women and 22 (55%) men, suffering from decubitus ulcers of different size and depth, localized in the sacral region, lasting from 1.5 to 30 months.
S 5: Patients were randomly assigned to two research groups (20 people each), were treated for 4 weeks with 2 different specialist dressings.
S 6: ATRAUMAN Ag, which contains silver ions, was used in the first group, while paraffin gauze of BACTIGRAS type was used in the second group.
S 7: An assessment of pressure ulcers' hea

In [209]:
rs_list = []

for index, row in df_jpara.iterrows():
  abstract = row['input_paragraph']
  sent_abstract = t2s(abstract)

  for i, sentence in enumerate(sent_abstract, 1):
    row = {
        'QID': df_jpara.at[index, 'QID'],
        'Question': df_jpara.at[index, 'Question'],
        'Question_Type': df_jpara.at[index, 'Question_Type'],
        'pmid': df_jpara.at[index, 'pmid'],
        'Title': df_jpara.at[index, 'Title'],
        'adaptation_id': df_jpara.at[index, 'adaptation_id'],
        'abstract_len': df_jpara.at[index, 'abstract_len'],
        'segmented__sentence_id': i,
        'segmented_sentence': sentence
    }
    rs_list.append(row)

df_s_r = pd.DataFrame(rs_list)

In [210]:
df_s_r

Unnamed: 0,QID,Question,Question_Type,pmid,Title,adaptation_id,abstract_len,segmented__sentence_id,segmented_sentence
0,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,1,Muscle cramps are a common problem characteriz...
1,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,2,"These true cramps, which originate from periph..."
2,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,3,"Medical history, physical examination, and a l..."
3,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,4,"Despite the ""benign"" nature of cramps, many pa..."
4,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,5,Treatment options are guided both by experienc...
...,...,...,...,...,...,...,...,...,...
9662,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,1,Sickle cell anemia (SCA) is a disease characte...
9663,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,2,Because of their effects on HbS polymerization...
9664,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,3,The aim of our study was to determine if the n...
9665,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,4,Our results confirmed that alpha-thalassemia p...


### Pre-trained language model - Spacy

In [211]:
t0 = df_jpara['input_paragraph'][0]
s0 = sent_tokenize(t0)

for i, s in enumerate(s0, 1):
  print(f"S {i}: {s}")

# nltk does not work

S 1: Muscle cramps are a common problem characterized by a sudden, painful, involuntary contraction of muscle.These true cramps, which originate from peripheral nerves, may be distinguished from other muscle pain or spasm.Medical history, physical examination, and a limited laboratory screen help to determine the various causes of muscle cramps.Despite the "benign" nature of cramps, many patients find the symptom very uncomfortable.Treatment options are guided both by experience and by a limited number of therapeutic trials.Quinine sulfate is an effective medication, but the side-effect profile is worrisome, and other membrane-stabilizing drugs are probably just as effective.Patients will benefit from further studies to better define the pathophysiology of muscle cramps and to find more effective medications with fewer side-effects.


In [None]:
# nlp = spacy.load('en_core_web_sm')

# test = data['input_text'][0]

# doc = nlp(test)

# sentences = [sent.text for sent in doc.sents]

# for i, sentence in enumerate(sentences, 1):
#     print(f'Sentence {i}: {sentence}')


Sentence 1: Exercise-Associated Muscle Cramps (EAMC) are a common painful condition of muscle spasms.
Sentence 2: Despite scientists tried to understand the physiological mechanism that underlies these common phenomena, the etiology is still unclear.
Sentence 3: From 1900 to nowadays, the scientific world retracted several times the original hypothesis of heat cramps.
Sentence 4: However, recent literature seems to focus on two potential mechanisms: the dehydration or electrolyte depletion mechanism, and the neuromuscular mechanism.
Sentence 5: The aim of this review is to examine the recent literature, in terms of physiological mechanisms of EAMC.
Sentence 6: A comprehensive search was conducted on PubMed and Google Scholar.
Sentence 7: The following terminology was applied: muscle cramps, neuromuscular hypothesis (or thesis), dehydration hypothesis, Exercise-Associated muscle cramps, nocturnal cramps, muscle spasm, muscle fatigue.
Sentence 8: From the initial literature of 424 manusc

In [None]:
# sentences = pd.DataFrame(columns=['pmid','csv_input_text', 'csv_target_text', 'abstract_sentences', 'adaptation_sentences'])

In [None]:
# for index, row in data.iterrows():
#   abstract = row['input_text']
#   doc_abstract = nlp(abstract)

#   adaptation = row['target_text']
#   doc_adaptation = nlp(adaptation)

#   sent_abstract = [sent.text for sent in doc_abstract.sents]
#   sent_adaptation = [sent.text for sent in doc_adaptation.sents]

#   for i, o in zip(sent_abstract, sent_adaptation):
#     new_row = {'pmid': row['pmid'], 'csv_input_text': row['input_text'], 'csv_target_text': row['target_text'],
#                'abstract_sentences': i, 'adaptation_sentences': o }
#     sentences = sentences.append(new_row, ignore_index=True)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_index=True)
  sentences = sentences.append(new_row, ignore_ind

In [None]:
# sentences.to_csv('segmented_sentences_spacy_v2.csv', index=False)
# for Xiaohui's test model - 22.10.17  (not accurate, better use json instead)

In [212]:
nlp = spacy.load('en_core_web_sm')

In [213]:
df_jpara.head(5)

Unnamed: 0,QID,Question,Question_Type,pmid,Title,adaptation_id,abstract_len,input_paragraph,output_paragraph
0,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...
1,1,What causes muscle spasm?,C,25432724,Diagnosis and treatment of dystonia,2,6,The dystonias are a group of disorders charact...,Dystonias are disorders with a lot of uncontro...
2,1,What causes muscle spasm?,C,29763070,Muscle Cramps,2,7,"Muscle cramps result in continuous, involuntar...",Muscle cramps cause constant and unintended co...
3,1,What causes muscle spasm?,C,29857264,Muscle cramps: A comparison of the two-leading...,2,11,Exercise-Associated Muscle Cramps (EAMC) are a...,Exercise-Associated Muscle Cramps (EAMC) are a...
4,1,What causes muscle spasm?,C,30168894,Muscular cramp: causes and management,2,6,Muscular cramp is a common symptom in healthy ...,"Muscle cramps are common in healthy people, es..."


In [214]:
s_list = []

for index, row in df_jpara.iterrows():
  abstract = row['input_paragraph']
  doc_abstract = nlp(abstract)

  sent_abstract = [sent.text for sent in doc_abstract.sents]

  for i, sentence in enumerate(sent_abstract, 1):
    row = {
        'QID': df_jpara.at[index, 'QID'],
        'Question': df_jpara.at[index, 'Question'],
        'Question_Type': df_jpara.at[index, 'Question_Type'],
        'pmid': df_jpara.at[index, 'pmid'],
        'Title': df_jpara.at[index, 'Title'],
        'adaptation_id': df_jpara.at[index, 'adaptation_id'],
        'abstract_len': df_jpara.at[index, 'abstract_len'],
        'segmented__sentence_id': i,
        'segmented_sentence': sentence
    }
    s_list.append(row)

df_s_lm = pd.DataFrame(s_list)

In [215]:
df_s_lm

Unnamed: 0,QID,Question,Question_Type,pmid,Title,adaptation_id,abstract_len,segmented__sentence_id,segmented_sentence
0,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,1,Muscle cramps are a common problem characteriz...
1,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,2,"These true cramps, which originate from periph..."
2,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,3,"Medical history, physical examination, and a l..."
3,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,4,"Despite the ""benign"" nature of cramps, many pa..."
4,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,5,Treatment options are guided both by experienc...
...,...,...,...,...,...,...,...,...,...
8267,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,1,Sickle cell anemia (SCA) is a disease characte...
8268,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,2,Because of their effects on HbS polymerization...
8269,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,3,The aim of our study was to determine if the n...
8270,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,4,Our results confirmed that alpha-thalassemia p...


# Model evaluation

In [None]:
# 1. score for sentences

# 2. text similarity - break by space (match of words) average



### 1. Score
- sentences for each text


In [216]:
df_jmerged['sent_len'] = df_jmerged['Abstract'].apply(lambda x: len(x))

In [217]:
df_jmerged

Unnamed: 0,QID,Question_x,Question_Type_x,pmid,Title_x,adaptation_id,abstract_len_x,Abstract_sentence_id,Abstract,Adaptation_sentence_id,Adaptation,Question_y,Question_Type_y,Title_y,abstract_len_y,input_paragraph,output_paragraph,sent_len
0,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,1,Muscle cramps are a common problem characteriz...,1,Muscle cramps are a common problem represented...,What causes muscle spasm?,C,Muscle cramps,7,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...,105
1,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,2,"These true cramps, which originate from periph...",2,"These true cramps, coming from nerves outside ...",What causes muscle spasm?,C,Muscle cramps,7,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...,112
2,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,3,"Medical history, physical examination, and a l...",3,"Medical history, physical check-up, and lab sc...",What causes muscle spasm?,C,Muscle cramps,7,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...,125
3,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,4,"Despite the ""benign"" nature of cramps, many pa...",4,"Despite their harmless nature, cramps are unco...",What causes muscle spasm?,C,Muscle cramps,7,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...,89
4,1,What causes muscle spasm?,C,15902691,Muscle cramps,2,7,5,Treatment options are guided both by experienc...,5,Experience and limited medical studies guide t...,What causes muscle spasm?,C,Muscle cramps,7,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...,94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9313,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,1,Sickle cell anemia (SCA) is a disease characte...,1,Sickle Cell Anemia (SCA) is a genetic blood di...,What is a gene affected by sickle cell anemia?,B,HbF-promoting polymorphisms may specifically r...,5,Sickle cell anemia (SCA) is a disease characte...,Sickle Cell Anemia (SCA) is a genetic blood di...,88
9314,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,2,Because of their effects on HbS polymerization...,2,Due to their effects on sickle hemoglobin (HbS...,What is a gene affected by sickle cell anemia?,B,HbF-promoting polymorphisms may specifically r...,5,Sickle cell anemia (SCA) is a disease characte...,Sickle Cell Anemia (SCA) is a genetic blood di...,169
9315,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,3,The aim of our study was to determine if the n...,3,The aim of our study was to find out if the nu...,What is a gene affected by sickle cell anemia?,B,HbF-promoting polymorphisms may specifically r...,5,Sickle cell anemia (SCA) is a disease characte...,Sickle Cell Anemia (SCA) is a genetic blood di...,178
9316,75,What is a gene affected by sickle cell anemia?,B,33216016,HbF-promoting polymorphisms may specifically r...,3,5,4,Our results confirmed that alpha-thalassemia p...,4,Results showed that alpha-thalassemia protecte...,What is a gene affected by sickle cell anemia?,B,HbF-promoting polymorphisms may specifically r...,5,Sickle cell anemia (SCA) is a disease characte...,Sickle Cell Anemia (SCA) is a genetic blood di...,148


In [218]:
df_jmerged['sent_len_t2s'] = df_jmerged['Abstract'].apply(lambda x: len(t2s(x)))

In [219]:
def spacy2s(text):
  doc_abstract = nlp(text)
  sentences = [sent.text for sent in doc_abstract.sents]

  return sentences

t0 = df_jpara['input_paragraph'][590]
s0 = spacy2s(t0)

for i, s in enumerate(s0, 1):
  print(f"S {i}: {s}")

S 1: Introduction: Treatment of decubitus ulcers is a grave medical problem.
S 2: In many cases, it is difficult to cure a pressure ulcer, especially when it is deep and extensive, and prognosis is usually unfavourable.
S 3: Treatment of decubitus ulcers requires new specialist dressings, which play an important role in the healing process.
S 4: Aim: To evaluate therapeutic efficacy of active specialist medical dressings in the treatment of decubitus.
S 5: Material and methods: Research involved 40 patients - 18 (45%) women and 22 (55%) men, suffering from decubitus ulcers of different size and depth, localized in the sacral region, lasting from 1.5 to 30 months.
S 6: Patients were randomly assigned to two research groups (20 people each), were treated for 4 weeks with 2 different specialist dressings.
S 7: ATRAUMAN Ag, which contains silver ions, was used in the first group, while paraffin gauze of BACTIGRAS type was used in the second group.
S 8: An assessment of pressure ulcers' hea

In [220]:
df_jmerged['sent_len_t2s'] = df_jmerged['input_paragraph'].apply(lambda x: len(t2s(x)))

In [221]:
df_jmerged['sent_len_spacy2s'] = df_jmerged['input_paragraph'].apply(lambda x: len(spacy2s(x)))

In [222]:
df_jmerged_p = df_jmerged.pivot_table(index = ['QID', 'pmid', 'adaptation_id'],
                                      values=['abstract_len_x', 'sent_len_t2s', 'sent_len_spacy2s'],
                                      aggfunc={'abstract_len_x': 'count', 'sent_len_t2s': 'mean', 'sent_len_spacy2s': 'mean'})
df_jmerged_p

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,abstract_len_x,sent_len_spacy2s,sent_len_t2s
QID,pmid,adaptation_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,15902691,2,7,7,7
1,25432724,2,6,6,6
1,29763070,2,7,7,7
1,29857264,2,11,10,11
1,30168894,2,6,6,6
...,...,...,...,...,...
75,31322815,3,10,6,10
75,32447424,3,13,6,13
75,32772141,3,11,11,11
75,33072979,3,8,8,8


In [223]:
correct_predictions = (df_jmerged_p['abstract_len_x'] == df_jmerged_p['sent_len_t2s']).sum()
accuracy_r = correct_predictions / len(df_jmerged_p)
print("Accuracy:", accuracy_r)

Accuracy: 0.8443960826985855


In [224]:
correct_predictions = (df_jmerged_p['abstract_len_x'] == df_jmerged_p['sent_len_spacy2s']).sum()
accuracy_s = correct_predictions / len(df_jmerged_p)
print("Accuracy:", accuracy_s)

Accuracy: 0.36887921653971706


In [225]:
mse_r = mean_squared_error(df_jmerged_p['abstract_len_x'], df_jmerged_p['sent_len_t2s'])

print("Mean Squared Error:", mse_r)

Mean Squared Error: 2.571273122959739


In [226]:
mse_s = mean_squared_error(df_jmerged_p['abstract_len_x'], df_jmerged_p['sent_len_spacy2s'])

print("Mean Squared Error:", mse_s)

Mean Squared Error: 4.404787812840044


### 2. Text similarity
  - break by space (match of words) average

In [227]:
df_actual = df_jmerged[['QID','pmid','adaptation_id','abstract_len_x','Adaptation_sentence_id','Abstract']]

In [228]:
df_actual

Unnamed: 0,QID,pmid,adaptation_id,abstract_len_x,Adaptation_sentence_id,Abstract
0,1,15902691,2,7,1,Muscle cramps are a common problem characteriz...
1,1,15902691,2,7,2,"These true cramps, which originate from periph..."
2,1,15902691,2,7,3,"Medical history, physical examination, and a l..."
3,1,15902691,2,7,4,"Despite the ""benign"" nature of cramps, many pa..."
4,1,15902691,2,7,5,Treatment options are guided both by experienc...
...,...,...,...,...,...,...
9313,75,33216016,3,5,1,Sickle cell anemia (SCA) is a disease characte...
9314,75,33216016,3,5,2,Because of their effects on HbS polymerization...
9315,75,33216016,3,5,3,The aim of our study was to determine if the n...
9316,75,33216016,3,5,4,Our results confirmed that alpha-thalassemia p...


In [229]:
df_actual.rename(columns={'abstract_len_x': 'abstract_len','Adaptation_sentence_id': 'segmented__sentence_id'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_actual.rename(columns={'abstract_len_x': 'abstract_len','Adaptation_sentence_id': 'segmented__sentence_id'}, inplace=True)


In [230]:
df_rule = df_s_r[['QID','pmid','adaptation_id','abstract_len','segmented__sentence_id','segmented_sentence']]
# rule based

In [231]:
df_rule

Unnamed: 0,QID,pmid,adaptation_id,abstract_len,segmented__sentence_id,segmented_sentence
0,1,15902691,2,7,1,Muscle cramps are a common problem characteriz...
1,1,15902691,2,7,2,"These true cramps, which originate from periph..."
2,1,15902691,2,7,3,"Medical history, physical examination, and a l..."
3,1,15902691,2,7,4,"Despite the ""benign"" nature of cramps, many pa..."
4,1,15902691,2,7,5,Treatment options are guided both by experienc...
...,...,...,...,...,...,...
9662,75,33216016,3,5,1,Sickle cell anemia (SCA) is a disease characte...
9663,75,33216016,3,5,2,Because of their effects on HbS polymerization...
9664,75,33216016,3,5,3,The aim of our study was to determine if the n...
9665,75,33216016,3,5,4,Our results confirmed that alpha-thalassemia p...


In [232]:
df_spacy = df_s_lm[['QID','pmid','adaptation_id','abstract_len','segmented__sentence_id','segmented_sentence']]
# spacy

In [234]:
df_actual['segmented__sentence_id'] = df_actual['segmented__sentence_id'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_actual['segmented__sentence_id'] = df_actual['segmented__sentence_id'].astype(int)


In [235]:
df_sent_comparison = df_actual.merge(df_rule, on=['QID', 'pmid', 'adaptation_id','segmented__sentence_id'], how='left')

In [236]:
df_sent_comparison.rename(columns={'segmented_sentence': 'segmented_sentence_rule'}, inplace=True)

In [237]:
df_sent_comparison = df_sent_comparison.merge(df_spacy, on=['QID', 'pmid', 'adaptation_id','segmented__sentence_id'], how='left')

In [238]:
df_sent_comparison.rename(columns={'segmented_sentence': 'segmented_sentence_spacy'}, inplace=True)

In [239]:
df_sent_comparison

Unnamed: 0,QID,pmid,adaptation_id,abstract_len_x,segmented__sentence_id,Abstract,abstract_len_y,segmented_sentence_rule,abstract_len,segmented_sentence_spacy
0,1,15902691,2,7,1,Muscle cramps are a common problem characteriz...,7.0,Muscle cramps are a common problem characteriz...,7.0,Muscle cramps are a common problem characteriz...
1,1,15902691,2,7,2,"These true cramps, which originate from periph...",7.0,"These true cramps, which originate from periph...",7.0,"These true cramps, which originate from periph..."
2,1,15902691,2,7,3,"Medical history, physical examination, and a l...",7.0,"Medical history, physical examination, and a l...",7.0,"Medical history, physical examination, and a l..."
3,1,15902691,2,7,4,"Despite the ""benign"" nature of cramps, many pa...",7.0,"Despite the ""benign"" nature of cramps, many pa...",7.0,"Despite the ""benign"" nature of cramps, many pa..."
4,1,15902691,2,7,5,Treatment options are guided both by experienc...,7.0,Treatment options are guided both by experienc...,7.0,Treatment options are guided both by experienc...
...,...,...,...,...,...,...,...,...,...,...
9313,75,33216016,3,5,1,Sickle cell anemia (SCA) is a disease characte...,5.0,Sickle cell anemia (SCA) is a disease characte...,5.0,Sickle cell anemia (SCA) is a disease characte...
9314,75,33216016,3,5,2,Because of their effects on HbS polymerization...,5.0,Because of their effects on HbS polymerization...,5.0,Because of their effects on HbS polymerization...
9315,75,33216016,3,5,3,The aim of our study was to determine if the n...,5.0,The aim of our study was to determine if the n...,5.0,The aim of our study was to determine if the n...
9316,75,33216016,3,5,4,Our results confirmed that alpha-thalassemia p...,5.0,Our results confirmed that alpha-thalassemia p...,5.0,Our results confirmed that alpha-thalassemia p...


In [240]:
df_sent_comparison['accuracy_rule'] = df_sent_comparison['Abstract'] == df_sent_comparison['segmented_sentence_rule']

TA = (df_sent_comparison['accuracy_rule'].sum() / len(df_sent_comparison)) * 100

print("Text Accuracy with rule: {:.2f}%".format(TA))

Text Accuracy with rule: 88.81%


In [241]:
df_sent_comparison['accuracy_spacy'] = df_sent_comparison['Abstract'] == df_sent_comparison['segmented_sentence_spacy']

# Calculate accuracy as the percentage of correct predictions
TA = (df_sent_comparison['accuracy_spacy'].sum() / len(df_sent_comparison)) * 100

print("Text Accuracy with spacy: {:.2f}%".format(TA))


Text Accuracy with spacy: 43.97%


In [242]:
df_sent_comparison.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9318 entries, 0 to 9317
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   QID                       9318 non-null   int64  
 1   pmid                      9318 non-null   object 
 2   adaptation_id             9318 non-null   int64  
 3   abstract_len_x            9318 non-null   int64  
 4   segmented__sentence_id    9318 non-null   int64  
 5   Abstract                  9318 non-null   object 
 6   abstract_len_y            9303 non-null   float64
 7   segmented_sentence_rule   9303 non-null   object 
 8   abstract_len              8161 non-null   float64
 9   segmented_sentence_spacy  8161 non-null   object 
 10  accuracy_rule             9318 non-null   bool   
 11  accuracy_spacy            9318 non-null   bool   
dtypes: bool(2), float64(2), int64(4), object(4)
memory usage: 819.0+ KB


In [243]:
df_sent_comparison['Abstract'] = df_sent_comparison['Abstract'].astype(str)
df_sent_comparison['segmented_sentence_rule'] = df_sent_comparison['segmented_sentence_rule'].astype(str)
df_sent_comparison['segmented_sentence_spacy'] = df_sent_comparison['segmented_sentence_spacy'].astype(str)

In [244]:
df_sent_comparison['word_len_act'] = df_sent_comparison['Abstract'].apply(lambda x: len(x))
df_sent_comparison['word_len_rule'] = df_sent_comparison['segmented_sentence_rule'].apply(lambda x: len(x))
df_sent_comparison['word_len_spacy'] = df_sent_comparison['segmented_sentence_spacy'].apply(lambda x: len(x))

In [245]:
mse_rw = mean_squared_error(df_sent_comparison['word_len_act'], df_sent_comparison['word_len_rule'])

print("Mean Squared Error for rule:", mse_rw)

Mean Squared Error for rule: 1556.6333977248337


In [246]:
mse_sw = mean_squared_error(df_sent_comparison['word_len_act'], df_sent_comparison['word_len_spacy'])

print("Mean Squared Error for spacy:", mse_sw)

Mean Squared Error for spacy: 14910.399978536167
