In [1]:
from data import load_papers 
import numpy as np
from bert_score import BERTScorer
from rouge_score import rouge_scorer
from langchain_community.llms import Ollama
import re
separator = "\n---SEPARATOR---\n"

  from .autonotebook import tqdm as notebook_tqdm


# Evaluate summaries

In [2]:
# Load the 'test.txt' split; load_papers returns a pair that is unpacked into abstracts and texts.
# Both sequences are indexed in parallel by the scoring cells below.
abstracts, texts = load_papers(split='test.txt')

In [3]:
def get_summaries(filename):
    """Load the generated summaries stored in ``summaries/<filename>``.

    The whole file is read at once and cut at every occurrence of the
    module-level ``separator`` string.

    Args:
        filename: Name of a file inside the ``summaries/`` directory.

    Returns:
        list[str]: The pieces between separators, in file order. Pieces are
        returned unmodified (nothing is stripped, empty pieces are kept).
    """
    with open('summaries/' + filename, 'r') as file:
        summaries = file.read()
    # str.split already returns a new list, so no element-by-element copy is needed.
    return summaries.split(separator)

In [4]:
# Load the summaries created in create_summaries.ipynb.
# Available files: simple or complex prompt, each with 10 or 100 summaries.
# Exactly one `filename` assignment should be active; the others stay commented out.
#filename ="write_summary_complex.txt"
#filename = "write_summary_complex100.txt"
filename = "write_summary100.txt"
#filename ="write_summary.txt"
summaries_list = get_summaries(filename)

### Bert scores

In [5]:
# Example texts
reference = "This is a reference text example."
candidate = "This is a candidate text example."
# BERTScore calculation
# scorer_bert is created once here and reused by get_bert further down.
scorer_bert = BERTScorer(model_type='bert-base-uncased')
# score() is called with a list of candidates and a list of references;
# its three results are unpacked as precision, recall and F1.
P, R, F1 = scorer_bert.score([candidate], [reference])
print(f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

### Outputs : BERTScore Precision: 0.9258, Recall: 0.9258, F1: 0.9258

BERTScore Precision: 0.9258, Recall: 0.9258, F1: 0.9258


In [6]:
#Get all bert scores
def get_bert(candidate, reference, filename, limit=100):
    """Score candidate/reference pairs with BERTScore and save the score table.

    Pair ``i`` is scored with ``scorer_bert.score([candidate[i]], [reference[i]])``
    using the module-level ``scorer_bert``. The table is written to
    ``results/<filename>_bert.txt`` with four decimals per value.

    Args:
        candidate: Sequence of candidate texts.
        reference: Sequence of reference texts, aligned with ``candidate``.
        filename: Label used to build the name of the output file.
        limit: Maximum number of pairs to score (default 100, the value that
            was previously hard-coded). ``None`` scores every candidate.

    Returns:
        numpy.matrix: One ``[precision, recall, F1]`` row per scored pair
        (rounded to 4 decimals), followed by four rows holding the
        column-wise mean, standard deviation, minimum and maximum.

    Raises:
        ValueError: If there is no pair to score.
        IndexError: If ``reference`` has fewer entries than are scored.
    """
    import os  # local import so the function does not rely on a new top-level import

    num = len(candidate) if limit is None else min(len(candidate), limit)
    if num <= 0:
        raise ValueError("get_bert needs at least one candidate/reference pair")

    rows = []
    for i in range(num):
        P, R, F1 = scorer_bert.score([candidate[i]], [reference[i]])
        rows.append([round(float(P[0]), 4), round(float(R[0]), 4), round(float(F1[0]), 4)])
        if i % 10 == 0:
            print(i)  # coarse progress indicator

    per_pair = np.array(rows)
    # Summary rows are appended below the per-pair rows, in this fixed order.
    table = np.vstack([
        per_pair,
        per_pair.mean(axis=0),
        per_pair.std(axis=0),
        per_pair.min(axis=0),
        per_pair.max(axis=0),
    ])

    # Create the output directory up front so a long scoring run is not lost at save time.
    os.makedirs('results', exist_ok=True)
    np.savetxt('results/' + filename + '_bert.txt', table, fmt='%.4f')
    return np.matrix(table)
# Score the generated summaries against the abstracts; get_bert also writes the
# table to results/<filename>_bert.txt.
scores = get_bert(summaries_list,abstracts,filename)
#print(scores)
    

0
10
20
30
40
50
60
70
80
90


In [7]:
# Same scoring, but each summary is compared with the matching entry of `texts`
# (presumably the full paper text - confirm against load_papers) instead of its abstract.
scores2 = get_bert(summaries_list,texts,filename+'compare_to_text')
print(scores2)

0
10
20
30
40
50
60
70
80
90
[[0.6494     0.5555     0.5988    ]
 [0.4338     0.4683     0.4504    ]
 [0.6656     0.6145     0.639     ]
 [0.6019     0.6115     0.6066    ]
 [0.6415     0.6309     0.6362    ]
 [0.6254     0.5556     0.5885    ]
 [0.6115     0.5923     0.6017    ]
 [0.504      0.4649     0.4837    ]
 [0.5786     0.4993     0.536     ]
 [0.6253     0.6571     0.6408    ]
 [0.6154     0.5374     0.5737    ]
 [0.6649     0.609      0.6358    ]
 [0.6218     0.562      0.5904    ]
 [0.5457     0.5247     0.535     ]
 [0.7395     0.6829     0.7101    ]
 [0.5018     0.4989     0.5004    ]
 [0.6507     0.6401     0.6454    ]
 [0.5334     0.4612     0.4947    ]
 [0.5399     0.5358     0.5379    ]
 [0.6065     0.531      0.5662    ]
 [0.6239     0.577      0.5995    ]
 [0.636      0.5863     0.6101    ]
 [0.565      0.5034     0.5324    ]
 [0.5544     0.5626     0.5585    ]
 [0.574      0.6022     0.5878    ]
 [0.5548     0.544      0.5494    ]
 [0.6035     0.5517     0.5764    ]

In [8]:
# Baseline: the abstracts themselves are scored against `texts`.
# The result does not depend on the generated summaries; `filename` is only
# used to build the name of the output file.
scores3 = get_bert(abstracts,texts,filename+'test_original')
print(scores3)

0
10
20
30
40
50
60
70
80
90
[[0.7037     0.6132     0.6553    ]
 [0.7381     0.5363     0.6212    ]
 [0.7686     0.5992     0.6734    ]
 [0.6994     0.5158     0.5937    ]
 [0.5539     0.5665     0.5601    ]
 [0.6456     0.6099     0.6272    ]
 [0.6763     0.6022     0.6371    ]
 [0.6565     0.5413     0.5934    ]
 [0.7078     0.6556     0.6807    ]
 [0.7343     0.5645     0.6383    ]
 [0.6469     0.542      0.5899    ]
 [0.6396     0.5855     0.6113    ]
 [0.6478     0.5826     0.6135    ]
 [0.7837     0.6727     0.724     ]
 [0.5982     0.5237     0.5585    ]
 [0.6659     0.5695     0.614     ]
 [0.7474     0.6165     0.6757    ]
 [0.5989     0.4867     0.537     ]
 [0.6257     0.6088     0.6171    ]
 [0.5646     0.4808     0.5194    ]
 [0.5387     0.535      0.5368    ]
 [0.6935     0.5223     0.5958    ]
 [0.6102     0.5893     0.5996    ]
 [0.5654     0.4939     0.5272    ]
 [0.6054     0.5783     0.5915    ]
 [0.6193     0.5101     0.5594    ]
 [0.6605     0.5901     0.6233    ]

### BERTScore mini-examples

In [25]:
# Example of how BERTScore works

# Example texts
reference = "This is a reference text example."
candidate = "This is a candidate text example."
# BERTScore calculation
# NOTE: this builds a second scorer with the same model_type as scorer_bert;
# the mini-example cells below use this `scorer` object.
scorer = BERTScorer(model_type='bert-base-uncased')
P, R, F1 = scorer.score([candidate], [reference])
print(f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

### Outputs : BERTScore Precision: 0.9258, Recall: 0.9258, F1: 0.9258

BERTScore Precision: 0.9258, Recall: 0.9258, F1: 0.9258


In [26]:
# How close are the summaries to the abstract? Which summary scores best against the first abstract?
# Iterating over a slice (instead of range(10)) keeps the cell working when
# fewer than 10 summaries were loaded.
for summary in summaries_list[:10]:
    P, R, F1 = scorer.score([summary], [abstracts[0]])
    print(f"BERTScore Precision: {P[0]:.4f}, Recall: {R[0]:.4f}, F1: {F1[0]:.4f}")
# Yes: the summaries of the other papers are less similar than the one written for this paper.

BERTScore Precision: 0.6683, Recall: 0.6563, F1: 0.6623
BERTScore Precision: 0.3837, Recall: 0.4395, F1: 0.4097
BERTScore Precision: 0.5076, Recall: 0.5019, F1: 0.5048
BERTScore Precision: 0.4786, Recall: 0.5529, F1: 0.5131
BERTScore Precision: 0.4802, Recall: 0.4910, F1: 0.4855
BERTScore Precision: 0.5054, Recall: 0.5012, F1: 0.5033
BERTScore Precision: 0.4818, Recall: 0.5386, F1: 0.5086
BERTScore Precision: 0.4229, Recall: 0.4188, F1: 0.4209
BERTScore Precision: 0.5127, Recall: 0.4940, F1: 0.5032
BERTScore Precision: 0.4702, Recall: 0.5328, F1: 0.4996


In [27]:
#Remove last sentence
similar_summary = """the short - term periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are discussed . for these data 
 the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . 
 a new method of the diagnosis of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . 
 it proves against the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity period of the solar cycle 16 in the northern hemisphere . 
"""
P, R, F1 = scorer.score([similar_summary], [abstracts[0]])
print(f"BERTScore Precision: {P[0]:.4f}, Recall: {R[0]:.4f}, F1: {F1[0]:.4f}")

BERTScore Precision: 0.9918, Recall: 0.9585, F1: 0.9748


In [28]:
# No summary
similar_summary = """This is about."""
P, R, F1 = scorer.score([similar_summary], [abstracts[0]])
print(f"BERTScore Precision: {P[0]:.4f}, Recall: {R[0]:.4f}, F1: {F1[0]:.4f}")

BERTScore Precision: 0.3344, Recall: 0.1858, F1: 0.2389


In [29]:
""" Replace synonyms:
short - term -> short-duration 
discussed -> examined
diagnosis -> evaluation
proves against -> disproves
"""
similar_summary = """ the short-duration periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are examined. for these data 
 the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . 
 a new method of the evaluation of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . 
 it disproves the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity period of the solar cycle 16 in the northern hemisphere . 
 however , a similar analysis for data from the southern hemisphere indicates that there is the periodicity of about @xmath0 days in sunspot area data in the maximum activity period of the cycle 16 only . 
"""
P, R, F1 = scorer.score([similar_summary], [abstracts[0]])
print(f"BERTScore Precision: {P[0]:.4f}, Recall: {R[0]:.4f}, F1: {F1[0]:.4f}")
#Very good: even after replacing 4 terms with synonyms the score is still high.

#only remove these words
similar_summary = """ the short periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are. for these data 
 the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . 
 a new method of the of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . 
 it the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity period of the solar cycle 16 in the northern hemisphere . 
 however , a similar analysis for data from the southern hemisphere indicates that there is the periodicity of about @xmath0 days in sunspot area data in the maximum activity period of the cycle 16 only . 
"""
P, R, F1 = scorer.score([similar_summary], [abstracts[0]])
print(f"BERTScore Precision: {P[0]:.4f}, Recall: {R[0]:.4f}, F1: {F1[0]:.4f}")
#Here the score gets worse.

BERTScore Precision: 0.9850, Recall: 0.9880, F1: 0.9865
BERTScore Precision: 0.9717, Recall: 0.9638, F1: 0.9677


In [30]:
6440+203037+6436 

215913

### Rouge score

In [9]:
# ROUGE-L scorer (with stemming enabled) used by get_rouge and the ROUGE mini-examples
scorer_rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
#scores = scorer.score(reference_summary, candidate_summary)

def get_rouge(candidate, reference, filename, limit=None):
    """Compute ROUGE-L for candidate/reference pairs and save the score table.

    Pair ``i`` is scored with ``scorer_rouge.score(reference[i], candidate[i])``
    (reference first, candidate second) using the module-level
    ``scorer_rouge``; the ``'rougeL'`` entry of the result is unpacked into
    three values. The table is written to ``results/<filename>_rouge.txt``
    with four decimals per value.

    Args:
        candidate: Sequence of candidate texts.
        reference: Sequence of reference texts, aligned with ``candidate``.
        filename: Label used to build the name of the output file.
        limit: Optional maximum number of pairs to score. The default
            ``None`` scores every candidate, as before.

    Returns:
        numpy.matrix: One ``[precision, recall, F1]`` row per scored pair
        (rounded to 4 decimals), followed by four rows holding the
        column-wise mean, standard deviation, minimum and maximum.

    Raises:
        ValueError: If there is no pair to score.
        IndexError: If ``reference`` has fewer entries than are scored.
    """
    import os  # local import so the function does not rely on a new top-level import

    num = len(candidate) if limit is None else min(len(candidate), limit)
    if num <= 0:
        raise ValueError("get_rouge needs at least one candidate/reference pair")

    rows = []
    for i in range(num):
        P, R, F1 = scorer_rouge.score(reference[i], candidate[i])['rougeL']
        rows.append([round(float(P), 4), round(float(R), 4), round(float(F1), 4)])

    per_pair = np.array(rows)
    # Summary rows are appended below the per-pair rows, in this fixed order.
    table = np.vstack([
        per_pair,
        per_pair.mean(axis=0),
        per_pair.std(axis=0),
        per_pair.min(axis=0),
        per_pair.max(axis=0),
    ])

    # Create the output directory up front so the computed scores are not lost at save time.
    os.makedirs('results', exist_ok=True)
    np.savetxt('results/' + filename + '_rouge.txt', table, fmt='%.4f')
    return np.matrix(table)
# ROUGE-L of the generated summaries against the abstracts; get_rouge also
# writes the table to results/<filename>_rouge.txt.
scores = get_rouge(summaries_list,abstracts,filename)
print(scores)

[[0.2809     0.2513     0.2653    ]
 [0.0511     0.1538     0.0767    ]
 [0.1101     0.2927     0.16      ]
 [0.0714     0.3919     0.1208    ]
 [0.1449     0.2727     0.1893    ]
 [0.1902     0.2335     0.2097    ]
 [0.1062     0.2244     0.1442    ]
 [0.1102     0.1667     0.1327    ]
 [0.1696     0.2241     0.1931    ]
 [0.1034     0.5652     0.1749    ]
 [0.1071     0.36       0.1651    ]
 [0.1383     0.2063     0.1656    ]
 [0.2461     0.2487     0.2474    ]
 [0.0972     0.2184     0.1345    ]
 [0.1232     0.2877     0.1725    ]
 [0.0788     0.1798     0.1096    ]
 [0.1587     0.3232     0.2129    ]
 [0.0663     0.1912     0.0985    ]
 [0.1166     0.2149     0.1512    ]
 [0.0939     0.2875     0.1415    ]
 [0.1337     0.1862     0.1556    ]
 [0.1394     0.3763     0.2035    ]
 [0.2137     0.122      0.1553    ]
 [0.0952     0.38       0.1523    ]
 [0.1429     0.2371     0.1783    ]
 [0.0988     0.2446     0.1408    ]
 [0.1893     0.2063     0.1974    ]
 [0.0962     0.3415     0.15

In [10]:
# ROUGE-L of the generated summaries against `texts`; note that this rebinds `scores`.
scores = get_rouge(summaries_list,texts,filename+'compare_to_text')

In [11]:
# Baseline: ROUGE-L of the abstracts against `texts`; `filename` only labels the output file.
scores = get_rouge(abstracts,texts,filename+'test_original')

### ROUGE mini-examples

In [15]:
#
#Remove last sentence
similar_summary = """the short - term periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are discussed . for these data 
 the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . 
 a new method of the diagnosis of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . 
 it proves against the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity period of the solar cycle 16 in the northern hemisphere . 
"""
P, R, F1 = scorer_rouge.score(abstracts[0], similar_summary)['rougeL']
print(f"RougeScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f}")

RougeScore Precision: 1.0000, Recall: 0.8291, F1: 0.9066


In [19]:
# No summary
similar_summary = """This is about."""
P, R, F1 = scorer_rouge.score(abstracts[0], similar_summary)['rougeL']
print(f"RougeScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f}")
#Good: this is not a summary, and the F1 value is correspondingly very low. Precision is 1 because all words of similar_summary are in abstracts[0].

RougeScore Precision: 1.0000, Recall: 0.0151, F1: 0.0297


In [20]:
""" Replace synonyms:
short - term -> short-duration 
discussed -> examined
diagnosis -> evaluation
proves against -> disproves
"""
similar_summary = """ the short-duration periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are examined. for these data 
 the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . 
 a new method of the evaluation of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . 
 it disproves the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity period of the solar cycle 16 in the northern hemisphere . 
 however , a similar analysis for data from the southern hemisphere indicates that there is the periodicity of about @xmath0 days in sunspot area data in the maximum activity period of the cycle 16 only . 
"""
P, R, F1 = scorer_rouge.score(abstracts[0], similar_summary)['rougeL']
print(f"RougeScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f}")
#After replacing 4 terms with synonyms, precision and recall are slightly lower.

#only remove these words
similar_summary = """ the short periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are. for these data 
 the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . 
 a new method of the of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . 
 it the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity period of the solar cycle 16 in the northern hemisphere . 
 however , a similar analysis for data from the southern hemisphere indicates that there is the periodicity of about @xmath0 days in sunspot area data in the maximum activity period of the cycle 16 only . 
"""
P, R, F1 = scorer_rouge.score(abstracts[0], similar_summary)['rougeL']
print(f"RougeScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f}")
#Very bad. Even though the summary is worse, the score is higher. This is because the precision is better.
#The precision is 1 because all of the words of similar_summary are in abstracts[0].
#The recall is the same in both cases because the number of matching words is the same.


RougeScore Precision: 0.9798, Recall: 0.9749, F1: 0.9773
RougeScore Precision: 1.0000, Recall: 0.9749, F1: 0.9873


In [21]:
0.713+0.597+0.428+0.408

2.146

### Compare the summaries from the 2 prompts

In [7]:
# Summaries stored in write_summary100.txt (referred to as the first prompt below).
# Note: this rebinds the global `filename` used by the scoring cells.
filename ="write_summary100.txt"
s1 = get_summaries(filename)

In [9]:
# Summaries stored in write_summary_complex100.txt (referred to as the second prompt below).
# Note: this rebinds the global `filename` used by the scoring cells.
filename ="write_summary_complex100.txt"
s2 = get_summaries(filename)

In [10]:
print(s1[0])
print(s2[0])
print(abstracts[0])
#The second is longer and includes more information about the findings and conclusion. The first only describes the methods used.

 The text appears to be discussing the results of an analysis of sunspot data, specifically looking for periodicities in sunspot area fluctuations during the maximum activity period of solar cycle 16. The authors used wavelet analysis and autocorrelation functions to investigate the existence of a periodicity around 155 days. They found that the dispersion of points related to this periodicity was large, making it difficult to confirm its existence. They also compared the autocorrelation functions and periodograms of sunspot area fluctuations from each solar hemisphere separately during the maximum activity period. The results suggested that there might be a statistically significant positive peak in the interval of 155-165 days for the southern hemisphere, but the resolution of the periodogram was not sufficient to make a definitive conclusion. The authors also noted that power spectrum analysis alone may not be sufficient to detect true periodicities and suggested using both correlat

In [11]:
print(s1[1])
print(s2[1])
print(abstracts[1])
#Both times the paper was interpreted as a list of papers. There was no summary.

 It appears that you have provided a list of references related to gravitational physics and astrophysics, specifically those dealing with topics such as black holes, gravitational waves, and cosmology. Here is the information extracted from the given references:  1. Jackiw and Pi (2003) - Phys. Rev. D **68**, 104012 (gr-qc/0308071) 2. Satoh, Kanno, and Soda (2008) - Phys. Rev. D **77**, 023526 (astro-ph/07063585) 3. Contaldi, Magueijo, and Smolin (2008) - Phys. Rev. Lett. **101**, 141101 (astro-ph/08063082) 4. Takahashi and Soda (2009) - Phys. Rev. Lett. **102**, 231301 (hep-th/09040554) 5. Cook and Sorbo (2012) - Phys. Rev. D **85**, 023534; **86**, 069901 6. Obata, Miura, and Soda (2015) - Phys. Rev. D **92**, 063516 (astro-ph/14127620) 7. Lightman et al. (1979) - Problem Book in Relativity and Gravitation, Princeton University Press 8. Maggiore (2008) - Gravitational Waves: Theory and Experiments, Oxford University Press 9. Rybicki and Lightman (1979) - Radiative Processes in Astro

In [12]:
print(s1[2])
print("-")
print(s2[2])
print("--")
print(abstracts[2])
#Both good summaries, catching the idea of the paper, similar to the abstract. Second one attempts to give a formula but fails. 

 This text appears to be a scientific research paper on the topic of nuclear physics, specifically focusing on the calculation of decay half-lives for alpha decays in atomic nuclei and the penetration of potential barriers. The authors have developed a new analytical formula based on the Wentzel-Kramers-Brillouin (WKB) approximation to calculate the barrier penetrability, taking into account the influence of the long-range Coulomb tail in the barrier potential.  The text includes a comparison of the results obtained using different approaches, including the parabolic approximation and the WKB approach, for various isotopes of Po (polonium). The authors have assumed the barrier potential to be spherical for most cases, and they have shown that their new formula gives very good results for these spherical nuclei.  The text also discusses the importance of evaluating the integral in the equation for the penetrability and provides an analytical expression for it. The computation was suppor

In [13]:
print(s1[3])
print("-")
print(s2[3])
print("--")
print(abstracts[3])
#The first summary goes too much into detail and loses the big picture. The second does as well.

 This text discusses the comparison of various numerical integrators for computing molecular dynamics steps in the Schwinger model using the Hybrid Monte Carlo (HMC) algorithm. The authors consider a 122x122 lattice with coupling constant xmath123 and mass xmath124, taking parameters from [xcite] to simulate near the scaling limit with light fermions and increase the impact of the fermion part of the action. They use one thermalized gauge configuration and generate independent sets of momenta for each integrator and step size. The absolute error and statistical error are computed for each integrator and value of the step size. The micro step size is chosen to be 1/10 times smaller than the macro step size.  Figure [fig:1] compares the numerical integrators in terms of their absolute error versus the step size, showing that multi-rate schemes, RKF45 and RKF85, outperform their standard versions as expected. The adaptive nested force-gradient method has the best accuracy, while figure [f

In [14]:
print(s1[4])
print("-")
print(s2[4])
print("--")
print(abstracts[4])
#Both good summaries. Second focuses more on the methods used.

 This paper discusses methods for deriving functional equations for Feynman integrals, specifically focusing on vacuum type integrals and integrals with external momenta. The authors introduce new methods that are simple and do not rely on integration by parts techniques. They also demonstrate that integrals with many kinematic arguments can be reduced to a combination of simpler integrals with fewer arguments.  The paper presents two methods for deriving functional equations: one based on recurrence relations, and the other using algebraic relations between products of propagators. The authors note that it is not yet clear whether functional equations derived from recurrence relations can be reproduced by the methods of algebraic relations between products of propagators.  The paper includes specific examples of functional equations for vacuum type integrals and integrals with external momenta, and discusses their implications for one-loop integrals with four, five, and six external l

In [15]:
print(s1[5])
print("-")
print(s2[5])
print("--")
print(abstracts[5])
#Both summaries give more information than the abstract. Summary 1 also lists references, which does not belong in a summary.
# Both report results or numbers for the methods, but neither states which of the methods is better, as the abstract does.
# Summary 2 is closer, as summary 1 only mentions the result in numbers without a conclusion.

 This text appears to be discussing the use of the Hough transform in the search for gravitational waves, specifically in the context of periodic sources. The authors compare the efficiency and effectiveness of using a frequency Hough transform versus a sky Hough transform in terms of amplitude loss and computing cost. They find that the ratio of amplitude efficiencies is 1.317, which leads to a gain in computing cost for the same sensitivity. They also note the importance of adaptivity in the procedure and its applicability to situations where the source position is known but only the frequency needs to be estimated. The authors are also working on studying the efficiency of this method in terms of rejecting spurious lines in the peakmap.  The text includes several references to previous works on the Hough transform search for gravitational waves, including papers by Sintes et al., Astone et al., and Palomba et al. These papers discuss various aspects of the Hough transform search, su

In [16]:
print(s1[6])
print("-")
print(s2[6])
print("--")
print(abstracts[6])
#Identifies correctly that the paper summarizes works


 This text is a scientific review discussing the progress made in identifying the progenitors of certain types of core-collapse supernovae (CCSNe) using pre-explosion images. The author discusses five different types of CCSNe and provides an update on the current understanding of their progenitor stars based on observational data.  Type II-Plateau supernovae (SNe II-P) are the best-studied category, with eight putative progenitor detections and 12 upper luminosity limits established. The evidence suggests that red supergiants (RSGs) are the immediate progenitors of SNe II-P. However, an intriguing result is that all but one of the detected SNe II-P have initial masses constrained to be below 30 solar masses, which is surprising since RSGs up to 32 solar masses are observed in the local group. This lack of massive RSG progenitors leads researchers to speculate that these massive RSG progenitors may be forming black holes heralded by faint or non-existent SN explosions.  Type II-Linear s

In [17]:
print(s1[7])
print("-")
print(s2[7])
print("--")
print(abstracts[7])
#Both interpret the paper as a list of references. The first one is better, as it still gives some of the paper's content as keywords, while the second focuses only on the references.
#

 This text appears to be a list of scientific papers and talks related to spin physics, specifically in the context of various collaborations such as COMPASS, RHIC, GSI-PAX, and others. The papers cover topics such as single-spin asymmetries, Drell-Yan measurements, and transverse momentum dependence. Some of the authors mentioned include Avakian, Alexakhin, Belitsky, Boer, Bacchetta, Efremov, Qiu, and Vogelsang. The papers are published in various journals such as Physics Letters B, Nuclear Physics, and Physical Review D. Some of the talks were presented at workshops and symposiums held in places like Brookhaven National Laboratory, Kyoto, Japan, and Tsukuba, Ibaraki, Japan. The papers span the years from 1982 to 2006, with some works still in preparation.
-
 This text appears to be a list of references for a research paper or report, likely in the field of high energy physics or nuclear physics. The references are cited alphabetically by author last name, with each reference given a 

In [18]:
print(s1[8])
print("-")
print(s2[8])
print("--")
print(abstracts[8])
#Both identify the important points; the second also mentions the acknowledgements, which are not relevant in a summary.
# Both describe how the theorem was proven rather than what it states, which would be more interesting.

 This text appears to be a proof of a theorem in statistics, specifically about the Kingman coalescent in the context of the exchangeable coalescent process. The proof uses the rate function and the large deviation principle. The theorem states that the number of particles in the exchangeable coalescent process converges almost surely to a certain random variable as the number of particles goes to infinity. The text also mentions Lemma [l : wn] which is used in the proof but its statement is not provided.  The proof starts by defining some notation and setting up the problem. It then uses the fact that the minimum of a function is attained at its global minimum to prove a certain assertion. The rate function is shown to be of a specific form using integration by parts and the distribution function of the one-dimensional standard Gaussian distribution.  The text then shows that the expected value of a certain term is positive, which is necessary for the proof. This is done by analyzing 

In [19]:
print(s1[9])
print("-")
print(s2[9])
print("--")
print(abstracts[9])
#similar to abstract, same keywords

 This text describes the study of multi-component solitary waves in quasiperiodic quasi-phase matching (QPM) gratings using numerical simulations. The grating function varies according to the Fibonacci sequence, and its Fourier spectrum is composed of sums and differences of the basic wavenumbers, which fill the whole Fourier space densely due to their incommensurability.  The authors analyze the propagation and second harmonic generation (SHG) in the quasiperiodic QPM grating by going beyond the averaged equations to consider the rapid large-amplitude variations of the envelope functions. They find that for weak input beams, both beams eventually diffract, but when the amplitude of the input beam exceeds a certain threshold, self-focusing and localization are observed for both harmonics. The resulting two-component soliton is quasiperiodic by itself and oscillates in phase with the QPM grating modulation.  The authors also investigate the transition between the linear (diffraction) an

In [24]:
# Print the length of the first 10 summaries from both prompts next to the length of the matching abstract.
# NOTE: split(' ') cuts on single spaces only, so runs of consecutive spaces
# produce empty strings that are counted as well.
for i in range(10):
    print("len summary 1",len(s1[i].split(' ')))
    print("len summary 2",len(s2[i].split(' ')))
    print("compare it to the len of abstract",len(abstracts[i].split(' ')))
# The generated summaries are usually longer than the abstracts.

len summary 1 177
len summary 2 247
compare it to the len of abstract 222
len summary 1 274
len summary 2 214
compare it to the len of abstract 104
len summary 1 216
len summary 2 151
compare it to the len of abstract 95
len summary 1 393
len summary 2 428
compare it to the len of abstract 89
len summary 1 206
len summary 2 178
compare it to the len of abstract 155
len summary 1 206
len summary 2 263
compare it to the len of abstract 193
len summary 1 421
len summary 2 278
compare it to the len of abstract 249
len summary 1 116
len summary 2 178
compare it to the len of abstract 94
len summary 1 234
len summary 2 153
compare it to the len of abstract 226
len summary 1 367
len summary 2 260
compare it to the len of abstract 89


In [38]:
# Mean number of space-separated pieces per text, computed once per collection
# and then reported with the same wording as before.
mean_len_first_prompt = np.mean([len(text.split(' ')) for text in s1])
mean_len_second_prompt = np.mean([len(text.split(' ')) for text in s2])
mean_len_abstracts = np.mean([len(text.split(' ')) for text in abstracts])

print('The summaries generated with the first prompt have in average '+str(mean_len_first_prompt)+' words')
print('The summaries generated with the second prompt have in average '+str(mean_len_second_prompt)+' words')
print('The abstracts have in average '+str(mean_len_abstracts)+' words')

The summaries generated with the first prompt have in average 257.42 words
The summaries generated with the second prompt have in average 271.31 words
The abstracts have in average 172.18 words


In [25]:
summaries_list[0]

' The text appears to be discussing the results of an analysis of sunspot data, specifically looking for periodicities in sunspot area fluctuations during the maximum activity period of solar cycle 16. The authors used wavelet analysis and autocorrelation functions to investigate the existence of a periodicity around 155 days. They found that the dispersion of points related to this periodicity was large, making it difficult to confirm its existence. They also compared the autocorrelation functions and periodograms of sunspot area fluctuations from each solar hemisphere separately during the maximum activity period. The results suggested that there might be a statistically significant positive peak in the interval of 155-165 days for the southern hemisphere, but the resolution of the periodogram was not sufficient to make a definitive conclusion. The authors also noted that power spectrum analysis alone may not be sufficient to detect true periodicities and suggested using both correla

### Self-evaluation

In [5]:
# Ollama-served "mistral" model used as the judge by the scorer_llm functions below.
# temperature=0 is presumably meant to keep the scoring as repeatable as possible - confirm.
llm = Ollama(model = "mistral",temperature=0)


In [6]:
# A score in [0, 1] as it appears directly after an explicit "score:" label.
_LABELLED_SCORE = r'(0(?:\.\d+)?|1(?:\.0)?)'
# The same range for looser contexts. The decimal forms are tried BEFORE the
# bare "0": regex alternation is ordered, so with "0" first a reply such as
# "score 0.85" would be captured as "0".
_LOOSE_SCORE = r'(0?\.\d+|1\.0|0)'

# Tried in order, most specific first; the first pattern that matches wins.
_SCORE_PATTERNS = [
    r'overall score\s*:\s*' + _LABELLED_SCORE,  # 'Overall Score: 0.85'
    r'overallscore\s*:\s*' + _LABELLED_SCORE,   # 'OverallScore: 0.85'
    r'score of ' + _LOOSE_SCORE + r' overall',  # 'score of 0.85 overall'
    r'score\s*:\s*' + _LABELLED_SCORE,          # 'Score: 0.95'
    r'score of ' + _LOOSE_SCORE + r' ',         # 'score of 0.95 '
    r'score[:\s]*' + _LOOSE_SCORE,              # 'score 0.95'
    # Last resort: any stand-alone number in [0, 1]. The look-behind also
    # rejects a preceding '.', so the trailing 0 of e.g. '3.0' is not a score.
    r'(?<![\d.])' + _LOOSE_SCORE + r'(?!\d)',
]


def _extract_score(answer):
    """Return the first score in [0, 1] found in an LLM reply.

    The patterns in _SCORE_PATTERNS are searched case-insensitively in order.

    Returns:
        The matched number as a string (e.g. '0.925'), or None when no
        pattern matches.
    """
    for pattern in _SCORE_PATTERNS:
        match = re.search(pattern, answer, re.IGNORECASE)
        if match:
            return match.group(1)
    return None


def scorer_llm1(summary,abstract):
    """Ask the LLM to grade a candidate summary against an ideal summary.

    This prompt asks for scores on accuracy, completeness, coherence and
    relevance, combined into an "Overall Score" at the end of the reply.

    Args:
        summary: the candidate summary text.
        abstract: the ideal (reference) summary text.

    Returns:
        A tuple (score, answer): score is the extracted overall score as a
        string, or None if no score could be found; answer is the raw reply
        returned by llm.invoke().
    """
    instruction = r""" I have an ideal summary and a candidate summary of a text. Please score the candidate summary on a scale of 0 to 1 based on accuracy, completeness, coherence, and relevance. Note if it is not a summary or unrelated the score should be low, still give a score. Be strict.
                    At the end combine the scores to a Overall Score. 
                    
                    Ideal Summary:
                    """+abstract+"""        

                    Candidate Summary:
                    """+summary+"""

                    Score:
                    """

    answer = llm.invoke(instruction)
    return _extract_score(answer), answer

def scorer_llm2(summary,abstract):
    """Ask the LLM to grade a candidate summary, requesting only the overall score.

    Same criteria as scorer_llm1, but the prompt additionally tells the model
    to "Only give the overall score."

    Args:
        summary: the candidate summary text.
        abstract: the ideal (reference) summary text.

    Returns:
        A tuple (score, answer): score is the extracted overall score as a
        string, or None if no score could be found; answer is the raw reply
        returned by llm.invoke().
    """
    instruction = r""" I have an ideal summary and a candidate summary of a text. Please score the candidate summary on a scale of 0 to 1 based on accuracy, completeness, coherence, and relevance. Note if it is not a summary or unrelated the score should be low, still give a score. Be strict.
                    At the end combine the scores to a Overall Score. Only give the overall score.
                    
                    Ideal Summary:
                    """+abstract+"""        

                    Candidate Summary:
                    """+summary+"""

                    Score:
                    """
    answer = llm.invoke(instruction)
    return _extract_score(answer), answer

In [144]:
# Grade the first summary against the first abstract with prompt 1, then show
# the raw LLM reply, a divider line, and the extracted score.
score, ex = scorer_llm1(summaries_list[0], abstracts[0])
print(ex, '---------', score, sep='\n')

2985
629
 Accuracy: 0.95 (The candidate summary accurately captures the main points of the ideal summary, with some minor differences in detail and phrasing.)

Completeness: 0.85 (The candidate summary covers most of the key points from the ideal summary but may be missing some minor details or nuances.)

Coherence: 1.0 (The candidate summary flows logically and makes sense as a whole, with clear connections between ideas.)

Relevance: 1.0 (The candidate summary accurately reflects the content of the original text and remains focused on the topic of sunspot data analysis.)

Overall Score: 0.925 (An average of the four scores above)
---------
0.925


In [7]:
# Grade every loaded summary against the abstract at the same index with
# prompt 1, then save the per-summary scores followed by their mean, std,
# min and max (in that order) to results/<filename>_selfeval1.txt.
llm_scores = []
for i, summary in enumerate(summaries_list):
    score, _ = scorer_llm1(summary, abstracts[i])
    # The scorer returns None when no score could be parsed from the reply;
    # record NaN instead of letting float(None) abort the whole run.
    llm_scores.append(float(score) if score is not None else np.nan)
    if i%10==0:
        print(i)
# NaN-aware statistics: identical to mean/std/min/max when every reply parsed.
m = np.nanmean(llm_scores)
s = np.nanstd(llm_scores)
# Named min_score/max_score so the builtins min() and max() are not rebound
# for the rest of the notebook.
min_score = np.nanmin(llm_scores)
max_score = np.nanmax(llm_scores)
llm_scores.extend([m, s, min_score, max_score])
np.savetxt('results/'+filename+'_selfeval1.txt',np.array(llm_scores),fmt='%.4f')

0
10
20
30
40
50
60
70
80
90


In [141]:
# Grade every loaded summary against the abstract at the same index with
# prompt 2, then save the per-summary scores followed by their mean, std,
# min and max (in that order) to results/<filename>_selfeval2.txt.
llm_scores = []
for i, summary in enumerate(summaries_list):
    score, _ = scorer_llm2(summary, abstracts[i])
    # The scorer returns None when no score could be parsed from the reply;
    # record NaN instead of letting float(None) abort the whole run.
    llm_scores.append(float(score) if score is not None else np.nan)
    if i%10==0:
        print(i)
# NaN-aware statistics: identical to mean/std/min/max when every reply parsed.
m = np.nanmean(llm_scores)
s = np.nanstd(llm_scores)
# Named min_score/max_score so the builtins min() and max() are not rebound
# for the rest of the notebook.
min_score = np.nanmin(llm_scores)
max_score = np.nanmax(llm_scores)
llm_scores.extend([m, s, min_score, max_score])
np.savetxt('results/'+filename+'_selfeval2.txt',np.array(llm_scores),fmt='%.4f')

0
10
20
30
40
50
60
70
80
90


### Mini-example: self-evaluation

In [81]:
# Mini-example (prompt 2): a candidate with the last sentence removed.
# Presumably this is abstracts[0] without its final "however , ..." sentence — TODO confirm.
similar_summary = """the short - term periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are discussed . for these data 
 the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . 
 a new method of the diagnosis of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . 
 it proves against the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity period of the solar cycle 16 in the northern hemisphere . 
"""
# S is the (score, raw LLM answer) tuple returned by scorer_llm2.
S = scorer_llm2(similar_summary, abstracts[0])
print("Selfevaluation score: ", S)

Selfevaluation score:  ('1.0', ' Overall Score: 1.0. The candidate summary is identical to the ideal summary, therefore it receives a perfect score for accuracy, completeness, coherence, and relevance.')


In [84]:
# Mini-example (prompt 2): a candidate that is not a summary at all.
# The prompt asks for a low score in this case.
similar_summary = """This is about."""
# S is the (score, raw LLM answer) tuple returned by scorer_llm2.
S = scorer_llm2(similar_summary, abstracts[0])
print("Selfevaluation score: ", S)

Selfevaluation score:  ('0', ' Overall Score: 0.0. The candidate summary does not provide any meaningful information and appears to be unrelated to the ideal summary.')


In [85]:
""" Replace synonyms:
short - term -> short-duration 
discussed -> examined
diagnosis -> evaluation
proves against -> disproves
"""
# Mini-example (prompt 2), variant A: the four words/phrases listed in the note
# above are replaced by synonyms; the rest of the text is kept.
similar_summary = """ the short-duration periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are examined. for these data 
 the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . 
 a new method of the evaluation of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . 
 it disproves the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity period of the solar cycle 16 in the northern hemisphere . 
 however , a similar analysis for data from the southern hemisphere indicates that there is the periodicity of about @xmath0 days in sunspot area data in the maximum activity period of the cycle 16 only . 
"""
# S is the (score, raw LLM answer) tuple returned by scorer_llm2.
S = scorer_llm2(similar_summary, abstracts[0])
print("Selfevaluation score: ", S)


# Variant B: the same words/phrases are deleted outright instead of being
# replaced, leaving ungrammatical sentences (e.g. "a new method of the of an echo").
similar_summary = """ the short periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are. for these data 
 the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . 
 a new method of the of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . 
 it the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity period of the solar cycle 16 in the northern hemisphere . 
 however , a similar analysis for data from the southern hemisphere indicates that there is the periodicity of about @xmath0 days in sunspot area data in the maximum activity period of the cycle 16 only . 
"""
# Overwrites S from variant A; each result is printed before being replaced.
S = scorer_llm2(similar_summary, abstracts[0])
print("Selfevaluation score: ", S)


Selfevaluation score:  ('1.0', ' Overall Score: 1.0\n\nThe candidate summary accurately and completely captures the main points of the ideal summary, including the negative correlation and statistically significant peak for the periodicity of about @xmath0 days, the proposal of a new method to evaluate an echo effect in spectrum, the lack of differences in autocorrelation functions for daily sunspot area fluctuations and one rotation time interval in the northern hemisphere, and the indication of a periodicity of about @xmath0 days in sunspot area data from the southern hemisphere during the maximum activity period of solar cycle 16. The language used in the candidate summary is also coherent and relevant to the original text.')
Selfevaluation score:  ('1.0', ' Overall Score: 1.0\n\nThe candidate summary is almost identical to the ideal summary, with minor typographical errors and a slight rearrangement of some sentences. The accuracy, completeness, coherence, and relevance are high as

In [87]:
# Prompt 1: the following cells repeat the mini-examples using scorer_llm1

In [120]:
# Mini-example (prompt 1): a candidate with the last sentence removed.
# Presumably this is abstracts[0] without its final "however , ..." sentence — TODO confirm.
similar_summary = """the short - term periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are discussed . for these data 
 the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . 
 a new method of the diagnosis of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . 
 it proves against the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity period of the solar cycle 16 in the northern hemisphere . 
"""
# S is the (score, raw LLM answer) tuple returned by scorer_llm1.
S = scorer_llm1(similar_summary, abstracts[0])
print("Selfevaluation score: ", S)

Selfevaluation score:  ('1.0', ' Based on the given ideal summary and candidate summary, I would score the candidate summary as follows:\n\nAccuracy: 1.0 (The candidate summary matches the ideal summary almost exactly.)\nCompleteness: 1.0 (The candidate summary includes all the essential points from the ideal summary.)\nCoherence: 1.0 (The candidate summary is logically consistent and flows well.)\nRelevance: 1.0 (The candidate summary focuses on the main topic of the text, which is the analysis of sunspot area data.)\n\nOverall Score: 1.0 (The candidate summary is an accurate, complete, coherent, and relevant summary of the original text.)')


In [121]:
# Mini-example (prompt 1): a candidate that is not a summary at all.
# The prompt asks for a low score in this case.
similar_summary = """This is about."""
# S is the (score, raw LLM answer) tuple returned by scorer_llm1.
S = scorer_llm1(similar_summary, abstracts[0])
print("Selfevaluation score: ", S)

Selfevaluation score:  ('0', ' Accuracy: 0, as the candidate summary does not accurately represent the content of the ideal summary.\nCoherence: 0, as there is no logical flow or connection in the candidate summary.\nCompleteness: 0, as the candidate summary does not include any significant details from the ideal summary.\nRelevance: 0, as the candidate summary does not relate to the content of the ideal summary.\n\nOverall Score: 0.')


In [122]:
""" Replace synonyms:
short - term -> short-duration 
discussed -> examined
diagnosis -> evaluation
proves against -> disproves
"""
# Mini-example (prompt 1), variant A: the four words/phrases listed in the note
# above are replaced by synonyms; the rest of the text is kept.
similar_summary = """ the short-duration periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are examined. for these data 
 the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . 
 a new method of the evaluation of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . 
 it disproves the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity period of the solar cycle 16 in the northern hemisphere . 
 however , a similar analysis for data from the southern hemisphere indicates that there is the periodicity of about @xmath0 days in sunspot area data in the maximum activity period of the cycle 16 only . 
"""
# S is the (score, raw LLM answer) tuple returned by scorer_llm1.
S = scorer_llm1(similar_summary, abstracts[0])
print("Selfevaluation score: ", S)


# Variant B: the same words/phrases are deleted outright instead of being
# replaced, leaving ungrammatical sentences (e.g. "a new method of the of an echo").
similar_summary = """ the short periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are. for these data 
 the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . 
 a new method of the of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . 
 it the thesis of the existence of strong positive fluctuations of the about @xmath0-day interval in the maximum activity period of the solar cycle 16 in the northern hemisphere . 
 however , a similar analysis for data from the southern hemisphere indicates that there is the periodicity of about @xmath0 days in sunspot area data in the maximum activity period of the cycle 16 only . 
"""
# Overwrites S from variant A; each result is printed before being replaced.
S = scorer_llm1(similar_summary, abstracts[0])
print("Selfevaluation score: ", S)

Selfevaluation score:  ('1.0', ' Accuracy: 1 (The candidate summary correctly reports the main findings and conclusions of the ideal summary)\nCompleteness: 1 (The candidate summary covers all the essential points discussed in the ideal summary)\nCoherence: 1 (The candidate summary is logically consistent and flows smoothly)\nRelevance: 1 (The candidate summary accurately reflects the content of the original text)\n\nOverall Score: 1.0 (The candidate summary is an accurate and complete summary of the original text, with high levels of coherence and relevance.)')
Selfevaluation score:  ('1.0', ' Based on the given ideal summary and candidate summary, I would score the candidate summary as follows:\n\nAccuracy: 1.0 (The candidate summary closely matches the ideal summary in terms of facts and information presented.)\n\nCompleteness: 1.0 (The candidate summary covers all the essential points discussed in the ideal summary.)\n\nCoherence: 1.0 (The candidate summary flows logically and is e