In [2]:
import pandas as pd

from scripts.style_generation import get_style_genre
from scripts.first_n_words import get_first_n_words
from scripts.llm import ask_LLM
from scripts.kg_content import extract_kg_content
from scripts.minhash_vector import create_minhash_vector
from scripts.reconstruction_content import extract_reconstruction_content
from scripts.evaluate import evaluate_peformance
import scripts.prompts
import scripts.api_key

In [4]:
# Credentials come from the project-local `scripts.api_key` module.
API_KEY = scripts.api_key.API_KEY

# Read the ML arXiv papers table from the local CSV copy.
dataset = pd.read_csv("dataset/ML-Arxiv-Papers.csv")

# Only the abstract column is processed downstream. The variable name is a
# leftover from an earlier variant that joined "<title> <abstract>" per row.
concatenated_texts = dataset["abstract"]

# Upper bound on the accumulated input length (in characters, via len());
# the processing loop stops once the running input string exceeds it.
stop_len = 5_000

# Per-sample accumulators filled by the processing loop, one entry per text:
#   all_kg_results             -> list of knowledge-graph entries
#   all_reconstruction_results -> reconstructed text
#   input_string_so_far_list   -> input text accumulated by the loop
all_kg_results, all_reconstruction_results, input_string_so_far_list = [], [], []


In [6]:
# --- Settings for the knowledge-graph / reconstruction pipeline ---------------
N_SAMPLES = 5               # how many texts are taken from the head of the dataset
STYLE_SAMPLE_WORDS = 1000   # size argument passed to get_first_n_words for style analysis
KG_CONTEXT_SEGMENTS = 50    # number of most recent graph entries fed back as context
LLM_ATTEMPTS = 2            # tries per prompt before falling back to the raw reply
LLM_MODEL = 'NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO'
# Sent verbatim to the model; kept byte-for-byte so results stay comparable.
LLM_SYSTEM_PROMPT = "You are a very smart very intelligence assistant who is very helpful."


def _ask_until_parsed(prompt, extract, max_tokens, attempts=LLM_ATTEMPTS):
    """Query the LLM until `extract` can parse the reply.

    Parameters:
        prompt: user prompt passed to `ask_LLM`.
        extract: callable applied to the raw reply; it signals "could not
            parse" by returning None.
        max_tokens: `max_tokens` value forwarded to `ask_LLM`.
        attempts: maximum number of LLM calls.

    Returns:
        The parsed content of the first reply `extract` accepts. If no reply
        could be parsed, the raw reply of the last attempt is returned so the
        caller still has text to work with instead of None.
    """
    reply = None
    for _ in range(attempts):
        reply = ask_LLM(LLM_MODEL, LLM_SYSTEM_PROMPT, prompt, API_KEY,
                        temperature=0.5, top_p=0.95, max_tokens=max_tokens,
                        frequency_penalty=1.1, presence_penalty=1.1)
        parsed = extract(reply)
        if parsed is not None:
            return parsed
    return reply


# NOTE: the result lists are created in an earlier cell; re-running only this
# cell appends the same samples a second time.
for input_text in concatenated_texts[:N_SAMPLES]:

    writing_style = get_style_genre(get_first_n_words(input_text, STYLE_SAMPLE_WORDS))
    style_entry = "<style_analysis>" + writing_style + "</style_analysis>"

    # Each text is currently processed as one single segment (no sentence splitting).
    sentences = [input_text]

    # The graph for one text starts with its style analysis; segments follow.
    current_kg = [style_entry]
    print(style_entry)

    segment_nr = 1
    reconstruction_so_far = ""
    input_string_so_far = ""
    for sentence in sentences:
        input_string_so_far += sentence
        # NOTE(review): the sentence is added before the length check, so an
        # over-long input is recorded in full although no segment is produced
        # for it. Kept as in the original; confirm this is intended.
        if len(input_string_so_far) > stop_len:
            break
        print("INPUT:", sentence)
        print("-----")

        # A negative slice already copes with lists shorter than the window.
        current_kg_context = ' '.join(current_kg[-KG_CONTEXT_SEGMENTS:])
        text = scripts.prompts.KG_format_example_prompt(current_kg_context, sentence)

        # Parsed graph content, or the raw reply if parsing failed on every attempt.
        knowledge_graph_segment = _ask_until_parsed(text, extract_kg_content, max_tokens=1000)

        # Build the segment once and reuse it for both storage and logging.
        segment_entry = ("<segment " + str(segment_nr) + ">\n"
                         + knowledge_graph_segment
                         + "<source_sentence_min_hash: " + str(create_minhash_vector(sentence)) + " >\n"
                         + "</segment " + str(segment_nr) + ">\n")
        current_kg.append(segment_entry)
        print(segment_entry)

        prompt = scripts.prompts.KG_reconstruction_prompt(reconstruction_so_far, current_kg)

        # Same fallback as for the graph: an unparseable reply is used as-is
        # rather than aborting the whole run with `str += None`.
        next_reconstruction = _ask_until_parsed(prompt, extract_reconstruction_content, max_tokens=4000)
        reconstruction_so_far += next_reconstruction
        print(next_reconstruction)
        segment_nr += 1

    all_kg_results.append(current_kg)
    all_reconstruction_results.append(reconstruction_so_far)
    input_string_so_far_list.append(input_string_so_far)

<style_analysis>The given text is situated within the genre of academic writing, specifically in the realm of statistical learning. The format is that of a scholarly article or research paper, with a precise and technical language that is characteristic of such works.
The writing style is formal, academic, and discipline-specific, utilizing complex syntactic structures and rich terminology that may be unfamiliar to readers outside the field. It maintains clarity and simplicity in its exposition, despite the complexity of the subject matter.
In terms of rhythm and flow, the text unfolds through leisurely, intricate phrasing, which is appropriate for the genre and content. The pacing allows for the careful explanation of technical concepts and their interrelationships.
The dominant tone of the text is impartial and authoritative, reflecting the objective nature of the subject matter. The authorial voice is distant, as is typical in academic writing, providing a comprehensive and detailed

In [7]:
# Gather the per-sample results of the processing loop into one table,
# one row per processed text.
result_columns = {
    'Input_Texts': input_string_so_far_list,
    'Output_Graphs': all_kg_results,
    'Output_Reconstructions': all_reconstruction_results,
}
df = pd.DataFrame(result_columns)

print("500 word sample evalution:", "\n")

# evaluate_peformance yields four scores, unpacked in this order: no context,
# original text, knowledge graph, reconstructed text.
# NOTE(review): the meaning of the arguments 5 and "q_a_kg.parquet" is defined
# in scripts.evaluate; 5 presumably matches the number of processed samples.
scores = evaluate_peformance(df, 5, "q_a_kg.parquet")
base_cap_500, original_cap_500, knowledgegraph_cap_500, reconstruction_cap_500 = scores

# Report every score on its own line, followed by a blank line.
report_lines = (
    ("No context correct answer percentage:", base_cap_500),
    ("Original context correct answer percentage:", original_cap_500),
    ("Knowledgegraph context correct answer percentage:", knowledgegraph_cap_500),
    ("Reconstruckted text context correct answer percentage:", reconstruction_cap_500),
)
for label, value in report_lines:
    print(label, value, "\n")



500 word sample evalution: 

questions, correct_answers  [] []
questions, correct_answers  ['A) Random link failures\nB) Non-random link failures\nC) Perfect link communication\nD) Delayed link communication', 'What is the main factor that determines the probability of error or communication failure in a link?\nA) Signal-to-noise ratio (SNR)\nB) Network topology\nC) Communication budget constraint\nD) Algebraic connectivity', 'Which of the following is NOT a preliminary issue addressed in the text?\nA) Modeling the network as a random topology\nB) Establishing necessary and sufficient conditions for mean square sense (mss) and almost sure (a.s.) convergence of average consensus when network links fail\nC) Showing that a necessary and sufficient condition for both mss and a.s. convergence is for the algebraic connectivity of the mean graph describing the network topology to be strictly positive\nD) Proposing a topology design algorithm that guarantees perfect link communication', 'Which