In [11]:
pip install datasketch

Note: you may need to restart the kernel to use updated packages.


In [12]:
import pandas as pd
from scripts.style_generation import get_style_genre
from scripts.first_n_words import get_first_n_words
from scripts.llm import ask_LLM
from scripts.kg_content import extract_kg_content
from scripts.minhash_vector import create_minhash_vector
from scripts.reconstruction_content import extract_reconstruction_content
from scripts.evaluate import evaluate_peformance
import scripts.prompts
import scripts.api_key


In [13]:
# Load the dataset from Hugging Face
dataset = pd.read_csv("data/ML-Arxiv-Papers.csv")
rows, columns = dataset.shape
# Extract the 'train' split
#train_dataset = dataset["train"]

# Create lists for titles and abstracts
# titles = [entry['title'] for entry in train_dataset]
# abstracts = [entry['abstract'] for entry in train_dataset]

# Create a list with concatenated title and abstract for each sample
concatenated_texts = dataset['abstract'] #[f"{title} {abstract}" for title, abstract in zip(titles, abstracts)]

API_KEY = "bc8ec9e53b89739f63f5e35cc60cfa152f25b24455dbb77e8cb7cf4860bbdfa6"


stop_len = 50000 #5000

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
#"meta-llama/Llama-3-70b-chat-hf"
#"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"
system_prompt = "You are a very smart very intelligence assistant who is very helpful."

all_kg_results = []
all_reconstruction_results = []
input_string_so_far_list = []



test = ask_LLM(model_name, "You are a very smart very intelligence assistant who is very helpful.", "Tell a joke aout knowledge graphs." , API_KEY, temperature=0.1, top_p=0.95, max_tokens=100,frequency_penalty=1.1, presence_penalty=1.1)

print(test)

Here's one:

Why did the knowledge graph go to therapy?

Because it was struggling to connect the dots!

(get it? connecting the dots is a metaphor for building relationships between entities in a knowledge graph... ahh, I hope that one was enlightening!)


In [14]:
rows

117592

In [28]:
def split_long_text(text, min_words=200, max_words=1000):
    """
    Splits a long text into segments of full sentences.
    
    Args:
    text (str): The input text to be split
    min_words (int): The minimum number of words per segment (default 200)
    max_words (int): The maximum number of words per segment (default 1000)
    
    Returns:
    list: A list of text segments
    """
    # Download necessary NLTK data (run once)
    nltk.download('punkt', quiet=True)
    
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    segments = []
    current_segment = []
    current_word_count = 0
    
    for i, sentence in enumerate(sentences):
        sentence_words = word_tokenize(sentence)
        sentence_word_count = len(sentence_words)
        
        # If a single sentence exceeds max_words, split it
        if sentence_word_count > max_words:
            split_sentence = split_long_sentence(sentence, max_words)
            for sub_sentence in split_sentence:
                sub_word_count = len(word_tokenize(sub_sentence))
                
                if current_word_count + sub_word_count > max_words and current_word_count >= min_words:
                    segments.append(' '.join(current_segment))
                    current_segment = []
                    current_word_count = 0
                
                current_segment.append(sub_sentence)
                current_word_count += sub_word_count
        else:
            if current_word_count + sentence_word_count > max_words and current_word_count >= min_words:
                segments.append(' '.join(current_segment))
                current_segment = []
                current_word_count = 0
            
            current_segment.append(sentence)
            current_word_count += sentence_word_count
        
        # Check if there is a next sentence before accessing it
        if i < len(sentences) - 1:
            next_sentence_word_count = len(word_tokenize(sentences[i + 1]))
            if current_word_count >= min_words and next_sentence_word_count + current_word_count > max_words:
                segments.append(' '.join(current_segment))
                current_segment = []
                current_word_count = 0
    
    # Add any remaining sentences as the last segment
    if current_segment:
        segments.append(' '.join(current_segment))
    
    return segments


In [16]:


def concatenate_to_min_length(strings):
    result = []
    current_string = ""

    for s in strings:
        # If adding the current string doesn't make it 500 characters or more, keep adding
        if len(current_string) + len(s) < 500:
            current_string += s
        else:
            # If the current string is long enough, add it to the result and start a new string
            if current_string:
                result.append(current_string)
            current_string = s

        # If the current string is 500 characters or more, add it to the result and reset
        if len(current_string) >= 500:
            result.append(current_string)
            current_string = ""

    # Add the last string if it exists and isn't empty
    if current_string:
        result.append(current_string)

    return result

# Function to construct knowledge graphs and reconstruct text
def KG_construction_and_reconstruction(input_text, model, preprocessing_type="list_of_facts"):
    """
    This function takes an input text and a model, and performs the following steps:
    1. Determines the writing style of the input text
    2. Constructs a knowledge graph from the input text
    3. Reconstructs the text based on the knowledge graph
    
    Args:
    input_text (str): The input text to process
    model (str): The name of the language model to use

    Returns:
    tuple: Contains the input text, constructed knowledge graph, and reconstructed text
    """
    # Determine the writing style of the input text
    writing_style = get_style_genre(model_name, API_KEY, system_prompt ,get_first_n_words(  input_text, len(input_text) ) )

    if preprocessing_type=="paraphrasation":
      paraphrasation_list=[]
      chunks= split_long_text(input_text)
      for chunk in chunks:
        paraphrasation = ask_LLM(model_name, system_prompt, "Paraphrase the following text so that the resulting text consists entirely of short sentences of with less than 15 words per sentence and with simple grammar, while still using all important facts, contents, names, pharses, nouns and adjectives. BE VERY FACTUAL WHEN DOING THIS. The paraphrased text should still contain the same information (facts, nouns, verbs, adjectives, meaning, dates, numbers, names, places, variables, ...) as the INPUT TEXT, but with clean & easily understandable grammer. If there are broken tables, broken citations, gibberish or spam in the INPUT TEXT, just leave them out. Each sentence in PARAPHRASATION should end with either a dot (.) , a question mark (?) or an exklamation mark (!) and consist only of sentences that are shorter than 15 word. Do only output the PARAPHRASATION, nothing else. Even if the sentences are short, they should be very intelligent and very factual. INPUT TEXT:" +chunk + "\nPARAPHRASATION: ", API_KEY, temperature=0.5, top_p=0.95, max_tokens=1000,
                                                    frequency_penalty=1.0, presence_penalty=1.0)
        paraphrasation_list.append(paraphrasation)
        print(paraphrasation)
        
      # Initialize variables for knowledge graph construction
      sentences = sent_tokenize(" ".join(paraphrasation_list) )
      sentence_chunks= concatenate_to_min_length(sentences)
    
      print(sentence_chunks)
      print(len(sentence_chunks))

    if preprocessing_type=="list_of_facts":
      lists_of_facts_list=[]
      chunks= split_long_text(input_text)
      for chunk in chunks:
        paraphrasation = ask_LLM(model_name, system_prompt, "Write for the following text a list of ALL facts, events, actions, thoughts, ideas, quotes, ... in as VERY FACTUAL MANNER, that presents them on the point, so that it would be an ideal learning script to prepare for a question - answer exam about this text, without unnecessary fluff. Keep preserving all important facts, contents, names, pharses, nouns and adjectives. BE VERY FACTUAL WHEN DOING THIS. The resulting text should still contain the same information (facts, nouns, verbs, adjectives, meaning, dates, numbers, names, places, variables, ...) as the INPUT TEXT, but with clean & easily understandable grammer. If there are broken tables, broken citations, gibberish or spam in the INPUT TEXT, just leave them out. Each sentence in FACTS should end with either a dot (.) , a question mark (?) or an exklamation mark (!) and consist only of sentences that are shorter than 15 word. Do only output the FACTS, nothing else. Even if the sentences are short, they should be very intelligent and very factual. INPUT TEXT:" +chunk + "\nFACTS: ", API_KEY, temperature=0.5, top_p=0.95, max_tokens=1000,
                                                    frequency_penalty=1.0, presence_penalty=1.0)
        lists_of_facts_list.append(paraphrasation)
        print(paraphrasation)
        
      # Initialize variables for knowledge graph construction
      sentences = sent_tokenize(" ".join(lists_of_facts_list) )
      sentence_chunks= concatenate_to_min_length(sentences)
    
      print(sentence_chunks)
      print(len(sentence_chunks))


    current_kg = []
    current_kg.append("<style_analysis>" + writing_style + "</style_analysis>")
    segment_nr = 1
    reconstruction_so_far = ""
    input_string_so_far = ""
    print("<style_analysis>"+ writing_style + "</style_analysis>")
    # Process each sentence in the input text
    for sentence in sentence_chunks:
        
        input_string_so_far += sentence
        if len(input_string_so_far) > stop_len:
            break
        
        # Get the context for the current knowledge graph segment
        current_kg_context = current_kg[-50:] if len(current_kg) > 50 else current_kg
        current_kg_context = ' '.join(current_kg_context)

        # Generate the prompt for knowledge graph construction
        text = scripts.prompts.KG_format_example_prompt(current_kg_context, sentence)
        #print("Prompt:", text)

        
        # Generate the knowledge graph segment
        try:
            for i in range(2):
                knowledge_graph_segment = ask_LLM(model_name, system_prompt, text, API_KEY, temperature=0.5, top_p=0.95, max_tokens=1000,
                                                    frequency_penalty=1.1, presence_penalty=1.1)
                print("Constructed KG:")
                if not (extract_kg_content(knowledge_graph_segment) == None):
                    break
            
            # Add the knowledge graph segment to the current knowledge graph
            try:
                current_kg.append("<segment " + str(segment_nr) + ">\n" + extract_kg_content(
                        knowledge_graph_segment) + "<source_sentence_min_hash: " + str(
                        create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
                print("<segment " + str(segment_nr) + ">\n" + extract_kg_content(
                        knowledge_graph_segment) + "<source_sentence_min_hash: " + str(
                        create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
            except:
                current_kg.append(
                        "<segment " + str(segment_nr) + ">\n" + knowledge_graph_segment + "<source_sentence_min_hash: " + str(
                            create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
                
                print("<segment " + str(segment_nr) + ">\n" + knowledge_graph_segment + "<source_sentence_min_hash: " + str(
                        create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")

            # Generate the prompt for text reconstruction
            prompt = scripts.prompts.KG_reconstruction_prompt(reconstruction_so_far, current_kg)
            
            # Reconstruct the text based on the knowledge graph
            for i in range(2):
                next_reconstruction = ask_LLM(model_name,
                                                "You are a very smart very intelligence assistant who is very helpful.",
                                                prompt, API_KEY, temperature=0.5, top_p=0.95, max_tokens=4000,
                                                frequency_penalty=1.1, presence_penalty=1.1)
                if not (extract_reconstruction_content(next_reconstruction) == None):
                    break

            reconstruction_so_far += extract_reconstruction_content(next_reconstruction)
            print("Recontructed Segment:", extract_reconstruction_content(next_reconstruction))
            segment_nr += 1

        except:
            print("No Kg found")
            
    return input_string_so_far, current_kg, reconstruction_so_far


In [17]:
def flatten_list_of_lists(list_of_lists):
    flattened_list = [item for sublist in list_of_lists for item in sublist]
    return ''.join(flattened_list)  


In [18]:
input_string_so_far_list = []
all_kg_results = []
all_reconstruction_results = []

min_segment_length = 500  # in words

max_segment_length = 1000

count = 0
for input_text in concatenated_texts[0:1]:
    print(count)
    
    count = count+1
    print(len(input_text))
    split_text = split_long_text(input_text, min_segment_length, max_segment_length)
    
    #print(split_text[1])
    input_split_texts = []
    kg_for_input_split_texts = []
    KGs_so_far=""
    reconstruction_results_for_input_split_texts = []
    print(len(split_text) )
    
    for each_split_text in split_text[0:len(split_text)]:
        
        #print("Input segment:" ,each_split_text)

        input_string_list, kg_results, reconstruction_results = KG_construction_and_reconstruction(each_split_text, model_name)
        input_split_texts.append(input_string_list)
        kg_for_input_split_texts.append(kg_results)
        KGs_so_far= "\n".join(kg_for_input_split_texts)
        reconstruction_results_for_input_split_texts.append(reconstruction_results)
        
    input_string_so_far_list.append(flatten_list_of_lists(input_split_texts))
    all_kg_results.append(flatten_list_of_lists(kg_for_input_split_texts))
    all_reconstruction_results.append(flatten_list_of_lists(reconstruction_results_for_input_split_texts))
    
    #for i in range(0,len(split_text),1):
     #   input_string_so_far_list, all_kg_results, all_reconstruction_results = KG_construction_and_reconstruction(split_text[i], model_name)
    
    
    
    
#     if len(input_text)>1000:
#         pass
#     else:
#         input_string_so_far_list, all_kg_results, all_reconstruction_results = KG_construction_and_reconstruction(input_text, model_name)
            
        


0
985
1
Here are the facts extracted from the input text:

The problem is to construct a predictor of a random variable $Y$ as a function of $X$.
The predictor is constructed from an i.i.d. training sample from the joint distribution of $(X,Y)$.
Allowable predictors are drawn from a specified class.
The goal is to approach asymptotically the performance of the best predictor in the class.
Perfect observation of the $X$-part of the sample is assumed.
The $Y$-part of the sample is communicated at a finite bit rate.
The encoding of the $Y$-values depends on the $X$-values.
Regular conditions on the admissible predictors are assumed.
Regular conditions on the underlying family of probability distributions are assumed.
Regular conditions on the loss function are assumed.
An information-theoretic characterization of achievable predictor performance is given.
The characterization is in terms of conditional distortion-rate functions.
The underlying family of probability distributions is a key 

TypeError: sequence item 0: expected str instance, list found

In [30]:
input_string_so_far_list = []
all_kg_results = []
all_reconstruction_results = []

# Reading the entire contents of bible.txt into a single string
with open("""C:/Users/dlisg/Documents/BUD-E/bible.txt""", 'r', encoding='utf-8') as file:
    bible_text = file.read()

# Now, bible_text contains the entire contents of the file as a single string
print(bible_text[:1000])  # Print the first 1000 characters to verify



min_segment_length = 500  # in words

max_segment_length = 1000


count = 0
for input_text in [bible_text[:10000]]:
    print(count)
    
    count = count+1
    print(len(input_text))
    split_text = split_long_text(input_text, min_segment_length,max_segment_length)
    
    #print(split_text[1])
    input_split_texts = []
    kg_for_input_split_texts = []
    KGs_so_far=""
    reconstruction_results_for_input_split_texts = []
    
    for each_split_text in split_text[0:len(split_text)]:
        
        #print("Input segment:" ,each_split_text)

        input_string_list, kg_results, reconstruction_results = KG_construction_and_reconstruction(each_split_text, model_name)
        input_split_texts.append(input_string_list)
        kg_for_input_split_texts.append(kg_results)
        KGs_so_far= "\n".join(kg_for_input_split_texts)
        reconstruction_results_for_input_split_texts.append(reconstruction_results)
        
    input_string_so_far_list.append(flatten_list_of_lists(input_split_texts))
    all_kg_results.append(flatten_list_of_lists(kg_for_input_split_texts))
    all_reconstruction_results.append(flatten_list_of_lists(reconstruction_results_for_input_split_texts))
    

﻿ERV
English Revised Version
Genesis 1:1	In the beginning God created the heaven and the earth.
Genesis 1:2	And the earth was waste and void; and darkness was upon the face of the deep: and the spirit of God moved upon the face of the waters.
Genesis 1:3	And God said, Let there be light: and there was light.
Genesis 1:4	And God saw the light, that it was good: and God divided the light from the darkness.
Genesis 1:5	And God called the light Day, and the darkness he called Night. And there was evening and there was morning, one day.
Genesis 1:6	And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters.
Genesis 1:7	And God made the firmament, and divided the waters which were under the firmament from the waters which were above the firmament: and it was so.
Genesis 1:8	And God called the firmament Heaven. And there was evening and there was morning, a second day.
Genesis 1:9	And God said, Let the waters under the heaven be gathered to

TypeError: sequence item 0: expected str instance, list found

In [None]:
df = pd.DataFrame({
    'Input_Texts': input_string_so_far_list,
    'Output_Graphs': all_kg_results,
    'Output_Reconstructions': all_reconstruction_results, })


print(df)

# print("500 word sample evalution:", "\n")
# base_cap_500, original_cap_500, knowledgegraph_cap_500, reconstruction_cap_500,QA_df = evaluate_peformance(df, 2,
#                                                                                                      "q_a_kg.parquet")

# print("No context correct answer percentage:", base_cap_500, "\n")
# print("Original context correct answer percentage:", original_cap_500, "\n")
# print("Knowledgegraph context correct answer percentage:", knowledgegraph_cap_500, "\n")
# print("Reconstruckted text context correct answer percentage:", reconstruction_cap_500, "\n")





In [None]:
df.to_csv("data/test.csv", encoding='utf-8', index=False)
#QA_df.to_csv("data/questions_answer_save_200.csv", encoding='utf-8', index=False)

In [None]:
# i = 0
# for input_text in concatenated_texts[2451:2500]:
#     print(i)
    
#     i = i+1
#     print(len(input_text))
    
#     #writing_style = get_style_genre(model_name, system_prompt, get_first_n_words(input_text, len(input_text))) #len(input_text) 1000
#     writing_style = get_style_genre(get_first_n_words(input_text, len(input_text)), model_name, system_prompt) #len(input_text) 1000
    
#     sentences = [input_text]
#     current_kg = []
#     current_kg.append("<style_analysis>" + writing_style + "</style_analysis>")
#     segment_nr = 1
#     reconstruction_so_far = ""
#     input_string_so_far = ""
#     for sentence in sentences:
#         input_string_so_far += sentence
#         if len(input_string_so_far) > stop_len:
#             break
#         current_kg_context = current_kg[-50:] if len(current_kg) > 50 else current_kg

#             # Concatenate the elements into a single string
#         current_kg_context = ' '.join(current_kg_context)

#         text = scripts.prompts.KG_format_example_prompt(current_kg_context, sentence)
        
        
#         try:
#             for i in range(2):
#                 knowledge_graph_segment = ask_LLM(model_name,
#                                                     "You are a very smart very intelligence assistant who is very helpful.",
#                                                     text, API_KEY, temperature=0.5, top_p=0.95, max_tokens=1000,
#                                                     frequency_penalty=1.1, presence_penalty=1.1)
#                 if not (extract_kg_content(knowledge_graph_segment) == None):
#                     break
#             try:
#                 current_kg.append("<segment " + str(segment_nr) + ">\n" + extract_kg_content(
#                         knowledge_graph_segment) + "<source_sentence_min_hash: " + str(
#                         create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
#                 print("<segment " + str(segment_nr) + ">\n" + extract_kg_content(
#                         knowledge_graph_segment) + "<source_sentence_min_hash: " + str(
#                         create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
#             except:
#                 current_kg.append(
#                         "<segment " + str(segment_nr) + ">\n" + knowledge_graph_segment + "<source_sentence_min_hash: " + str(
#                             create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
#                 print("<segment " + str(segment_nr) + ">\n" + knowledge_graph_segment + "<source_sentence_min_hash: " + str(
#                         create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")

#             prompt = scripts.prompts.KG_reconstruction_prompt(reconstruction_so_far, current_kg)
#             for i in range(2):
#                 next_reconstruction = ask_LLM(model_name,
#                                                 "You are a very smart very intelligence assistant who is very helpful.",
#                                                 prompt, API_KEY, temperature=0.5, top_p=0.95, max_tokens=4000,
#                                                 frequency_penalty=1.1, presence_penalty=1.1)
#                 if not (extract_reconstruction_content(next_reconstruction) == None):
#                     break

#             reconstruction_so_far += extract_reconstruction_content(next_reconstruction)
#                 #print(reconstruction_so_far)

#             print(extract_reconstruction_content(next_reconstruction))
#             segment_nr += 1

#             #all_kg_results.append(current_kg)
#             #print("....................start....................................")
#             #print(current_kg.split("<source_sentence_min_hash:"))

#             #print("...............current kg........................")
#             #print(current_kg)

#             #kg_String = ''.join(current_kg)
        
#         except:
#             print("No Kg found")
            
#         try:
#             all_kg_results.append(current_kg)

#                 #print(".....................current kg end.........................")
#                 #all_reconstruction_results.append(reconstruction_so_far)
#                 #print(reconstruction_so_far.split("<source_sentence_min_hash:")[0])
#                 #reconstruction_String = ''.join(reconstruction_so_far)
#             all_reconstruction_results.append(reconstruction_so_far)
#                 #print("....................end....................................")

#             input_string_so_far_list.append(input_string_so_far)
        
#         except:
#             print("Pass because of no Kg found")


In [None]:
# i = 0

# for input_text in concatenated_texts[101:200]:
#     try:
#         print(i)

#         i = i+1
#         print(len(input_text))
#         writing_style = get_style_genre(get_first_n_words(input_text, len(input_text))) #len(input_text) 1000
#         sentences = [input_text]
#         current_kg = []
#         current_kg.append("<style_analysis>" + writing_style + "</style_analysis>")
#         segment_nr = 1
#         reconstruction_so_far = ""
#         input_string_so_far = ""
#         for sentence in sentences:
#             input_string_so_far += sentence
#             if len(input_string_so_far) > stop_len:
#                 break
#             current_kg_context = current_kg[-50:] if len(current_kg) > 50 else current_kg

#                 # Concatenate the elements into a single string
#             current_kg_context = ' '.join(current_kg_context)

#             text = scripts.prompts.KG_format_example_prompt(current_kg_context, sentence)

#             for i in range(2):
#                 knowledge_graph_segment = ask_LLM(model_name,
#                                                     "You are a very smart very intelligence assistant who is very helpful.",
#                                                     text, API_KEY, temperature=0.1, top_p=0.95, max_tokens=1000,
#                                                     frequency_penalty=1.1, presence_penalty=1.1)
#                 if not (extract_kg_content(knowledge_graph_segment) == None):
#                     break
#             try:
#                 current_kg.append("<segment " + str(segment_nr) + ">\n" + extract_kg_content(
#                         knowledge_graph_segment) + "<source_sentence_min_hash: " + str(
#                         create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
#                 print("<segment " + str(segment_nr) + ">\n" + extract_kg_content(
#                         knowledge_graph_segment) + "<source_sentence_min_hash: " + str(
#                         create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
#             except:
#                 current_kg.append(
#                         "<segment " + str(segment_nr) + ">\n" + knowledge_graph_segment + "<source_sentence_min_hash: " + str(
#                             create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
#                 print("<segment " + str(segment_nr) + ">\n" + knowledge_graph_segment + "<source_sentence_min_hash: " + str(
#                         create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")

#             prompt = scripts.prompts.KG_reconstruction_prompt(reconstruction_so_far, current_kg)
#             for i in range(2):
#                 next_reconstruction = ask_LLM(model_name,
#                                                 "You are a very smart very intelligence assistant who is very helpful.",
#                                                 prompt, API_KEY, temperature=0.5, top_p=0.95, max_tokens=4000,
#                                                 frequency_penalty=1.1, presence_penalty=1.1)
#                 if not (extract_reconstruction_content(next_reconstruction) == None):
#                     break

#             reconstruction_so_far += extract_reconstruction_content(next_reconstruction)
#                 #print(reconstruction_so_far)

#             print(extract_reconstruction_content(next_reconstruction))
#             segment_nr += 1

#             #all_kg_results.append(current_kg)
#             #print("....................start....................................")
#             #print(current_kg.split("<source_sentence_min_hash:"))

#             #print("...............current kg........................")
#             #print(current_kg)

#             #kg_String = ''.join(current_kg)
#         all_kg_results.append(current_kg)

#             #print(".....................current kg end.........................")
#             #all_reconstruction_results.append(reconstruction_so_far)
#             #print(reconstruction_so_far.split("<source_sentence_min_hash:")[0])
#             #reconstruction_String = ''.join(reconstruction_so_far)
#         all_reconstruction_results.append(reconstruction_so_far)
#             #print("....................end....................................")

#         input_string_so_far_list.append(input_string_so_far)

#     except:
#         pass


In [None]:
# i = 0
# for input_text in concatenated_texts[0:1000]:
#     print(i)
    
#     i = i+1
#     print(len(input_text))
#     #print(input_text)
#     try:

#         writing_style = get_style_genre(get_first_n_words(input_text, len(input_text))) #len(input_text) 1000

#         # sentences= text_to_sentences(input_text)
#         # sentences =sentences_to_large_strings(sentences)
#         sentences = [input_text]
#         # print(sentences)
#         # continue
#         current_kg = []
#         current_kg.append("<style_analysis>" + writing_style + "</style_analysis>")
#         #print("<style_analysis>" + writing_style + "</style_analysis>")
#         segment_nr = 1
#         reconstruction_so_far = ""
#         input_string_so_far = ""
#         for sentence in sentences:
#             input_string_so_far += sentence
#             if len(input_string_so_far) > stop_len:
#                 break
#             #print("INPUT:", sentence)
#             # print("-----")
#             # '''
#             # prompt="""INPUT_TEXT:
#             # """+sentence+"""
#             # INSTRUCTION:
#             # Paraphrase the given input text so that every statement is rephrased into sentences that contain only three to ten words each.
#             #   Use a simple structure and make sure to retain all information, names, numbers, and dates from the original text, without losing
#             #     any information. The output text should consist exclusively of factual, neutrally phrased sentences that are three to ten words
#             #       long. All information must be preserved, but without any artistic nuances. Direct speech in the source text should not be
#             #         replicated as such, but it should be laid out in short sentences who said or did what in which order, ensuring a neutral,
#             #           information-rich text."""
    
#             # reply = ask_LLM ('NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO',
#             #   "You are a very smart very intelligence assistant who is very helpful.",
#             #     input_text , API_KEY ,temperature=0.5,top_p=0.95,max_tokens=1000, frequency_penalty=1.1,presence_penalty=1.1)
#             # '''

#             # Determine the slice of the last 50 elements (if the list has more than 50 elements)
#             current_kg_context = current_kg[-50:] if len(current_kg) > 50 else current_kg

#             # Concatenate the elements into a single string
#             current_kg_context = ' '.join(current_kg_context)

#             #print(".....................KG_format_example_prompt start.......................")
#             text = scripts.prompts.KG_format_example_prompt(current_kg_context, sentence)
#             #print(text)
#             #print(".....................KG_format_example_prompt end.......................")

#             for i in range(2):
#                 knowledge_graph_segment = ask_LLM(model_name,
#                                                 "You are a very smart very intelligence assistant who is very helpful.",
#                                                 text, API_KEY, temperature=0.1, top_p=0.95, max_tokens=1000,
#                                                 frequency_penalty=1.1, presence_penalty=1.1)
#                 if not (extract_kg_content(knowledge_graph_segment) == None):
#                     break
#             try:
#                 current_kg.append("<segment " + str(segment_nr) + ">\n" + extract_kg_content(
#                     knowledge_graph_segment) + "<source_sentence_min_hash: " + str(
#                     create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
#                 print("<segment " + str(segment_nr) + ">\n" + extract_kg_content(
#                     knowledge_graph_segment) + "<source_sentence_min_hash: " + str(
#                     create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
#             except:
#                 current_kg.append(
#                     "<segment " + str(segment_nr) + ">\n" + knowledge_graph_segment + "<source_sentence_min_hash: " + str(
#                         create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
#                 print("<segment " + str(segment_nr) + ">\n" + knowledge_graph_segment + "<source_sentence_min_hash: " + str(
#                     create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")

#             prompt = scripts.prompts.KG_reconstruction_prompt(reconstruction_so_far, current_kg)
#             for i in range(2):
#                 next_reconstruction = ask_LLM(model_name,
#                                             "You are a very smart very intelligence assistant who is very helpful.",
#                                             prompt, API_KEY, temperature=0.5, top_p=0.95, max_tokens=4000,
#                                             frequency_penalty=1.1, presence_penalty=1.1)
#                 if not (extract_reconstruction_content(next_reconstruction) == None):
#                     break

#             reconstruction_so_far += extract_reconstruction_content(next_reconstruction)
#             #print(reconstruction_so_far)
            
#             print(extract_reconstruction_content(next_reconstruction))
#             segment_nr += 1
            
#         #all_kg_results.append(current_kg)
#         #print("....................start....................................")
#         #print(current_kg.split("<source_sentence_min_hash:"))
        
#         #print("...............current kg........................")
#         #print(current_kg)
        
#         #kg_String = ''.join(current_kg)
#         all_kg_results.append(current_kg)
        
#         #print(".....................current kg end.........................")
#         #all_reconstruction_results.append(reconstruction_so_far)
#         #print(reconstruction_so_far.split("<source_sentence_min_hash:")[0])
#         #reconstruction_String = ''.join(reconstruction_so_far)
#         all_reconstruction_results.append(reconstruction_so_far)
#         #print("....................end....................................")
        
#         input_string_so_far_list.append(input_string_so_far)
        
        
# #         print("\n")
# #         print("......all_kg_results............")
# #         print("\n")
# #         print(all_kg_results)
        
# #         print("\n")
# #         print("......reconstruction text............")
# #         print("\n")
# #         print(all_reconstruction_results)
#     except:
#         print(i)
#         pass
        

In [None]:
# df = pd.DataFrame({
#     'Input_Texts': input_string_so_far_list,
#     'Output_Graphs': all_kg_results,
#     'Output_Reconstructions': all_reconstruction_results, })


# print(df)

# # print("500 word sample evalution:", "\n")
# # base_cap_500, original_cap_500, knowledgegraph_cap_500, reconstruction_cap_500,QA_df = evaluate_peformance(df, 2,
# #                                                                                                      "q_a_kg.parquet")

# # print("No context correct answer percentage:", base_cap_500, "\n")
# # print("Original context correct answer percentage:", original_cap_500, "\n")
# # print("Knowledgegraph context correct answer percentage:", knowledgegraph_cap_500, "\n")
# # print("Reconstruckted text context correct answer percentage:", reconstruction_cap_500, "\n")





In [None]:
# df.to_csv("data/df_save_2500.csv", encoding='utf-8', index=False)
# #QA_df.to_csv("data/questions_answer_save_200.csv", encoding='utf-8', index=False)

In [None]:
#1000 paper for cot: 

# No context correct answer percentage: 44.19784400760939 

# Original context correct answer percentage: 84.14647730437204 

# Knowledgegraph context correct answer percentage: 77.47158824081902 

# Reconstruckted text context correct answer percentage: 77.27733804656881 

