In [6]:
import pandas as pd

from scripts.style_generation import get_style_genre
from scripts.first_n_words import get_first_n_words
from scripts.llm import ask_LLM
from scripts.kg_content import extract_kg_content
from scripts.minhash_vector import create_minhash_vector
from scripts.reconstruction_content import extract_reconstruction_content
from scripts.evaluate import evaluate_peformance
import scripts.prompts
import scripts.api_key

In [7]:
# Load the dataset from Hugging Face
dataset = pd.read_csv("dataset/ML-Arxiv-Papers.csv")
rows, columns = dataset.shape
# Extract the 'train' split
#train_dataset = dataset["train"]

# Create lists for titles and abstracts
# titles = [entry['title'] for entry in train_dataset]
# abstracts = [entry['abstract'] for entry in train_dataset]

# Create a list with concatenated title and abstract for each sample
concatenated_texts = dataset['abstract'] #[f"{title} {abstract}" for title, abstract in zip(titles, abstracts)]

API_KEY = scripts.api_key.API_KEY


stop_len = 5000

model_name = "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"
system_prompt = "You are a very smart very intelligence assistant who is very helpful."

all_kg_results = []
all_reconstruction_results = []
input_string_so_far_list = []


In [10]:
rows

117592

In [8]:
for input_text in concatenated_texts[:1000]:

    try:

        writing_style = get_style_genre(get_first_n_words(input_text, 1000))

        # sentences= text_to_sentences(input_text)
        # sentences =sentences_to_large_strings(sentences)
        sentences = [input_text]
        # print(sentences)
        # continue
        current_kg = []
        current_kg.append("<style_analysis>" + writing_style + "</style_analysis>")
        print("<style_analysis>" + writing_style + "</style_analysis>")
        segment_nr = 1
        reconstruction_so_far = ""
        input_string_so_far = ""
        for sentence in sentences:
            input_string_so_far += sentence
            if len(input_string_so_far) > stop_len:
                break
            print("INPUT:", sentence)
            # print("-----")
            # '''
            # prompt="""INPUT_TEXT:
            # """+sentence+"""
            # INSTRUCTION:
            # Paraphrase the given input text so that every statement is rephrased into sentences that contain only three to ten words each.
            #   Use a simple structure and make sure to retain all information, names, numbers, and dates from the original text, without losing
            #     any information. The output text should consist exclusively of factual, neutrally phrased sentences that are three to ten words
            #       long. All information must be preserved, but without any artistic nuances. Direct speech in the source text should not be
            #         replicated as such, but it should be laid out in short sentences who said or did what in which order, ensuring a neutral,
            #           information-rich text."""
    
            # reply = ask_LLM ('NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO',
            #   "You are a very smart very intelligence assistant who is very helpful.",
            #     input_text , API_KEY ,temperature=0.5,top_p=0.95,max_tokens=1000, frequency_penalty=1.1,presence_penalty=1.1)
            # '''

            # Determine the slice of the last 50 elements (if the list has more than 50 elements)
            current_kg_context = current_kg[-50:] if len(current_kg) > 50 else current_kg

            # Concatenate the elements into a single string
            current_kg_context = ' '.join(current_kg_context)
            text = scripts.prompts.KG_format_example_prompt(current_kg_context, sentence)

            for i in range(2):
                knowledge_graph_segment = ask_LLM('NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO',
                                                "You are a very smart very intelligence assistant who is very helpful.",
                                                text, API_KEY, temperature=0.5, top_p=0.95, max_tokens=1000,
                                                frequency_penalty=1.1, presence_penalty=1.1)
                if not (extract_kg_content(knowledge_graph_segment) == None):
                    break
            try:
                current_kg.append("<segment " + str(segment_nr) + ">\n" + extract_kg_content(
                    knowledge_graph_segment) + "<source_sentence_min_hash: " + str(
                    create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
                print("<segment " + str(segment_nr) + ">\n" + extract_kg_content(
                    knowledge_graph_segment) + "<source_sentence_min_hash: " + str(
                    create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
            except:
                current_kg.append(
                    "<segment " + str(segment_nr) + ">\n" + knowledge_graph_segment + "<source_sentence_min_hash: " + str(
                        create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")
                print("<segment " + str(segment_nr) + ">\n" + knowledge_graph_segment + "<source_sentence_min_hash: " + str(
                    create_minhash_vector(sentence)) + " >\n" + "</segment " + str(segment_nr) + ">\n")

            prompt = scripts.prompts.KG_reconstruction_prompt(reconstruction_so_far, current_kg)
            for i in range(2):
                next_reconstruction = ask_LLM('NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO',
                                            "You are a very smart very intelligence assistant who is very helpful.",
                                            prompt, API_KEY, temperature=0.5, top_p=0.95, max_tokens=4000,
                                            frequency_penalty=1.1, presence_penalty=1.1)
                if not (extract_reconstruction_content(next_reconstruction) == None):
                    break

            reconstruction_so_far += extract_reconstruction_content(next_reconstruction)
            print(extract_reconstruction_content(next_reconstruction))
            segment_nr += 1
        all_kg_results.append(current_kg)
        all_reconstruction_results.append(reconstruction_so_far)
        input_string_so_far_list.append(input_string_so_far)
    except:
        pass
        

<style_analysis>The provided text demonstrates a highly formal and academic writing style, befitting its subject matter. The language is precise, technical, and discipline-specific, with a focus on clarity and simplicity. This is evident in the use of specialized terminology and the consistent employment of mathematical notation.
In terms of genre, the text can be classified as a technical or academic paper, specifically within the field of statistical learning. It does not fit neatly into any of the other genres listed, as it lacks the narrative elements and creative language associated with those categories.
The rhythm and flow of the text are dictated by the structure of the argument and the logical progression of the ideas presented. The sentences are generally succinct and straightforward, with a clear focus on conveying information in a clear and concise manner. This is in keeping with the formal, academic style of the text.
The tone of the text is impartial and authoritative, re

TypeError: can only concatenate str (not "NoneType") to str

In [None]:
df = pd.DataFrame({
    'Input_Texts': input_string_so_far_list,
    'Output_Graphs': all_kg_results,
    'Output_Reconstructions': all_reconstruction_results, })


# print(df)

print("500 word sample evalution:", "\n")
base_cap_500, original_cap_500, knowledgegraph_cap_500, reconstruction_cap_500,QA_df = evaluate_peformance(df, 2,
                                                                                                     "q_a_kg.parquet")

print("No context correct answer percentage:", base_cap_500, "\n")
print("Original context correct answer percentage:", original_cap_500, "\n")
print("Knowledgegraph context correct answer percentage:", knowledgegraph_cap_500, "\n")
print("Reconstruckted text context correct answer percentage:", reconstruction_cap_500, "\n")





500 word sample evalution: 

questions, correct_answers  ['What is the main goal of statistical learning in the given context?\nA) To estimate the joint distribution of $(X,Y)$\nB) To construct a predictor of a random variable $Y$ as a function of $X$\nC) To determine the optimal encoding of $Y$-values for transmission\nD) To analyze the underlying family of probability distributions', 'In the context of statistical learning, what is the bit rate constraint mentioned?\nA) The $X$-part of the sample can only be communicated at a finite bit rate\nB) The $Y$-part of the sample can only be communicated at a finite bit rate\nC) The encoding of $X$-values is allowed to depend on the $Y$-values\nD) The encoding of $X$-values is not allowed to depend on the $Y$-values'] ['B', 'B']
questions, correct_answers  [''] ['1']
What is the main goal of statistical learning in the given context?
A) To estimate the joint distribution of $(X,Y)$
B) To construct a predictor of a random variable $Y$ as a fu

In [None]:
df.to_csv("dataset/df_save.csv", encoding='utf-8', index=False)
QA_df.to_csv("dataset/questions_answer_save.csv", encoding='utf-8', index=False)

In [None]:
# No context correct answer percentage: 41.666666666666664 

# Original context correct answer percentage: 80.0 

# Knowledgegraph context correct answer percentage: 58.33333333333333 

# Reconstruckted text context correct answer percentage: 65.0 

#with the COT prompt (only add by one line )

# No context correct answer percentage: 38.33333333333333 

# Original context correct answer percentage: 76.66666666666666 

# Knowledgegraph context correct answer percentage: 79.62962962962962 

# Reconstruckted text context correct answer percentage: 66.66666666666666 