# Librerie

In [None]:
# Install the required libraries (llama-cpp-python is built with cuBLAS enabled for GPU use)
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.76 numpy==1.21.6 scipy==1.7.3 sentence_transformers==2.2.2 --force-reinstall --upgrade --no-cache-dir --verbose
!pip install huggingface_hub
!pip install openpyxl
!pip install pandas
!pip install langchain
!pip install langchain-chroma
!pip install langchain-community
!pip install faiss-cpu
!pip install flask-ngrok
!pip install pyngrok

In [None]:
# Importing necessary libraries
import spacy
import json
import os
import re
import csv
import openpyxl
import faiss
import difflib
import pandas as pd
import numpy as np
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from pyngrok import ngrok
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok
from google.colab import drive
drive.mount('/content/drive')

# Chiamata Modello

In [None]:
# Select LLama2 13B as the model
# (Hugging Face repository id and the name of the quantised weights file inside it)
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin"

# Download the model from Hugging Face Hub
# (the returned value is handed to Llama as `model_path` below)
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

# Initialize the model with configuration for GPU
# NOTE(review): no n_ctx is passed here while call_model requests
# max_tokens=4096 — confirm the default context window is large enough.
lcpp_llm = Llama(
    model_path=model_path,
    n_threads=2,  # Number of CPU threads used by llama.cpp
    n_batch=512,  # Batch size, depends on GPU VRAM
    n_gpu_layers=32  # Number of layers handled by the GPU
)
# Bare expression: as the last line of a notebook cell it displays the
# configured number of GPU layers; it has no effect when run as a script.
lcpp_llm.params.n_gpu_layers

In [None]:
# Response generation
def call_model(prompt):
    """Send *prompt* to the module-level ``lcpp_llm`` and return the generated text.

    Args:
        prompt: The full prompt string passed to the model.

    Returns:
        The ``"text"`` field of the first entry in the completion's
        ``"choices"`` list.
    """
    completion = lcpp_llm(prompt=prompt, max_tokens=4096)
    first_choice = completion["choices"][0]
    return first_choice["text"]

# Vector Store Few Shots

In [None]:
# Initialization of the embedding model
# (its `encode` method is used both to vectorise the example inputs and to
# vectorise each incoming query before the FAISS search)
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
# Input and output data examples
# `inputs` holds free-form user phrasings; the entry at position i is paired
# with `outputs[i]` when the few-shot metadata is built, so the two lists must
# stay the same length and in the same order.
# Misspellings in the strings below are sample user input and are embedded
# as-is — do not "correct" them.
inputs = [
    "According to your understanding of Automaton...",
    "What is the final state of an automaton",
    "How are transitions defined",
    "How is an automaton defined",
    "what are the symbols accepted by the automaton",
    "nodes",
    "Give me a list of the arcs",
    "what is the initial stage of the automaton",
    "Tell me a little bit about the automaton",
    "transitions",
    "q0",
    "Please describe the transitions",
    "which is final state",
    "could be this automaton determinated",
    "There is a transition between q5 and q7",
    "What is q1??",
    "Does it only accept 1s and 0s",
    "And how about the arrows",
    "What is the purpose of the automaton",
    "How are your states linked together",
    "what are the transitions from q4",
    "What are the states connected to q0",
    "What does the automaton recognize",
    "How many nodes are there",
    "Can you provide a representation of the automaton",
    "Give me some examples of inputs and output",
    "What is the output",
    "Output",
    "Talk me about transitions",
    "what are the input symbols",
    "if the input is 11100 which is the result",
    "if the input is 110 which is the result",
    "What are the accepted inputs",
    "what are all the transictions",
    "Show me the transitions of the automaton",
    "What transitions does the automaton have",
    "what are the final states",
    "how many final states there are",
    "what is its optimal spatial representation",
    "What can I use finite state automata for",
    "Can I use 5 states in an automaton",
    "How can I define an automaton",
    "What is an example of accepted string",
    "What could be a minimal spatial representation for this automaton",
    "which are the trasitions",
    "good morning",
    "how are you",
    "what's up",
    "maximum number of states in an automaton",
    "According to your understanding of Automaton what is its optimal spatial representation",
    "What is the best way to represent this automa",
    "Summarise the automaton",
    "What are automaton",
    "start states",
    "final states",
    "overview of the automaton",
    "how is the automaton",
    "transition diagram",
    "what s the number of nodes",
    "is 1 a label for some arc",
    "What direction are the connections between the states",
    "what are the states and the connections between states",
    "which are the states connected with 0",
    "what are the patterns",
    "what is a pattern",
    "Which are the transitions from q1",
    "Which are the transitions from q2",
]

# Controlled-natural-language counterparts of `inputs`: the entry at position i
# is the expected translation of `inputs[i]`. Several user phrasings map to the
# same target sentence, so duplicates here are intentional.
outputs = [
    "Tell me more about the automaton",
    "What is the final state",
    "What are the transitions",
    "What is an automaton",
    "What is the alphabet",
    "Tell me more about states",
    "What are the arcs",
    "What is the initial state",
    "Describe me briefly the automaton",
    "What are transitions",
    "What is q0",
    "What are transitions",
    "Tell me more about the final state",
    "Is the automaton deterministic",
    "Transition from q5 to q7 exists",
    "What is q1",
    "What is the language",
    "What are the arcs",
    "Describe me briefly the automaton",
    "What are the arcs",
    "Transitions q4",
    "Transitions q0",
    "What is the language",
    "How many states are there",
    "How is the automaton represented",
    "What is the language",
    "What is the language",
    "What is the language",
    "Tell me about transitions",
    "What is the alphabet",
    "Describe me the automaton",
    "Describe me the automaton",
    "What is the language",
    "What are transitions",
    "What are transitions",
    "What are transitions",
    "What is the final state",
    "What is the final state",
    "How is the automaton represented",
    "Describe me briefly the automaton",
    "What are the states",
    "What is an automaton",
    "What is the alphabet",
    "How is the automaton represented",
    "What are transitions",
    "Hello",
    "Hello",
    "Hello",
    "How many states are there",
    "How is the automaton represented",
    "How is the automaton represented",
    "Describe me briefly the automaton",
    "What is an automaton",
    "What is the initial state",
    "What is the final state",
    "Describe me the automaton",
    "Describe me the automaton",
    "What is an automaton",
    "How many states",
    "What are the arcs",
    "What are the arcs",
    "Tell me more about states",
    "Transitions q0",
    "Tell me the pattern",
    "Tell me the pattern",
    "Which are the transitions from q1",
    "Which are the transitions from q2"
]

In [None]:
# The few-shot examples are paired by position, so a length mismatch would
# silently attach the wrong translation to an input (or fail later with an
# unhelpful IndexError). Fail early with a clear message instead.
if len(inputs) != len(outputs):
    raise ValueError(
        f"inputs ({len(inputs)}) and outputs ({len(outputs)}) must have the same length"
    )

# Generation of vectors for input, converted to float32 before being added
# to the FAISS index
input_vectors = np.array(embedding_model.encode(inputs), dtype=np.float32)

# Creation of the FAISS index (exact L2 search over the example inputs)
dimension = input_vectors.shape[1]
index_io = faiss.IndexFlatL2(dimension)

# Adding vectors to the index; row i of the index corresponds to inputs[i]
index_io.add(input_vectors)

# Saving metadata (input and output), in the same order as the index rows so
# that a FAISS result id can be used directly as a list position
metadata_io = [
    {"input": example_input, "output": example_output}
    for example_input, example_output in zip(inputs, outputs)
]

In [None]:
def search_io_vector_store(translation, k=8):
    """Return the few-shot examples whose input is closest to *translation*.

    The text is embedded with ``embedding_model`` and looked up in
    ``index_io``; each hit is mapped back to its ``metadata_io`` entry.

    Args:
        translation: The text to find similar example inputs for.
        k: Maximum number of examples to return (defaults to 8, the value
            that was previously hard-coded).

    Returns:
        A list of ``{"input": ..., "output": ...}`` dicts ordered from the
        closest match, or ``None`` when the index yields no valid hit.
    """
    query_vector = embedding_model.encode([translation])[0].astype(np.float32)

    # Performing the search (the index expects a 2-D batch of queries)
    _, indices = index_io.search(query_vector.reshape(1, -1), k)

    # FAISS always returns k slots and pads missing neighbours with -1.
    # A negative id must be dropped: used as a list position it would
    # silently select the *last* metadata entry.
    results = [metadata_io[idx] for idx in indices[0] if idx >= 0]

    if not results:
        print("No results found.")
        return None
    return results

# Traduzione LNC Few Shots

In [None]:
def format_examples(example_list):
    """Render few-shot examples as text for inclusion in a prompt.

    Each example becomes two lines, preceded by a newline::

        Original: "<input>"
        Translation: {"Translation": "<output>"}

    Args:
        example_list: Iterable of dicts with ``'input'`` and ``'output'``
            keys. ``None`` or an empty list is accepted because the example
            search can return ``None`` when it finds nothing.

    Returns:
        The concatenated example text, or ``""`` when there are no examples.
    """
    if not example_list:
        return ""

    # In str.format, doubled braces are literal braces; "{}" are the two slots.
    template = '\nOriginal: "{}"\nTranslation: {{"Translation": "{}"}}'
    return "".join(
        template.format(example['input'], example['output'])
        for example in example_list
    )

In [None]:
def llama2PromptTemplate(systemPrompt, userPrompt):
    """Wrap a system prompt and a user prompt in the Llama-2 chat markup.

    Defined with ``def`` rather than a lambda bound to a name so the function
    carries a real name in tracebacks and can hold this documentation. The
    produced text is unchanged.

    Args:
        systemPrompt: Text placed between the ``<<SYS>>`` markers.
        userPrompt: Text placed before the closing ``[/INST]`` marker.

    Returns:
        The assembled prompt string, including the leading and trailing
        newline of the original template.
    """
    return (
        "\n<s>[INST] <<SYS>>\n"
        f"{systemPrompt}\n"
        "<</SYS>>\n"
        "\n"
        f"{userPrompt} [/INST]\n"
    )

In [None]:
# System prompt for the few-shot translation step. Written as adjacent string
# literals; the resulting text is one sentence block wrapped in a leading and a
# trailing newline.
systemPromptFewShots = (
    "\n"
    "You are an expert on finite state automata. Translate user inputs to a "
    "controlled natural language. Use precise, technical terminology, and format "
    "the response as a JSON object with the translation inside the 'Translation' field."
    "\n"
)

def userPromptFewShots(text, example_list):
    """Build the user prompt for the few-shot translation step.

    Defined with ``def`` rather than a lambda bound to a name so the function
    carries a real name in tracebacks and can hold this documentation. The
    produced text is unchanged.

    Args:
        text: The user input to translate; inserted between double quotes
            after ``Input:``.
        example_list: Few-shot examples, rendered with ``format_examples``.

    Returns:
        The prompt string: instructions, the rendered examples, then the new
        input followed by an ``Output:`` line.
    """
    return (
        "\nPlease translate the following inputs into controlled natural language.\n"
        "Use the format provided in the examples:\n"
        f"{format_examples(example_list)}\n"
        "Now, translate the new input using the same controlled natural language format.\n"
        "Format the response as a JSON object with the translation inside the 'Translation' field.\n"
        f'Input: "{text}"\n'
        "Output:\n"
    )

In [None]:
def extract_json_from_output(model_output):
    """Extract the first JSON object embedded in the model's text output.

    The previous implementation located the object by counting ``{`` and
    ``}`` characters, which miscounts braces that appear inside JSON string
    values, and it gave up if the first ``{`` did not start valid JSON.
    This version lets the JSON parser find the end of the object and tries
    each ``{`` in turn.

    Args:
        model_output: Raw text returned by the model.

    Returns:
        The decoded object (a ``dict``), or ``None`` when the text contains
        no ``{`` or no position yields a parseable JSON object.
    """
    start = model_output.find('{') if model_output else -1
    if start == -1:
        print("Error: No opening brace found")
        return None

    decoder = json.JSONDecoder()
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and
            # ignores whatever text follows it.
            json_output, _ = decoder.raw_decode(model_output, start)
        except json.JSONDecodeError:
            # Not valid JSON from this brace; try the next candidate.
            start = model_output.find('{', start + 1)
        else:
            return json_output

    print("Error: Unable to decode JSON")
    return None

In [None]:
def translate_CNL(user_input, few_shots):
    """Translate *user_input* into controlled natural language using few-shot examples.

    Args:
        user_input: The raw user sentence.
        few_shots: Example dicts (``'input'``/``'output'``) to include in the
            prompt; ``None`` is treated as "no examples".

    Returns:
        The model's ``'Translation'`` string with commas removed, or ``None``
        when the model output contains no JSON object with a string
        ``'Translation'`` field.
    """
    prompt = llama2PromptTemplate(
        systemPromptFewShots,
        userPromptFewShots(user_input, few_shots or []),
    )
    parsed = extract_json_from_output(call_model(prompt))
    if not isinstance(parsed, dict) or not parsed:
        return None

    # The model is asked for a 'Translation' field but nothing guarantees it
    # complies; treat a missing or non-string value as a failed translation
    # instead of raising KeyError/AttributeError.
    translation = parsed.get('Translation')
    if not isinstance(translation, str):
        print("Error: JSON output has no usable 'Translation' field")
        return None
    return translation.replace(',', '')

# Estrazione Espressioni Regolari e Ricerca per Pertinenza RE

In [None]:
# Function to extract regular expressions from an AIML file
def extract_patterns(file_path):
    """Return the contents of every ``<pattern>`` element in an AIML file.

    Args:
        file_path: Path of the AIML file, read as UTF-8.

    Returns:
        A list with the text found between ``<pattern>`` and ``</pattern>``
        (which may span lines), in file order, excluding entries that are
        just ``*`` once surrounding whitespace is ignored.
    """
    with open(file_path, 'r', encoding='utf-8') as aiml_file:
        aiml_text = aiml_file.read()

    found = re.findall(r'<pattern>(.*?)</pattern>', aiml_text, flags=re.DOTALL)

    # Drop the catch-all pattern; everything else is kept untouched.
    kept = []
    for candidate in found:
        if candidate.strip() == '*':
            continue
        kept.append(candidate)
    return kept

# Function to preprocess the patterns
def preprocess_pattern(pattern):
    """Return *pattern* with every ``*`` character removed.

    Only the asterisks are dropped; surrounding whitespace is left as is.
    """
    return pattern.replace('*', '')

# Function to calculate similarity with wildcard
def wildcard_similarity(pattern, query):
    """Tell whether *query* fully matches *pattern*, where ``*`` is a wildcard.

    The query is upper-cased before matching; the pattern is used as written.
    Every character of the pattern other than ``*`` is matched literally.
    Previously the pattern was handed to the regex engine unescaped, so
    characters such as ``+``, ``?`` or ``(`` were interpreted as regex syntax
    (wrong matches, or ``re.error`` for unbalanced parentheses).

    Args:
        pattern: Pattern text in which ``*`` stands for any run of characters
            (including none).
        query: The text to test.

    Returns:
        ``True`` if the whole upper-cased query matches the pattern,
        ``False`` otherwise.
    """
    # Escape each literal segment, then rejoin the segments with the regex
    # equivalent of the '*' wildcard.
    regex = '.*'.join(re.escape(segment) for segment in pattern.split('*'))
    return re.fullmatch(regex, query.upper()) is not None

In [None]:
# Initialization of the embedding model
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Specify the path to your AIML file
file_path = '/content/drive/My Drive/Llama2_Translation_LNC/novagraphABCv2.0.aiml.xml'

# Read the AIML patterns and derive a wildcard-free copy of each one; the
# copies are what gets embedded, the originals are kept for wildcard matching
patterns = extract_patterns(file_path)
preprocessed_patterns = list(map(preprocess_pattern, patterns))

# Embed the preprocessed patterns and note the embedding width
pattern_vectors = embedding_model.encode(preprocessed_patterns)
dimension = pattern_vectors.shape[1]

# Build the FAISS index for patterns from the float32 vectors
pattern_vectors = np.array(pattern_vectors, dtype=np.float32)
index_pattern = faiss.IndexFlatL2(dimension)
index_pattern.add(pattern_vectors)

# Metadata in index order: entry i describes the vector stored at row i
metadata_pattern = [
    {"pattern": raw_pattern, "preprocessed_pattern": cleaned_pattern}
    for raw_pattern, cleaned_pattern in zip(patterns, preprocessed_patterns)
]

In [None]:
# Function to search for the most relevant regular expression
def search_regex_vector_store(query):
    """Find the AIML pattern most relevant to *query*.

    The 10 nearest patterns are fetched from ``index_pattern`` by embedding
    distance. The first of them (in nearest-first order) that matches the
    query through ``wildcard_similarity`` wins with a score of ``1.0``. If
    none matches, the nearest pattern is returned with the score
    ``1 / (1 + distance)``.

    The earlier version reached the same result through a ``similarity``
    variable that was never reset between candidates; that bookkeeping is
    replaced by the direct logic above. Result slots that FAISS pads with
    ``-1`` are now skipped instead of being used as list positions.

    Args:
        query: The text to look up. Falsy values short-circuit.

    Returns:
        A ``(pattern, score)`` tuple, or ``(None, None)`` when *query* is
        empty or the index yields no valid candidate.
    """
    if not query:
        return None, None

    query_vector = embedding_model.encode([query])[0].astype(np.float32)
    k = 10

    # Perform the search (the index expects a 2-D batch of queries)
    distances, indices = index_pattern.search(query_vector.reshape(1, -1), k)

    # Retrieve candidates, nearest first. FAISS reports missing neighbours
    # as -1, which would otherwise select the last metadata entry.
    candidates = [
        (metadata_pattern[idx]['pattern'], distance)
        for idx, distance in zip(indices[0], distances[0])
        if idx >= 0
    ]
    if not candidates:
        return None, None

    # A wildcard match is treated as an exact hit.
    for pattern, _ in candidates:
        if wildcard_similarity(pattern, query):
            return pattern, 1.0

    # If no exact wildcard match is found, return the closest based on
    # embedding. The score is converted to a plain float so callers do not
    # receive a NumPy scalar.
    closest_pattern, closest_distance = candidates[0]
    closest_similarity = float(1 / (1 + closest_distance))
    return closest_pattern, closest_similarity

# Traduzione LNC - RE

In [None]:
# System prompt for the second translation step, which rewrites a sentence so
# that it fits a given pattern. Written as adjacent string literals; the text
# is wrapped in a leading and a trailing newline.
systemPrompt_AIML_LNC = (
    "\n"
    "You are an assistant that transforms questions into a controlled natural "
    "language format using a regular expression. Respond only with the JSON object "
    "containing the translation. Use the regular expression to ensure the structure is "
    "correct. Do not include any additional text."
    "\n"
)

def userPrompt_AIML_LNC(translation, regular_expression):
    """Build the user prompt for the pattern-guided translation step.

    Defined with ``def`` rather than a lambda bound to a name so the function
    carries a real name in tracebacks and can hold this documentation. The
    template text itself is unchanged.

    Args:
        translation: The sentence to rewrite; inserted after ``Input:`` in
            the final task section.
        regular_expression: The pattern the rewrite should conform to;
            inserted after ``Regular Expression:``.

    Returns:
        The prompt string: instructions, three worked examples, then the
        task section for the given arguments.
    """
    # Doubled braces in the f-string below are literal braces in the output.
    return f"""
Translate the input into a controlled natural language using the regular expression.
Respond only with the JSON object.

Example 1:
Input: "what is the initial stage"
Regular Expression: INITIAL STATE
JSON Translation: {{
  "Translation": "what is the initial state"
}}

Example 2:
Input: "tell me the transitions"
Regular Expression: TRANSITIONS
JSON Translation: {{
 "Translation": "transitions"
}}

Example 3:
Input: "What the final state is?"
Regular Expression: * FINAL STATE
JSON Translation: {{
  "Translation": "what is the final state"
}}

Task:
Input: {translation}
Regular Expression: {regular_expression}
Provide the JSON translation below:
"""

In [None]:
def translate_CNL_AIML(translation, regular_expression):
    """Rewrite *translation* so that it fits *regular_expression*, using the model.

    Args:
        translation: The sentence produced by the first translation step.
        regular_expression: The pattern the result should conform to.

    Returns:
        The model's ``'Translation'`` string with commas removed, or ``None``
        when the model output contains no JSON object with a string
        ``'Translation'`` field.
    """
    prompt = llama2PromptTemplate(
        systemPrompt_AIML_LNC,
        userPrompt_AIML_LNC(translation, regular_expression),
    )
    extraction = extract_json_from_output(call_model(prompt))
    if not isinstance(extraction, dict) or not extraction:
        return None

    # The model is asked for a 'Translation' field but nothing guarantees it
    # complies; treat a missing or non-string value as a failed translation
    # instead of raising KeyError/AttributeError.
    result = extraction.get('Translation')
    if not isinstance(result, str):
        print("Error: JSON output has no usable 'Translation' field")
        return None
    return result.replace(',', '')

# Esecuzione Codice

In [None]:
def execution(user_input):
    """Run the full translation pipeline on one user sentence.

    Steps, each of which may end the pipeline early:

    1. Look up the closest pattern for the raw input. No pattern or no
       score -> ``None``; a score of exactly 1 -> the input is returned
       unchanged.
    2. Translate the input with few-shot examples. A falsy translation ->
       the original input is returned.
    3. Look up the closest pattern for the translation. Only when the score
       lies strictly between 0.1 and 0.8 is a second, pattern-guided
       translation attempted; otherwise the first translation is returned.
    4. Return the second translation if it is truthy, else the first one.

    Progress is printed at every step.
    """
    print("Input: ", user_input)

    # Check if it's already a relevant question, if there is a match with a regular expression
    first_pattern, first_score = search_regex_vector_store(user_input)
    print("PATTERN 1: ", first_pattern, " Similarity Score: ", first_score)
    if not first_pattern or first_score is None:
        return None
    if first_score == 1:
        return user_input

    # Search for few shots for relevance within the vector store, then
    # generate a translation in LNC using the found few shots
    few_shots = search_io_vector_store(user_input)
    translation = translate_CNL(user_input, few_shots)
    print("TRANSLATION 1: ", translation)
    if not translation:
        return user_input

    # Search for the most relevant regular expression
    second_pattern, second_score = search_regex_vector_store(translation)
    print("PATTERN 2: ", second_pattern, " Similarity Score: ", second_score)
    if not second_pattern or second_score is None:
        return translation
    if not (0.1 < second_score < 0.8):
        return translation

    # Generate a second translation in LNC combining the AIML file and the first translation
    final_output = translate_CNL_AIML(translation, second_pattern)
    print("TRANSLATION 2: ", final_output)
    return final_output or translation

# Server Flask

In [None]:
# Read the ngrok auth token from the environment instead of hard-coding it:
# a credential written into the notebook is exposed to everyone who can read
# the file or its history. Any token that was previously committed here should
# be revoked in the ngrok dashboard.
authtoken = os.environ.get("NGROK_AUTHTOKEN")
if not authtoken:
    raise RuntimeError(
        "Set the NGROK_AUTHTOKEN environment variable before opening the ngrok tunnel."
    )
ngrok.set_auth_token(authtoken)

# Start ngrok on port 5000
public_url = ngrok.connect(5000)
print(f"Ngrok URL: {public_url}")

In [None]:
# Flask application whose /submit route exposes the translation pipeline
app = Flask(__name__)
# Starts ngrok when the app is run
# NOTE(review): a tunnel is also opened explicitly with pyngrok
# (ngrok.connect(5000)) in the previous cell — confirm both are needed.
run_with_ngrok(app)

@app.route('/submit', methods=['POST'])
def submit():
    """Handle ``POST /submit``: run the translation pipeline on the posted input.

    Expects a JSON object body with a non-empty string field ``"input"``.

    Returns:
        On success, a JSON response ``{"status": "success",
        "input_received": <pipeline result>}``. The ``input_received`` key is
        kept for client compatibility even though it carries the pipeline
        output.
        On a missing/invalid body or ``"input"`` field, a JSON error payload
        with HTTP status 400 (previously this surfaced as an unhandled
        exception or as a "success" carrying ``null``).
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so the problem can be reported in this API's own format.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({
            "status": "error",
            "message": "Request body must be a JSON object"
        }), 400

    user_input = data.get('input')
    if not isinstance(user_input, str) or not user_input.strip():
        return jsonify({
            "status": "error",
            "message": "Field 'input' must be a non-empty string"
        }), 400

    # Processing the input
    print("Input", user_input)
    result = execution(user_input)
    print("Output", result)

    # JSON response
    response = {
        "status": "success",
        "input_received": result
    }

    return jsonify(response)

# Start the Flask server only when this code runs as the main module
# (true for a notebook cell or a directly executed script, not for an import)
if __name__ == '__main__':
    app.run()