# Import Library

In [None]:
import os
import re
import json 
import pickle
import random
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tabulate import tabulate
from datetime import datetime
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, classification_report


In [None]:
import logging
import warnings

# Create the module logger. The variable ``__name__`` is passed (not the
# string literal '__name__') so the logger is named after the running module.
logger = logging.getLogger(__name__)
# Only ERROR and above are emitted by this logger.
logger.setLevel(logging.ERROR)
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Data Loading and Processing

### Load Training Data from MongoDB

In [None]:
# Connect to MongoDB
import os

# All connection settings are read from environment variables; a missing
# variable yields None.
DB_HOST = os.environ.get('DB_HOST')
DB_NAME = os.environ.get('DB_NAME')
DB_USERNAME = os.environ.get('DB_USERNAME')
DB_PASSWORD = os.environ.get('DB_PASSWORD')

# The database named DB_NAME is used both for authentication (authSource)
# and as the working database.
client = MongoClient(
    host=DB_HOST,
    port=27017,
    username=DB_USERNAME,
    password=DB_PASSWORD,
    authSource=DB_NAME,
)
database = client[DB_NAME]

In [None]:
# Print the total number of documents in the iFActJobs collection (empty filter).
print(database.iFActJobs.count_documents({}))

In [None]:
# List the names of all databases visible to the connected client.
db_list = client.list_database_names()
print(db_list)

In [None]:

# Load the job documents whose 'Projectnr' matches the (unanchored) regex
# '....[AS]', i.e. contains any four characters followed by 'A' or 'S'.
iFActJobs_df=pd.DataFrame(database.iFActJobs.find({'Projectnr':{'$regex':'....[AS]'}}))
#iFActJobs_df

In [None]:
# Display the loaded job documents.
iFActJobs_df

In [None]:
%%time
# Load the iFActPredMulti documents using the same 'Projectnr' regex filter
# as the job query ('....[AS]').
iFActPredMulti_df = pd.DataFrame(database.iFActPredMulti.find({'Projectnr':{'$regex':'....[AS]'}}))

In [None]:
# Display the loaded iFActPredMulti documents.
iFActPredMulti_df

In [None]:
# Report (rows, columns) of the two frames loaded from MongoDB.
print("Jobs:",iFActJobs_df.shape)
print("Labels:",iFActPredMulti_df.shape)

In [None]:
# Keep only the rows whose 'Emean' field equals 1.
# NOTE(review): presumably Emean == 1 marks manually labelled records - confirm.
iFActPredMulti_labelled_df=iFActPredMulti_df[iFActPredMulti_df.Emean==1]
iFActPredMulti_labelled_df

In [None]:
# Summarise the 'Emean' column (dtype and non-null count).
iFActPredMulti_df.Emean.info()

In [None]:
# %%time
# merge to get the labeled training data
# The labelled rows are joined with the job documents on 'Projectnr'. Columns
# that exist on both sides keep their name for the left frame and receive the
# '_2' suffix for the right frame.
FAJobTrainingData_df=pd.merge(iFActPredMulti_labelled_df, iFActJobs_df, left_on='Projectnr',right_on='Projectnr', suffixes=[None,'_2'])
FAJobTrainingData_df.info()

### Load Unlabelled Data from Oracle

In [None]:
# fetch all ifact data

import cx_Oracle
import platform

# This is the path to the ORACLE client files
# (relative to the current working directory).
lib_dir = r".\instantclient_21_12"

# Diagnostic output to verify 64 bit arch and list files
# NOTE(review): only the header "FILES AT lib_dir: " is printed; the files
# themselves are never listed by this cell.
print("ARCH:", platform.architecture())
print("FILES AT lib_dir: ")

# Initialise the Oracle client libraries. Any failure is printed rather than
# raised, so execution continues with the statements below.
try:
    cx_Oracle.init_oracle_client(lib_dir=lib_dir)
except Exception as err:
    print("Error connecting: cx_Oracle.init_oracle_client()")
    print(err)
    
# Test to see if the cx_Oracle is recognized
print('Oracle version: ', cx_Oracle.version)   

# NOTE(review): the original author reported that this call failed for them
# until a fix (not shown in this notebook) was applied - confirm the client
# libraries are found before relying on it.
cx_Oracle.clientversion() 

# Oracle connection settings. Each value can be overridden through an
# environment variable (ORACLE_USER / ORACLE_PASSWORD / ORACLE_DSN), matching
# how the MongoDB and Neo4j credentials are supplied in this notebook; the
# previous hard-coded values remain as defaults so existing setups keep working.
oracle_config = {
    'username' : os.environ.get('ORACLE_USER', 'reader'),
    'password' : os.environ.get('ORACLE_PASSWORD', 'reader'),
    'dsn' : os.environ.get('ORACLE_DSN', "ifact.muc.infineon.com"),
}

def oracle_conn(user, password, dsn):
    """Open a new Oracle connection and return it.

    Args:
        user: Database user name.
        password: Password for ``user``.
        dsn: Data source name passed to ``cx_Oracle.connect``.

    Returns:
        The connection object created by ``cx_Oracle.connect``. The caller is
        responsible for closing it.
    """
    credentials = {'user': user, 'password': password, 'dsn': dsn}
    return cx_Oracle.connect(**credentials)

In [None]:
# Query samples
# Up to 30000 distinct job rows whose project number starts with `job_site`
# are fetched in random order (ORDER BY dbms_random.value).
job_site = 'MC'
query_smpl = """
SELECT DISTINCT JOB_ID, JOB_PROJECT_NUMBER, JOB_PRODUCT, JOB_SUMMARY, JOB_COMMENT 
FROM READER.V_JOB_TEXTS
WHERE JOB_PROJECT_NUMBER LIKE '{job_site}%'
ORDER BY dbms_random.value
OFFSET 0 ROWS FETCH NEXT 30000 ROWS ONLY
"""
# Open the connection explicitly so it can be closed again once the result
# has been read; previously the connection object was created inline and
# never released.
_oracle_connection = oracle_conn(user=oracle_config['username'],
                                 password=oracle_config['password'],
                                 dsn=oracle_config['dsn'])
try:
    ifact_df = pd.read_sql(query_smpl.format(job_site = job_site),
                           con = _oracle_connection)
finally:
    _oracle_connection.close()

In [None]:
# find the unlabelled data
# Project numbers that already have a label in the training data.
labelled_job_ids = list(set(FAJobTrainingData_df['Projectnr']))
# Keep only Oracle rows without a label. An explicit copy is taken because
# new columns are assigned to this frame later (preprocess_unlabelled_data),
# which must not operate on a slice of ifact_df.
unlabelled_df = ifact_df[~ifact_df['JOB_PROJECT_NUMBER'].isin(labelled_job_ids)].copy()
unlabelled_df.shape

### Preprocess The Data

In [None]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

def get_lang_detector(nlp, name):
    """Component factory for the spaCy 'language_detector' pipe.

    Args:
        nlp: The pipeline the component is created for (unused).
        name: The component name inside the pipeline (unused).

    Returns:
        A new ``LanguageDetector`` instance.
    """
    return LanguageDetector()


try:
    nlp = spacy.load("en_core_web_sm")
    # Register the factory *function*. Passing a LanguageDetector() instance
    # here (as before) is not a valid factory, so the registration failed,
    # the pipe was never added and detect_lang() always fell back to 'en'.
    Language.factory("language_detector", func=get_lang_detector)
    nlp.add_pipe('language_detector', last=True)
except Exception as e:
    # Best effort: report the problem and continue without language detection.
    print(e)


# text = 'This is an english text.'
# print(doc._.language)


def detect_lang(text):
    """Return the language code detected for ``text``.

    'en' is returned when ``text`` is None and also whenever the detection
    raises any exception (best-effort fallback).
    """
    if text is None:
        return 'en'
    try:
        return nlp(text)._.language['language']
    except Exception:
        return 'en'

In [None]:
def preprocess_input_text(text):
    """Normalise a free-text field before language detection and prompting.

    Args:
        text: The raw text, or a missing value (None / NaN).

    Returns:
        '' for missing input; otherwise the text with every run of whitespace
        collapsed to a single space and the characters ``#``, ``(``, ``)``
        and ``=`` removed.
    """
    if pd.isna(text):
        return ''

    # Collapse runs of spaces/tabs/newlines into one space (raw string so
    # that ``\s`` is a regex escape, not an invalid string escape).
    processed_text = re.sub(r'\s+', ' ', text)

    # Remove the selected symbols in a single pass.
    return processed_text.translate(str.maketrans('', '', '#()='))


In [None]:
def preprocess_training_data(training_df):
    """Add cleaned-text and language columns to the labelled training frame.

    The frame is modified in place and also returned. Four columns are added:
    'Processed_JobComment', 'Processed_JobSummary' and their '_Lang'
    counterparts holding the detected language code.
    """
    text_columns = (
        ('Processed_JobComment', 'JobComment'),
        ('Processed_JobSummary', 'JobSummary'),
    )
    language_columns = (
        ('Processed_JobComment_Lang', 'Processed_JobComment'),
        ('Processed_JobSummary_Lang', 'Processed_JobSummary'),
    )

    # Clean the raw text fields.
    print('Preprocess the text data')
    for target, source in text_columns:
        training_df[target] = training_df[source].apply(preprocess_input_text)

    # Detect the language of each cleaned field.
    print('Detect the languages of job summary and job comments')
    for target, source in language_columns:
        training_df[target] = training_df[source].apply(detect_lang)

    return training_df

In [None]:
# Preprocess the labelled data. The function adds its columns to
# FAJobTrainingData_df in place and returns that same frame.
preprocessed_training_df = preprocess_training_data(FAJobTrainingData_df)


In [None]:
# Show the job comments detected as German. The language column created by
# preprocess_training_data is 'Processed_JobComment_Lang' (the rename to
# 'JobComment_Lang' further down is commented out).
preprocessed_training_df[preprocessed_training_df['Processed_JobComment_Lang'] == 'de'][['JobComment', 'Processed_JobComment_Lang']]

In [None]:

def preprocess_unlabelled_data(unlabelled_df):
    """Add cleaned-text and language columns to the unlabelled Oracle frame.

    The frame is modified in place and also returned. The raw text is read
    from 'JOB_COMMENT' and 'JOB_SUMMARY'; the added columns are
    'Processed_JobComment', 'Processed_JobSummary' and their '_Lang'
    counterparts.
    """
    text_columns = (
        ('Processed_JobComment', 'JOB_COMMENT'),
        ('Processed_JobSummary', 'JOB_SUMMARY'),
    )
    language_columns = (
        ('Processed_JobComment_Lang', 'Processed_JobComment'),
        ('Processed_JobSummary_Lang', 'Processed_JobSummary'),
    )

    # Clean the raw text fields.
    print('Preprocess the text data')
    for target, source in text_columns:
        unlabelled_df[target] = unlabelled_df[source].apply(preprocess_input_text)

    # Detect the language of each cleaned field.
    print('Detect the languages of job summary and job comments')
    for target, source in language_columns:
        unlabelled_df[target] = unlabelled_df[source].apply(detect_lang)

    return unlabelled_df


In [None]:
# Preprocess the unlabelled data. The function adds its columns to
# unlabelled_df in place and returns that same frame.
preprocessed_unlabelled_df = preprocess_unlabelled_data(unlabelled_df)

In [None]:
# preprocessed_training_df = preprocessed_training_df.rename(columns={"Processed_JobSummary_Lang":"JobSummary_Lang", "Processed_JobComment_Lang": "JobComment_Lang"})

In [None]:
# preprocessed_unlabelled_df = preprocessed_unlabelled_df.rename(columns={"Processed_JobSummary_Lang":"JobSummary_Lang", "Processed_JobComment_Lang": "JobComment_Lang"})

# Summarise the columns of the preprocessed unlabelled data.
preprocessed_unlabelled_df.info()

In [None]:
# Summarise the columns of the preprocessed training data.
preprocessed_training_df.info()

# Load the Electrical Failure Labels from Neo4j

In [None]:
from py2neo import Graph

# Prod
# Connection settings for the Neo4j instance, read from the environment
# (a missing variable yields None).
prod_config = dict(
    uri=os.environ.get('NEO4J_URI'),
    user=os.environ.get('NEO4J_USER'),
    pwd=os.environ.get('NEO4J_PWD'),
)

graph = Graph(
    prod_config['uri'],
    auth=(prod_config['user'], prod_config['pwd']),
)

In [None]:
def find_all_elf(graph):
    """Return every node labelled ``ElectricalFailure`` as a DataFrame.

    Columns: 'Name', 'Keywords', 'Comment', 'Explanation' (read from the node
    properties Name / Keywords / comment / explanation) plus 'Parsed_Label',
    the result of ``parse_elf_name`` applied to 'Name'.
    """
    cypher = "Match (n:ElectricalFailure) return n"
    rows = []
    for record in graph.run(cypher):
        node = record["n"]
        rows.append([node['Name'], node['Keywords'], node['comment'], node['explanation']])
    faults_df = pd.DataFrame(rows, columns=['Name', 'Keywords', 'Comment', 'Explanation'])
    faults_df['Parsed_Label'] = faults_df['Name'].apply(parse_elf_name)
    return faults_df

def parse_elf_name(label):
    """Return ``label`` lower-cased, without a leading 'ElFault' prefix."""
    prefix = 'ElFault'
    stripped = label[len(prefix):] if label.startswith(prefix) else label
    return stripped.lower()
    
def find_elf_groups(graph):
    """Return the direct ``is_a`` children of the 'ElectricalFailure' node.

    The result has the columns 'Name', 'Keywords', 'Comment', 'Explanation'
    and 'Parsed_Label' (``parse_elf_name`` applied to 'Name').
    """
    query="""
    Match (parent)<-[:is_a]-(n)
    WHERE parent.OntoName = "ElectricalFailure"
    return n"""
    rows = []
    for record in graph.run(query):
        node = record["n"]
        rows.append([node['Name'], node['Keywords'], node['comment'], node['explanation']])
    groups_df = pd.DataFrame(rows, columns=['Name', 'Keywords', 'Comment', 'Explanation'])
    groups_df['Parsed_Label'] = groups_df['Name'].apply(parse_elf_name)
    return groups_df


def find_elf_children(graph, elf_groups):
    """Return the direct ``is_a`` children of the given ontology nodes.

    Args:
        graph: Graph object whose ``run`` method executes Cypher.
        elf_groups: Iterable of ``OntoName`` values of the parent nodes.

    Returns:
        DataFrame with the columns 'Name', 'Keywords', 'Comment',
        'Explanation' and 'Parsed_Label'.
    """
    # The names are passed as a query parameter instead of being formatted
    # into the Cypher text, so names containing quotes and non-list iterables
    # no longer produce an invalid query.
    query = """
    Match (parent)<-[:is_a]-(n)
    WHERE parent.OntoName IN $elf_groups
    return n"""
    res = graph.run(query, {"elf_groups": list(elf_groups)})
    ElFaults = []
    for record in res:
        node = record["n"]
        ElFaults.append([node['Name'], node['Keywords'], node['comment'], node['explanation']])
    elf_df = pd.DataFrame(ElFaults, columns=['Name', 'Keywords', 'Comment', 'Explanation'])
    elf_df['Parsed_Label'] = elf_df['Name'].apply(parse_elf_name)
    return elf_df


def find_elf_parents(graph, elf_children):
    """Return the parent groups of the given child failure nodes.

    Only parents that are themselves direct ``is_a`` children of the
    'ElectricalFailure' root are matched.

    Args:
        graph: Graph object whose ``run`` method executes Cypher.
        elf_children: Iterable of ``OntoName`` values of the child nodes.

    Returns:
        De-duplicated DataFrame with the columns 'Name', 'Keywords',
        'Comment', 'Explanation' and 'Parsed_Label'.
    """
    # The names are passed as a query parameter instead of being formatted
    # into the Cypher text, so names containing quotes and non-list iterables
    # no longer produce an invalid query.
    query = """
    MATCH (root {OntoName: 'ElectricalFailure'})<-[:is_a]-(n)<-[:is_a]-(children)
    WHERE children.OntoName IN $elf_children
    RETURN n"""
    res = graph.run(query, {"elf_children": list(elf_children)})
    ElFaults = []
    for record in res:
        node = record["n"]
        ElFaults.append([node['Name'], node['Keywords'], node['comment'], node['explanation']])
    elf_df = pd.DataFrame(ElFaults, columns=['Name', 'Keywords', 'Comment', 'Explanation'])
    elf_df['Parsed_Label'] = elf_df['Name'].apply(parse_elf_name)
    return elf_df.drop_duplicates().reset_index(drop = True)


def load_elf_df(graph):
    """Load the two-level electrical-failure ontology.

    Parents are the direct ``is_a`` children of the 'ElectricalFailure' node
    (the node named 'Probing' is excluded); children are the direct ``is_a``
    children of each parent.

    Args:
        graph: Graph object whose ``run`` method executes Cypher.

    Returns:
        Tuple ``(elf_df, elf_parents_df, elf_children_df)``. All three share
        the columns 'Name', 'Keywords', 'Comment', 'Explanation',
        'Parsed_Label', 'Type' ('Parent' or 'Child'), 'Parent' (parsed label
        of the parent, None for parents) and 'Children' (list of parsed child
        labels, None for children). ``elf_df`` is parents followed by children.
    """
    columns = ['Name', 'Keywords', 'Comment', 'Explanation', 'Parsed_Label', 'Type', 'Parent', 'Children']

    parent_query = """
    Match (parent)<-[:is_a]-(n)
    WHERE parent.OntoName = "ElectricalFailure"
    return n"""
    # The parent name is supplied as a query parameter, so names containing
    # quotes cannot break the query; the text is built once, outside the loop.
    child_query = """
        Match (parent)<-[:is_a]-(n)
        WHERE parent.OntoName = $parent_name
        return n"""

    elf_parents = []
    for record in graph.run(parent_query):
        node = record["n"]
        if node['Name'] == 'Probing':
            continue
        elf_parents.append([node['Name'], node['Keywords'], node['comment'], node['explanation'],
                            parse_elf_name(node['Name']), 'Parent', None])

    elf_children_all = []
    for parent in elf_parents:
        # parent[0] is the node name, parent[4] its parsed label.
        elf_children = []
        for record in graph.run(child_query, {"parent_name": parent[0]}):
            node = record["n"]
            elf_children.append([node['Name'], node['Keywords'], node['comment'], node['explanation'],
                                 parse_elf_name(node['Name']), 'Child', parent[4], None])
        # Complete the parent row with the parsed labels of its children.
        parent.append([child[4] for child in elf_children])
        elf_children_all.extend(elf_children)

    elf_parents_df = pd.DataFrame(elf_parents, columns=columns)
    elf_children_df = pd.DataFrame(elf_children_all, columns=columns)

    elf_df = pd.concat([elf_parents_df, elf_children_df], axis=0, ignore_index=True)

    return elf_df, elf_parents_df, elf_children_df

In [None]:
# Load the ontology once: the combined frame plus the parent-only and
# child-only frames, which later cells read as module-level names.
elf_df, elf_parents_df, elf_children_df = load_elf_df(graph)


In [None]:
# Summarise the columns of the combined ontology frame.
elf_df.info()

In [None]:
# elf_parents_df

# Prompt Engineering

# Load Training Data

### Load & Process the Data

In [None]:


# Find the matched label
def match_elf_label(label, elf_df):
    """Return the rows of ``elf_df`` whose 'Parsed_Label' equals ``label``,
    compared case-insensitively (an empty frame when nothing matches)."""
    known_labels = elf_df['Parsed_Label'].str.lower()
    is_match = known_labels == label.lower()
    return elf_df[is_match]
# label_counts = preprocessed_training_df['ElPred_List'].explode().value_counts()x

def find_valid_labels(training_df, elf_df):
    """Split the labels used in ``training_df['ElPred_List']`` into those that
    exist in the ontology and those that do not.

    Both lists are printed together with their length; only the list of
    labels found in ``elf_df`` is returned.
    """
    label_counts = training_df['ElPred_List'].explode().value_counts()

    valid_labels, invalid_labels = [], []
    for label in label_counts.index:
        # A label is valid when at least one ontology row matches it.
        target = invalid_labels if match_elf_label(label, elf_df).empty else valid_labels
        target.append(label)

    print(len(valid_labels), valid_labels)
    print(len(invalid_labels), invalid_labels)
    return valid_labels
 

In [None]:
# Columns kept by process_dataset after de-duplication: the job id, the raw
# and cleaned text fields with their detected languages, both tech-group
# columns ('TechGroup_2' is the suffixed column from the merge) and the raw
# label string 'ElPred'.
TRAIN_SET_COLS = [
    "Projectnr",  
    'JobSummary', 
    'JobComment', 
    'Processed_JobComment', 
    'Processed_JobSummary', 
    'Processed_JobComment_Lang',
    'Processed_JobSummary_Lang', 
    'TechGroup',
    'TechGroup_2',
    "ElPred",
    ]

# Split the label list
def split_label(label):
    """Split a ';'-separated label string into a list of labels.

    Every '.' is removed first; each part is stripped of surrounding
    whitespace and empty parts are dropped.
    """
    without_dots = label.replace('.', '')
    stripped_parts = (part.strip() for part in without_dots.split(';'))
    return [part for part in stripped_parts if part]


def filter_samples_by_language(df, cols, language='en'):
    """Keep the rows whose language columns all equal ``language``.

    Args:
        df: Frame containing the language columns.
        cols: Names of the language columns that must all match. When empty,
            no restriction applies and every row is kept (previously this
            raised because the mask was never initialised).
        language: Language code to keep.

    Returns:
        The filtered rows of ``df``.
    """
    # Start from "keep everything" and narrow it down column by column.
    mask = pd.Series(True, index=df.index)
    for col in cols:
        mask &= df[col] == language
    return df[mask]


def validate_label(parent_label, child_label, elf_df):
    """Check that a (parent, child) label pair follows the ontology.

    Args:
        parent_label: Parsed parent label, or None.
        child_label: Parsed child label, or None when only a parent is given.
        elf_df: Ontology frame with the columns 'Parsed_Label' and 'Children'.

    Returns:
        False when there is no parent label, or when a child label is given
        that is not listed among the children of the parent (including the
        cases where the parent is unknown or has no children list). True
        otherwise; a parent without a child is accepted without a lookup.
    """
    # remove the labels who do not follow the ontology rules
    if parent_label is None:
        return False
    if child_label is None:
        return True

    children_of_parent = elf_df.loc[elf_df['Parsed_Label'] == parent_label, 'Children']
    if children_of_parent.empty:
        # Unknown parent: previously this raised IndexError.
        return False
    try:
        allowed_children = list(children_of_parent.iloc[0])
    except TypeError:
        # The matched row carries no children list (e.g. None).
        return False
    return child_label in allowed_children
   
    
def format_tech_group(label):
    """Return ``label`` with the prefix 'TechGroup', adding it when absent.

    Missing values (as judged by ``pd.isna``) are returned unchanged.
    """
    if pd.isna(label):
        return label
    prefix = 'TechGroup'
    return label if label.startswith(prefix) else prefix+label


def format_labels(label, parents_df, children_df):
    """Resolve a raw label string into parent label, child label and a
    combined label.

    The string is lower-cased and split with ``split_label``:

    * one part that is a parent label -> parent only;
    * one part that is a child label -> the child plus its (lower-cased)
      parent looked up in ``children_df``;
    * two parts -> taken as ``parent;child`` without validation, where the
      child 'none' means "no child";
    * anything else -> no label.

    Args:
        label: Raw label string, e.g. 'Short;Leak'.
        parents_df: Frame with a 'Parsed_Label' column of parent labels.
        children_df: Frame with 'Parsed_Label' and 'Parent' columns.

    Returns:
        ``[parent_label, child_label, formatted_label]`` where the first two
        may be None and ``formatted_label`` is '', 'parent' or 'parent;child'.
    """
    # Every part is already lower case because the input is lower-cased here.
    label_list = split_label(label.lower())
    parent_label = None
    child_label = None

    if len(label_list) == 1:
        candidate = label_list[0]
        if (parents_df['Parsed_Label'] == candidate).any():
            parent_label = candidate
        else:
            # Single lookup that both tests membership and fetches the parent.
            parents_of_child = children_df.loc[children_df['Parsed_Label'] == candidate, 'Parent']
            if not parents_of_child.empty:
                parent_label = parents_of_child.iloc[0].lower()
                child_label = candidate

    elif len(label_list) == 2:
        parent_label = label_list[0]
        child_label = None if label_list[1] == 'none' else label_list[1]

    # Combined label
    if parent_label is None:
        formatted_label = ''
    elif child_label is None:
        formatted_label = parent_label
    else:
        formatted_label = parent_label+';'+child_label
    return [parent_label, child_label, formatted_label]


def process_dataset(df, config):
    """Turn the raw labelled frame into the cleaned training set.

    Steps: drop duplicate rows, keep ``TRAIN_SET_COLS``, split the label
    string, keep English-only rows, normalise the tech group, resolve
    parent/child labels and drop rows whose labels break the ontology.

    Reads the module-level ``TRAIN_SET_COLS``, ``elf_parents_df``,
    ``elf_children_df`` and ``elf_df``.

    Args:
        df: Raw training frame.
        config: Mapping with 'target_field' (column name) and 'input_fields'
            (list of text column names; each needs a '<name>_Lang' column).

    Returns:
        The processed frame with a fresh 0..n-1 index.
    """
    # Drop duplicates; copy so that the column assignments below never write
    # to a view of the caller's frame.
    dedup_keys = ["Projectnr", config['target_field']] + config['input_fields']
    processed_training_data = df.drop_duplicates(subset=dedup_keys)[TRAIN_SET_COLS].copy()

    processed_training_data['ElPred_List'] = processed_training_data['ElPred'].apply(split_label)

    # Filter the text in English
    language_cols = [col + '_Lang' for col in config['input_fields']]
    processed_training_data = filter_samples_by_language(processed_training_data, language_cols, 'en').copy()

    # Normalise the tech group name (missing values stay missing).
    processed_training_data['Parsed_TechGroup'] = processed_training_data['TechGroup'].apply(format_tech_group)

    # Resolve the raw label into parent / child / combined label. Explicit
    # column names keep the assignment valid when no rows are left.
    label_columns = ['Parent_Label', 'Child_Label', 'Formatted_Label']
    label_parts = processed_training_data['ElPred'].apply(
        lambda label: format_labels(label, elf_parents_df, elf_children_df)).tolist()
    processed_training_data[label_columns] = pd.DataFrame(
        label_parts, index=processed_training_data.index, columns=label_columns)

    # Only include the jobs with valid labels
    is_valid = [
        validate_label(parent, child, elf_df)
        for parent, child in zip(processed_training_data['Parent_Label'],
                                 processed_training_data['Child_Label'])
    ]
    processed_training_data['Is_Valid_Label'] = pd.Series(
        is_valid, index=processed_training_data.index, dtype=bool)
    processed_training_data = processed_training_data[processed_training_data['Is_Valid_Label']]

    return processed_training_data.reset_index(drop = True)


In [None]:


def load_full_dataset(config):
    """Load the pickled training data and run ``process_dataset`` on it.

    Args:
        config: Mapping with 'training_data_folder' and 'training_data_name'
            (joined to form the pickle path) plus the keys required by
            ``process_dataset``.

    Returns:
        The processed training frame.
    """
    # Load the training data
    training_data_path = os.path.join(config['training_data_folder'], config['training_data_name'])
    print(f'Load The Training Data From {training_data_path}')
    # NOTE(security): unpickling executes arbitrary code; only load files
    # from a trusted location.
    training_data = pd.read_pickle(training_data_path)

    # Preprocess the data
    print('Preprocess The Training Data')
    processed_training_data = process_dataset(training_data, config)

    return processed_training_data
    

def select_samples_from_label_groups(df, config):
    """Draw a label-balanced sample of ``config['training_data_size']`` rows.

    Every 'Formatted_Label' group contributes the same number of rows (all of
    its rows when it is smaller than that quota). Any shortfall is filled
    with rows drawn at random from the rows not selected so far.

    Assumes the index of ``df`` is unique.

    Args:
        df: Frame with a 'Formatted_Label' column.
        config: Mapping with 'training_data_size' and 'random_state' (used
            for the per-group sampling; the fill-up step uses the fixed seed
            42 and reseeds the global ``random`` module).

    Returns:
        Tuple ``(samples_df, remaining_df)``: the selected rows with a fresh
        0..n-1 index, and the unselected rows with their original index.

    Raises:
        ValueError: If ``df`` has fewer rows than the requested size.
    """
    target_size = config['training_data_size']
    if len(df) < target_size:
        raise ValueError(
            f"Not enough data: {target_size} samples requested but only {len(df)} rows available")

    # Group the full dataset by label (scalar key, so group names are scalars).
    grouped_df = df.groupby('Formatted_Label')
    group_count = len(grouped_df)
    if group_count:
        samples_per_group, remaining_samples_num = divmod(target_size, group_count)
    else:
        # No usable label at all: everything is drawn by the fill-up step.
        samples_per_group, remaining_samples_num = 0, target_size

    # select the samples from each group
    selected_parts = []
    for _, fault_df in grouped_df:
        if len(fault_df) < samples_per_group:
            # Small group: take all of it and make up the difference later.
            remaining_samples_num += samples_per_group - len(fault_df)
            selected_parts.append(fault_df)
        else:
            selected_parts.append(
                fault_df.sample(samples_per_group, random_state=config['random_state']))
    # Concatenate once instead of growing a frame inside the loop.
    samples_df = pd.concat(selected_parts, axis=0) if selected_parts else df.iloc[0:0]

    if remaining_samples_num > 0:
        selected_index = set(samples_df.index)
        remaining_indices = [idx for idx in df.index if idx not in selected_index]
        random.seed(42)
        remaining_selected = random.sample(remaining_indices, remaining_samples_num)
        samples_df = pd.concat([samples_df, df.loc[remaining_selected]], axis=0)

    # Find unseen data
    selected_index = set(samples_df.index)
    remaining_df = df.loc[[idx for idx in df.index if idx not in selected_index]]
    label_counts = samples_df['Formatted_Label'].value_counts()
    print(label_counts)
    print('Total number of classes: ', len(label_counts))

    return samples_df.reset_index(drop=True), remaining_df


def create_non_intersecting_sample_sets(df, sample_size, set_num):
    """Draw ``set_num`` disjoint random samples of ``sample_size`` rows each.

    The global ``random`` module is seeded with 42 on every call, so the
    result is reproducible. Assumes the index of ``df`` is unique.

    Args:
        df: Frame to sample from.
        sample_size: Number of rows per sample set.
        set_num: Number of sample sets.

    Returns:
        Tuple ``(sample_sets, remaining_df)``: a list of frames (each with a
        fresh 0..n-1 index) and the rows that were not drawn, with their
        original index.

    Raises:
        ValueError: If ``sample_size * set_num`` exceeds the number of rows.
    """
    if sample_size * set_num > len(df):
        raise ValueError("n * k should not exceed the length of the dataframe")

    sample_sets = []
    indices = list(df.index)
    random.seed(42)  # Set the random seed for reproducibility

    for _ in range(set_num):
        sample_indices = random.sample(indices, sample_size)
        sample_sets.append(df.loc[sample_indices].reset_index(drop=True))
        # Set membership keeps the removal linear in the number of rows.
        drawn = set(sample_indices)
        indices = [idx for idx in indices if idx not in drawn]

    # Find unseen data
    remaining_df = df.loc[indices]
    return sample_sets, remaining_df


### Check Data Quality

In [None]:
# Check dataset quality
def evaluate_dataset_quality(df, text_cols, lang_cols = None):
    """Compute simple quality metrics for the text columns of ``df``.

    Args:
        df: Frame to evaluate.
        text_cols: Names of the text columns.
        lang_cols: Names of the matching language columns, in the same order
            as ``text_cols``. Defaults to ``<text column>_Lang``.

    Returns:
        DataFrame indexed by text column with the columns
        'Empty Value Rate' (share of rows that are null or ''),
        'Token Length' (mean number of whitespace-separated tokens),
        'DE Count' (rows detected as 'de'), 'EN Count' (all other rows) and
        'EN Ratio' (EN Count divided by the number of rows).
    """
    total_len = len(df)

    def share(count):
        # Avoid dividing by zero on an empty frame.
        return count / total_len if total_len else float('nan')

    # Null and empty-string rows are counted together and only then divided
    # by the row count (previously only the empty-string count was divided).
    empty_value_rate = [
        share(df[col].isnull().sum() + (df[col] == '').sum()) for col in text_cols
    ]

    # Calculate token length for each field (split by white space)
    token_length = df[text_cols].apply(lambda col: col.str.split().str.len().mean())

    # Detecting the languages
    if lang_cols is None:
        lang_cols = [col + '_Lang' for col in text_cols]
    de_count = [(df[col] == 'de').sum() for col in lang_cols]
    en_count = [total_len - count for count in de_count]
    en_ratio = [share(count) for count in en_count]

    # Combine the results into a DataFrame
    quality_metrics = pd.DataFrame({
        'Empty Value Rate': empty_value_rate,
        'Token Length': token_length,
        'DE Count': de_count,
        'EN Count': en_count,
        'EN Ratio': en_ratio,
        })

    return quality_metrics


# LLM

### Aleph Alpha

In [None]:
# Token and endpoint URL used by alpha_completion, read from the environment
# (None when the variable is not set).
ALPHA_TOKEN = os.environ.get('ALPHA_TOKEN')
ALPHA_URL = os.environ.get('ALPHA_URL')

In [None]:
def alpha_completion(prompt, config):
    """Send ``prompt`` to the endpoint at ``ALPHA_URL`` and return the completion.

    Args:
        prompt: Prompt text.
        config: Mapping with 'model_name', 'max_tokens' and 'temperature'.

    Returns:
        The text of the first completion, or the whole decoded response when
        it contains an 'error' key.

    NOTE(review): the request is sent with ``verify=False`` (TLS certificate
    verification disabled) and without a timeout.
    """
    request_body = {
        "model": config['model_name'],
        "prompt": prompt,
        "maximum_tokens": config['max_tokens'],
        "temperature": config['temperature'],
        # Generation stops at the example separator or at the end of the line.
        "stop_sequences": ["###", "\n"],
    }
    payload = json.dumps(request_body)

    # The token is sent as a cookie.
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
        'cookie': 'token='+ALPHA_TOKEN,
    }

    response = requests.request("POST", ALPHA_URL, headers=headers, data=payload, verify=False)
    res = json.loads(response.text)
    if 'error' in res.keys():
        return res
    return res['completions'][0]['completion']

### Mistral

In [None]:
### Mistral API 'Mistral-7B-Instruct'
# Endpoint URL used by mistral_completion, read from the environment.
OPENAI_URL = os.environ.get('OPENAI_URL')

def mistral_completion(prompt, config):
    """Send ``prompt`` to the endpoint at ``OPENAI_URL`` and return the text.

    Args:
        prompt: Prompt text.
        config: Mapping with 'model_name', 'max_tokens' and 'temperature'.

    Returns:
        The text of the first choice, or the whole decoded response when the
        HTTP status is not 200.

    NOTE(review): the request is sent with ``verify=False`` (TLS certificate
    verification disabled) and without a timeout.
    """
    request_body = {
        "model": config['model_name'],
        "prompt": prompt,
        "max_tokens": config['max_tokens'],
        "temperature": config['temperature'],
    }
    payload = json.dumps(request_body)

    headers = {"Content-Type": "application/json"}

    response = requests.request("POST", OPENAI_URL, headers=headers, data=payload, verify=False)
    res = json.loads(response.text)
    if response.status_code == 200:
        return res['choices'][0]['text']
    return res

### IFXGPT

In [None]:
### GPT4IFX endpoint settings (URL and basic-auth credentials), read from
### the environment.
gpt4ifx_url = os.environ.get('GPT4IFX_URL')
gpt4ifx_user=os.environ.get('GPT4IFX_USER')
gpt4ifx_password=os.environ.get('GPT4IFX_PASSWORD')

def mixtral_completion(prompt, config):
    """Send ``prompt`` as a single user message to the endpoint at
    ``gpt4ifx_url`` and return the answer.

    Args:
        prompt: Prompt text, sent as the content of one 'user' message.
        config: Mapping with 'model_name', 'max_tokens' and 'temperature'.

    Returns:
        The message content of the first choice, or the whole decoded
        response when the HTTP status is not 200.

    NOTE(review): the request is sent with ``verify=False`` (TLS certificate
    verification disabled) and without a timeout.
    """
    user_message = {
        "role": "user",
        "content": prompt
    }
    request_body = {
        "model": config['model_name'],
        "messages": [user_message],
        "n": 1,
        "max_tokens": config['max_tokens'],
        "temperature": config['temperature'],
    }
    payload = json.dumps(request_body)

    headers = {"Content-Type": "application/json"}

    response = requests.request("POST", gpt4ifx_url, headers=headers, auth=(gpt4ifx_user,gpt4ifx_password), data=payload, verify=False)
    res = json.loads(response.text)
    if response.status_code == 200:
        return res['choices'][0]['message']['content']
    return res

### Few-Shot Learning

##### Few-shot sample selection

In [None]:
def select_manual_samples(df, sample_indexes):
    """Return the rows of ``df`` at the given integer positions."""
    chosen_rows = df.iloc[sample_indexes]
    return chosen_rows


def select_random_samples(df, shots_num, random_state):
    """Return ``shots_num`` randomly chosen rows of ``df``.

    When ``df`` has ``shots_num`` rows or fewer, ``df`` itself is returned
    unchanged (no sampling, no shuffling).
    """
    if len(df) <= shots_num:
        return df
    return df.sample(n=shots_num, random_state=random_state)


def select_similar_samples(input_text, df, tfidf_matrix, vectorizer, shots_num):
    """Return the ``shots_num`` rows of ``df`` most similar to ``input_text``.

    The cosine similarity between the vectorised input and every row of
    ``tfidf_matrix`` is written to ``df['Similarity']`` (``df`` is modified
    in place); the rows with the largest values are returned.
    """
    query_vector = vectorizer.transform([input_text])
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)
    df['Similarity'] = similarity_scores[0]
    return df.nlargest(shots_num, 'Similarity')

def compute_tfidf_matrix(df, input_fields):
    """Fit a TF-IDF vectorizer on the concatenated input fields of ``df``.

    For every row the values of ``input_fields`` are converted to strings
    and joined with single spaces.

    Returns:
        Tuple ``(vectorizer, tfidf_matrix)``: the fitted ``TfidfVectorizer``
        and the matrix produced by ``fit_transform``.
    """
    def join_fields(row):
        return ' '.join(row.values.astype(str))

    combined_texts = df[input_fields].apply(join_fields, axis=1)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(combined_texts)
    return vectorizer, tfidf_matrix

##### Filter elf

In [None]:
# Preprocess the possible failure modes
def get_all_tech_groups():
    """Fetch the 'response' field from the service at the ``ONTO_URL``
    environment variable.

    NOTE(review): the request is sent with ``verify=False`` (TLS certificate
    verification disabled).
    """
    # Get tech group
    onto_url = os.environ.get('ONTO_URL')
    response = requests.get(url = onto_url,verify = False)
    decoded = json.loads(response.text)
    return decoded['response']

def filter_elf_by_tech_group(tech_group, elf_df):
    """Restrict ``elf_df`` to the failure modes possible for a tech group.

    The service at the ``ONTO_URL`` environment variable is queried with the
    ``tech_group`` query parameter; its 'response' field is decoded as a JSON
    list of labels.

    Args:
        tech_group: Tech group name, or None to skip filtering.
        elf_df: Ontology frame with 'Parsed_Label' and 'Parent' columns.

    Returns:
        The rows whose label or parent is among the returned labels (compared
        in lower case). ``elf_df`` is returned unchanged when ``tech_group``
        is None or when the lookup fails for any reason (best effort).
    """
    if tech_group is None:
        return elf_df
    try:
        # Let requests build the query string so that special characters in
        # the tech group name are encoded correctly.
        response = requests.get(url=os.environ.get('ONTO_URL'),
                                params={'tech_group': tech_group},
                                verify=False)
        res = json.loads(response.text)
        possible_elf = [label.lower() for label in json.loads(res['response'])]
        return elf_df[elf_df['Parsed_Label'].isin(possible_elf) | elf_df['Parent'].isin(possible_elf)]
    except Exception as e:
        print(f'Error getting the electrical failure related to {tech_group}, {e}')
        return elf_df

def filter_elf_by_keywords(input_texts, elf_df):
    """Restrict ``elf_df`` to the failure modes whose keywords occur in the text.

    A row matches when any of its keywords is a (case-sensitive) substring of
    ``input_texts``. The result contains the matching rows, their children
    (rows whose 'Parent' matched) and their parents (rows whose 'Children'
    list contains a match).

    Side effect: when ``elf_df`` has no 'split_keywords' column yet, it is
    added in place from the 'Keywords' column using ``split_label``.

    Args:
        input_texts: Text to search for keywords.
        elf_df: Ontology frame with 'Parsed_Label', 'Parent' and 'Children'
            columns, plus 'split_keywords' or 'Keywords'.

    Returns:
        The filtered frame, or ``elf_df`` itself when no keyword matched or
        when the frame carries no keyword information at all.
    """
    if 'split_keywords' not in elf_df.columns:
        if 'Keywords' not in elf_df.columns:
            # No keyword information: nothing to filter on (previously this
            # raised KeyError further down).
            return elf_df
        valid_keywords = elf_df['Keywords'].dropna().astype(str)
        elf_df['split_keywords'] = valid_keywords.apply(split_label)

    # Labels of all rows with at least one keyword present in the text.
    possible_elf = [
        row['Parsed_Label']
        for _, row in elf_df.iterrows()
        if isinstance(row['split_keywords'], list)
        and any(keyword in input_texts for keyword in row['split_keywords'])
    ]

    # If no match was found, return the unfiltered elf_df
    if not possible_elf:
        return elf_df

    def has_matched_child(children):
        return isinstance(children, list) and any(item in possible_elf for item in children)

    return elf_df[
        (elf_df['Parsed_Label'].isin(possible_elf)) |
        (elf_df['Parent'].isin(possible_elf)) |
        (elf_df['Children'].apply(has_matched_child))
    ]


In [None]:
# Display the combined ontology frame.
elf_df

#### Prompt Template

In [None]:
# def create_labelling_prompt_alpha(shots_df, elf_df, input_text, config):
#    task_definiation = """Predict the parent failure mode from the following list based on the report comment. \
# Once the parent failure mode is predicted, select the corresponding child failure mode if applicable. \
# The predictions should be in the format of 'parent_label' or 'parent_label;child_label'. \
# If no suitable parent failure mode can be found, use 'not_applicable' as the label.
# """

def create_labelling_prompt_alpha(shots_df, elf_df, input_text, config):
   """Build the few-shot labelling prompt in the '###'-separated format.

   The prompt consists of the task definition, the list of parent and child
   labels, as many few-shot examples as fit into the length budget, and the
   comment to be labelled followed by an open 'Failure:' line.

   NOTE(review): the ``elf_df`` argument is not used; the label lists are
   built from the module-level ``elf_parents_df`` and ``elf_children_df``.

   Args:
      shots_df: Few-shot examples; needs the columns named in
         ``config['input_fields']`` plus 'Parent_Label' and 'Child_Label'.
      elf_df: Unused (see note above).
      input_text: The comment to be labelled.
      config: Mapping with 'max_prompt_len' (budget counted in
         whitespace-separated words) and 'input_fields'.

   Returns:
      The complete prompt text.
   """
   task_definiation = """Predict the parent failure mode from the following list based on the report comment. \
Once the parent failure mode is predicted, select the corresponding child failure mode if applicable. \
If no suitable parent failure mode can be found, use 'not_applicable' as the parent label.
"""

   # Provide the list of the ontology
   # One line per parent: keywords (when truthy), explanation and children
   # (when not None).
   label_lists = 'Here is the list of parent nodes:\n'
   for i in range(len(elf_parents_df)):
      ontology = elf_parents_df.iloc[i]
      label_lists += f"- {ontology['Parsed_Label']}:"
      if ontology['Keywords'] :
         label_lists += f" has keywords({ontology['Keywords'] })."
      if ontology['Explanation']  is not None:
         label_lists += f" has meaning({ontology['Explanation'] })."
      if ontology['Children'] is not None:
         label_lists += f" has children label ({ontology['Children'] })."
      label_lists += '\n'
   
   label_lists += 'Here is the list of child nodes:\n'
   
   # One line per child: keywords (when truthy), explanation and parent
   # (when not None).
   for i in range(len(elf_children_df)):
      ontology = elf_children_df.iloc[i]
      label_lists += f"- {ontology['Parsed_Label']}:"
      if ontology['Keywords'] :
         label_lists += f" has keywords({ontology['Keywords'] })."
      if ontology['Explanation']  is not None:
         label_lists += f" has meaning({ontology['Explanation'] })."
      if ontology['Parent'] is not None:
         label_lists += f" has parent label({ontology['Parent'] })."
      label_lists += '\n'
   # Insert the input text
   input_section = f"""
### 
Comment: {input_text}
Failure:"""
   
   # Words still available for few-shot examples after the fixed sections.
   remaining_len = config['max_prompt_len']\
      - len(task_definiation.split())\
      - len(label_lists.split())\
      - len(input_section.split())
   
   shots_text = ''
   # Provide few shots
   # Examples are added in order; the first example that does not fit into
   # the remaining budget ends the loop.
   for i in range(shots_df.shape[0]):
        sample = shots_df.iloc[i]
        example=f"""
###
Comment: {' '.join([sample[col] for col in config['input_fields']])}
Failure: parent label: {str(sample['Parent_Label'])}, child label: {str(sample['Child_Label'])}"""
        
        example_len = len(example.split())
        if example_len > remaining_len:
            break
        else:
            shots_text += example
            remaining_len -= example_len

   prompt_text = task_definiation + label_lists + shots_text + input_section
   
   return prompt_text


In [None]:
def create_elf_label_list(elf_df, label_rel = False):
    """Render the failure-mode labels of ``elf_df`` as prompt text.

    Every label becomes an entry '- Label: <Parsed_Label>' followed by its
    keywords (when truthy) and explanation (when not None) and a blank line.

    Args:
        elf_df: Ontology frame with 'Parsed_Label', 'Keywords' and
            'Explanation'; with ``label_rel`` also 'Type', 'Parent' and
            'Children'.
        label_rel: When True, parents ('Type' == 'Parent') and children
            ('Type' == 'Child') are listed in two headed sections and each
            entry also names its children or its parent. When False, all
            rows are listed in one section without relations.

    Returns:
        The rendered text.
    """
    def render(frame, relation=None):
        # ``relation`` is None or a (column, caption) pair for the extra line.
        chunks = []
        for position in range(len(frame)):
            entry = frame.iloc[position]
            chunks.append(f"- Label: {entry['Parsed_Label']}\n")
            if entry['Keywords'] :
                chunks.append(f" Keywords: {entry['Keywords']}\n")
            if entry['Explanation'] is not None:
                chunks.append(f" Explanation: {entry['Explanation']}\n")
            if relation is not None:
                column, caption = relation
                if entry[column] is not None:
                    chunks.append(f" {caption}: {entry[column]}\n")
            chunks.append('\n')
        return ''.join(chunks)

    if not label_rel:
        return '' + render(elf_df)

    parent_rows = elf_df[elf_df['Type'] == 'Parent']
    child_rows = elf_df[elf_df['Type'] == 'Child']

    sections = [
        'Here is the list of parent failure mode labels:\n',
        render(parent_rows, ('Children', 'Children labels')),
        'Here is the list of child failure mode labels:\n',
        render(child_rows, ('Parent', 'Parent label')),
    ]
    return ''.join(sections)

In [None]:
def create_few_shots_text(shots_df, input_fields, target_fields, max_token_len):
    """Render few-shot examples until the word budget is used up.

    Each example has the form::

        Example <n>:
        Job Comment: <input fields joined by spaces>
        Failure: <label>

    With exactly one target field the label is its value; otherwise it is
    'parent label: <first target>, child label: <second target>'.

    Args:
        shots_df: Example rows.
        input_fields: Columns joined to form the job comment.
        target_fields: One column, or parent and child label columns.
        max_token_len: Budget in whitespace-separated words. Examples are
            added in order and the first one that does not fit ends the loop.

    Returns:
        The concatenated examples ('' when none fits).
    """
    single_target = len(target_fields) == 1
    budget = max_token_len
    accepted = []

    for position in range(shots_df.shape[0]):
        sample = shots_df.iloc[position]
        comment = ' '.join([sample[col] for col in input_fields])
        if single_target:
            failure = str(sample[target_fields[0]])
        else:
            parent_part = str(sample[target_fields[0]])
            child_part = str(sample[target_fields[1]])
            failure = f"parent label: {parent_part}, child label: {child_part}"
        example = f"\nExample {position + 1}:\nJob Comment: {comment}\nFailure: {failure}"

        example_len = len(example.split())
        if example_len > budget:
            break
        accepted.append(example)
        budget -= example_len

    return ''.join(accepted)

In [None]:
def create_labelling_prompt_mistral(shots_df, elf_df, input_text, config, label_rel):
    """Build the ``[INST]`` labelling prompt used for the Mistral-style models.

    The prompt is the concatenation of: the task definition, the failure-mode
    list produced by ``create_elf_label_list``, a guidelines header, as many
    few-shot examples as fit in the remaining budget, and the input section
    holding ``input_text``.

    Args:
        shots_df (DataFrame): Candidate few-shot rows. Each row must provide the
            columns in ``config['input_fields']`` plus ``Parent_Label`` and
            ``Child_Label``.
        elf_df (DataFrame): Failure-mode table passed to ``create_elf_label_list``.
        input_text (str): Job comment to be labelled.
        config (dict): Reads ``max_prompt_len`` and ``input_fields``.
        label_rel: Forwarded unchanged to ``create_elf_label_list``.

    Returns:
        str: The assembled prompt.
    """
    # The prompt wording is reproduced verbatim on purpose: changing it would
    # change model behaviour and invalidate earlier evaluation runs.
    task_definition = """[INST]
You are a classifier, and your task is to analyze the given 'Job Comment' and assign the appropriate the failure mode from the provided label list based on the report comment. \
First predict the parent failure mode from the following list based on the report comment. \
Once the parent failure mode is predicted, select only one corresponding child failure mode if applicable. \
If no suitable parent failure mode can be found, parent label should be 'not_applicable', child label should be 'None'. \
The Failure should always in the format of "parent label: parent failure mode label, child label: child failure mode label".\
Your output should only contain the predicted Failure in the required format and do not include any explanations.
<<<
"""

    label_lists = 'Here is the list of failure modes:\n'
    label_lists += create_elf_label_list(elf_df, label_rel)

    task_guidelines = """>>>

###
Here are some examples to guide your labeling process:
"""

    input_section = f"""
###
Now, please label the following job comment:

Job Comment: {input_text}
Failure:
[/INST]
"""

    # Budget left for few-shot examples, measured in whitespace-separated words
    # (the same unit create_few_shots_text uses).
    fixed_sections = (task_definition, label_lists, task_guidelines, input_section)
    remaining_len = config['max_prompt_len'] - sum(
        len(section.split()) for section in fixed_sections
    )

    # Reuse the shared few-shot renderer instead of a private copy of its loop;
    # with two target fields it emits the "parent label: ..., child label: ..."
    # format this prompt requires.
    shots_text = create_few_shots_text(
        shots_df,
        config['input_fields'],
        ['Parent_Label', 'Child_Label'],
        remaining_len,
    )

    return task_definition + label_lists + task_guidelines + shots_text + input_section


### Inference

In [None]:

def completion(shots_df, elf_df, input_text, config):
    """Build the prompt for ``config['model_type']`` and run the matching model.

    Args:
        shots_df (DataFrame): Few-shot examples passed to the prompt builder.
        elf_df (DataFrame): Failure-mode table passed to the prompt builder.
        input_text (str): Text to be labelled.
        config (dict): Must contain ``model_type`` ('alpha', 'mistral' or
            'mixtral'); it is also handed to the prompt builder and the
            completion function.

    Returns:
        tuple: ``(model_output, prompt)``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
            Previously the function fell through and returned ``None``, which
            only surfaced later as an unpacking error in the caller.
    """
    model_type = config['model_type']

    if model_type == 'alpha':
        prompt = create_labelling_prompt_alpha(shots_df, elf_df, input_text, config)
        return alpha_completion(prompt, config), prompt

    if model_type in ('mistral', 'mixtral'):
        # Both model families share the same prompt; only the backend differs.
        prompt = create_labelling_prompt_mistral(shots_df, elf_df, input_text, config, True)
        if model_type == 'mistral':
            return mistral_completion(prompt, config), prompt
        return mixtral_completion(prompt, config), prompt

    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected 'alpha', 'mistral' or 'mixtral'"
    )


def single_inference(input_sample, elf_df, labelled_data, config):
    """Run one labelling inference for a single sample.

    Args:
        input_sample (Series): The row to label. Must provide the columns in
            ``config['input_fields']`` and, when the 'tech' filter is active,
            ``TechGroup``.
        elf_df (DataFrame): Full failure-mode table; it is filtered per sample
            according to ``config['elf_filter']``.
        labelled_data (DataFrame): Labelled rows used as the source of few-shot
            examples.
        config (dict): Inference config. Reads ``input_fields``,
            ``shots_selects_method``, ``shots_num``, ``shots_indexes``,
            ``tfidf_matrix``, ``vectorizer``, ``elf_filter`` and ``model_type``.

    Returns:
        tuple: ``(prediction, prompt)``. On any failure the exception is printed
        and ``(exception, '')`` is returned so a batch run keeps going; callers
        must therefore check that the first element is a string.
    """
    try:
        input_text = ' '.join([input_sample[col] for col in config['input_fields']])

        # select few shots
        selection_method = config['shots_selects_method']
        if selection_method == 'random':
            shots_df = select_random_samples(labelled_data, config['shots_num'], 0)
        elif selection_method == 'manual':
            shots_df = select_manual_samples(labelled_data, config['shots_indexes'])
        else:
            shots_df = select_similar_samples(
                input_text,
                labelled_data,
                config['tfidf_matrix'],
                config['vectorizer'],
                config['shots_num'],
            )

        # filter the elf
        # Always start from a copy so that `elf_filter` being None or empty
        # leaves a usable, unfiltered table (it used to be undefined for None).
        filtered_elf_df = elf_df.copy(deep=True)
        active_filters = config.get('elf_filter') or ()
        # Filters are chained: each one narrows the result of the previous one.
        # Previously both filtered the raw table, so 'tech' silently discarded
        # the outcome of 'keywords'.
        if 'keywords' in active_filters:
            filtered_elf_df = filter_elf_by_keywords(input_text, filtered_elf_df)
        if 'tech' in active_filters:
            filtered_elf_df = filter_elf_by_tech_group(input_sample['TechGroup'], filtered_elf_df)

        # infer
        model_type = config['model_type']
        if model_type == 'alpha':
            prompt = create_labelling_prompt_alpha(shots_df, filtered_elf_df, input_text, config)
            return alpha_completion(prompt, config), prompt

        prompt = create_labelling_prompt_mistral(shots_df, filtered_elf_df, input_text, config, True)
        if model_type == 'mistral':
            return mistral_completion(prompt, config), prompt
        # Any other model type is routed to the mixtral backend.
        return mixtral_completion(prompt, config), prompt
    except Exception as e:
        # Deliberate best-effort: one failing sample must not abort the batch.
        print(e)
        return e, ''


In [None]:
def single_inference_two_level(input_sample, elf_df, labelled_data, config):
    """Run one inference that treats parent and child failure modes separately.

    Args:
        input_sample (Series): The row to label. Must provide the columns in
            ``config['input_fields']``, the ``config['target_field']`` column
            and, when the 'tech' filter is active, ``Parsed_TechGroup``.
        elf_df (DataFrame): Full failure-mode table with a ``Type`` column
            holding 'Parent' / 'Child'.
        labelled_data (DataFrame): Labelled rows used as the source of few-shot
            examples.
        config (dict): Inference config.

    Returns:
        tuple: For ``model_type == 'alpha'``: ``(parent_labels, child_labels)``,
        both lists of validated labels. For ``model_type == 'mistral'``:
        ``(label, model_output, prompt)``. The two shapes differ; this is kept
        as in the original code.

    Raises:
        ValueError: For any other ``model_type``. The previous fall-through
            returned two undefined names and always failed with ``NameError``.
    """
    input_text = ' '.join([input_sample[col] for col in config['input_fields']])
    label = input_sample[config['target_field']]

    # select few shots
    selection_method = config['shots_selects_method']
    if selection_method == 'random':
        shots_df = select_random_samples(labelled_data, config['shots_num'], 0)
    elif selection_method == 'manual':
        shots_df = select_manual_samples(labelled_data, config['shots_indexes'])
    else:
        shots_df = select_similar_samples(
            input_text,
            labelled_data,
            config['tfidf_matrix'],
            config['vectorizer'],
            config['shots_num'],
        )

    # filter the elf
    # Start from a copy so a None/empty `elf_filter` still yields a table.
    filtered_elf_df = elf_df.copy(deep=True)
    active_filters = config.get('elf_filter') or ()
    if 'keywords' in active_filters:
        filtered_elf_df = filter_elf_by_keywords(input_text, filtered_elf_df)
    if 'tech' in active_filters:
        # The result used to be thrown away, so the tech filter had no effect.
        filtered_elf_df = filter_elf_by_tech_group(input_sample['Parsed_TechGroup'], filtered_elf_df)

    elf_parents_df = filtered_elf_df[filtered_elf_df['Type'] == 'Parent']
    elf_children_df = filtered_elf_df[filtered_elf_df['Type'] == 'Child']

    # infer
    model_type = config['model_type']
    if model_type == 'alpha':
        # infer the parent labels
        prompt = create_labelling_prompt_alpha(shots_df, elf_parents_df, input_text, config)
        res = alpha_completion(prompt, config)
        parent_labels = process_prediction_label(res, list(elf_parents_df['Parsed_Label']))

        # infer the child labels
        prompt = create_labelling_prompt_alpha(shots_df, elf_children_df, input_text, config)
        res = alpha_completion(prompt, config)
        child_labels = process_prediction_label(res, list(elf_children_df['Parsed_Label']))

        return parent_labels, child_labels

    if model_type == 'mistral':
        prompt = create_labelling_prompt_mistral_two_level(shots_df, elf_parents_df, input_text, config, False)
        return label, mistral_completion(prompt, config), prompt

    raise ValueError(
        f"Unsupported model_type {model_type!r} for two-level inference; "
        "expected 'alpha' or 'mistral'"
    )


### Post Process and Evaluation

In [None]:
def process_prediction_label(label_str, ontology_list):
    """Clean a raw prediction string and keep only known ontology labels.

    The characters '.', '#' and ':' are removed, the string is split on ';',
    each piece is stripped and lower-cased, duplicates are dropped and only
    pieces that match an ontology label (case-insensitively) are returned.

    Args:
        label_str (str): Predicted labels separated by semicolons.
        ontology_list (list[str]): Valid ontology labels.

    Returns:
        list[str]: Lower-cased, de-duplicated valid labels in order of first
        appearance. (The previous set-based implementation returned them in
        arbitrary order.)
    """
    cleaned = label_str.replace('.', '').replace('#', '').replace(':', '')

    # dict.fromkeys de-duplicates while preserving first-seen order.
    candidates = dict.fromkeys(
        piece.strip().lower() for piece in cleaned.split(';') if piece.strip()
    )
    if not candidates:
        return []

    # Lower-case the ontology once instead of once per candidate label.
    known_labels = set(map(str.lower, ontology_list))
    return [label for label in candidates if label in known_labels]

def extract_prediction_labels(label_str):
    """Parse ``parent label: X`` / ``child label: Y`` out of a model response.

    The parent is validated against the ``Parsed_Label`` column of the global
    ``elf_parents_df``; the child is validated against that parent's
    ``Children`` entry.

    Args:
        label_str: Raw model output. Anything that is not a string is treated
            as "no prediction".

    Returns:
        tuple: ``(parent_label, child_label)``. ``('not_applicable', None)``
        when the input is not a string, no parent is found, or the parent is
        unknown. ``child_label`` is ``None`` when missing or not listed for
        the parent.
    """
    no_prediction = ('not_applicable', None)
    if not isinstance(label_str, str):
        return no_prediction

    parent_hit = re.search(r'[Pp]arent label:\s*(\w+)', label_str)
    child_hit = re.search(r'[Cc]hild label:\s*(\w+)', label_str)
    if not parent_hit:
        return no_prediction

    parent_label = parent_hit.group(1).lower()
    if parent_label not in list(elf_parents_df['Parsed_Label']):
        return no_prediction
    if not child_hit:
        return parent_label, None

    child_label = child_hit.group(1).lower()
    # Children registered for the first row matching this parent.
    allowed_children = elf_parents_df[elf_parents_df['Parsed_Label'] == parent_label]['Children'].iloc[0]
    if child_label not in allowed_children:
        child_label = None
    return parent_label, child_label


In [None]:
def get_confusion_matrix(y_true, y_pred):
    """Compute the confusion matrix over the distinct values of ``y_true``.

    Only labels that occur in ``y_true`` get a row/column. The label order is
    that of ``list(set(y_true))``.

    Returns:
        list[list[int]]: The matrix as nested Python lists.
    """
    observed_labels = list(set(y_true))
    matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=observed_labels)
    return matrix.tolist()

def plot_confusion_matrix(cm, labels):
    """Show a confusion matrix as an annotated heatmap.

    Args:
        cm: Matrix of integer counts in scikit-learn layout, i.e. rows are the
            actual classes and columns are the predicted classes (this is what
            ``get_confusion_matrix`` returns).
        labels: Tick labels, in the same order as the matrix rows/columns.
    """
    sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu', xticklabels=labels, yticklabels=labels)
    # The heatmap draws matrix rows along the y-axis and columns along the
    # x-axis, so y is the actual class and x the prediction. The two axis
    # titles used to be swapped.
    plt.ylabel('Actual', fontsize=12)
    plt.xlabel('Prediction', fontsize=12)
    plt.title('Confusion Matrix', fontsize=16)
    plt.show()

def get_classification_report(y_true, y_pred):
    """Return scikit-learn's classification report as a dictionary."""
    report = classification_report(y_true, y_pred, output_dict=True)
    return report

def plot_report_tabel(report):
    """Draw the per-label rows of a classification report as a table figure.

    Args:
        report (dict): Mapping from label to a dict with 'precision', 'recall',
            'f1-score' and 'support'. The summary entries 'accuracy',
            'macro avg' and 'weighted avg' are left out.
    """
    summary_keys = ['accuracy', 'macro avg', 'weighted avg']

    # One row per label; ratios as percentages, support as a whole number.
    rows = [
        [
            label,
            "{:.2%}".format(stats['precision']),
            "{:.2%}".format(stats['recall']),
            "{:.2%}".format(stats['f1-score']),
            int(stats['support']),
        ]
        for label, stats in report.items()
        if label not in summary_keys
    ]
    report_df = pd.DataFrame(rows, columns=['Label', 'Precision', 'Recall', 'F1-Score', 'Support'])

    # Render the frame as a bare table (axes hidden).
    _, axes = plt.subplots(figsize=(10, 2))
    axes.axis('off')
    axes.table(cellText=report_df.values, colLabels=report_df.columns, loc='center')
    plt.show()

def evaluate_results(results):
    """Score predictions against targets and produce summary metrics and plots.

    Args:
        results: Iterable of 6-tuples
            ``(target_parent, target_child, pred_parent, pred_child, pred_str, prompt)``.
            ``pred_str`` is the raw model output; ``single_inference`` puts the
            raised exception there when a sample fails.

    Returns:
        tuple: ``(eval_matrix, res_df)``. ``eval_matrix`` is a dict with the keys
        'result' (match rates), 'report' (classification report on parent
        labels) and 'confusion_matrix'. ``res_df`` has one row per prediction
        with the columns Target_Parent, Target_Child, Pred_Parent, Pred_Child,
        Pred, Result and Prompt.

    Notes:
        A prediction is a 'perfect_match' when parent and child agree, a
        'partial_match' when only the parent agrees, and 'no_match' otherwise.
        Perfect matches also count towards the partial match rate.
    """
    eval_res = []
    error_count = 0
    perfect_match_count = 0
    partial_match_count = 0

    for tgt_parent_label, tgt_child_label, pred_parent_label, pred_child_label, pred_str, prompt in results:
        # A failed inference carries an Exception object, which is truthy; the
        # old `if pred_str:` check therefore scored failures as no_match (or
        # even as matches for 'not_applicable' targets) and never as errors.
        if not (isinstance(pred_str, str) and pred_str):
            match_res = 'error'
            error_count += 1
        elif tgt_parent_label != pred_parent_label:
            match_res = 'no_match'
        else:
            partial_match_count += 1
            if tgt_child_label == pred_child_label:
                perfect_match_count += 1
                match_res = 'perfect_match'
            else:
                match_res = 'partial_match'

        eval_res.append([tgt_parent_label, tgt_child_label, pred_parent_label, pred_child_label, pred_str, match_res, prompt])

    total = len(results)

    perfect_match_rate_all = perfect_match_count / total
    partial_match_rate_all = partial_match_count / total

    valid_count = total - error_count
    if valid_count != 0:
        perfect_match_rate_valid = perfect_match_count / valid_count
        partial_match_rate_valid = partial_match_count / valid_count
    else:
        perfect_match_rate_valid = 0
        partial_match_rate_valid = 0

    print(f"Total predictions: {total}")
    print(f"Error predictions: {error_count}")
    print(f"Valid predictions: {valid_count}")

    print(f"Perfect match rate (all): {perfect_match_rate_all * 100}%")
    print(f"Partial match rate (all): {partial_match_rate_all * 100}%")

    print(f"Perfect match rate (valid): {perfect_match_rate_valid * 100}%")
    print(f"Partial match rate (valid): {partial_match_rate_valid * 100}%")

    res_df = pd.DataFrame(eval_res, columns=['Target_Parent', 'Target_Child', 'Pred_Parent', 'Pred_Child', 'Pred', 'Result', 'Prompt'])
    y_true = res_df['Target_Parent']
    y_pred = res_df['Pred_Parent']

    eval_matrix = {
        'result': {
            'Perfect match rate (all)': perfect_match_rate_all,
            'Partial match rate (all)': partial_match_rate_all,
            'Perfect match rate (valid)': perfect_match_rate_valid,
            'Partial match rate (valid)': partial_match_rate_valid
        },
        'report': get_classification_report(y_true, y_pred),
        'confusion_matrix': get_confusion_matrix(y_true, y_pred)
    }

    plot_report_tabel(eval_matrix['report'])
    # Same label expression as get_confusion_matrix so ticks line up with the matrix.
    plot_confusion_matrix(eval_matrix['confusion_matrix'], list(set(y_true)))
    return eval_matrix, res_df


In [None]:
# save results
def save_dict_to_json(dict_data, file_path):
    """Write a dictionary to a JSON file, leaving out non-serialisable helpers.

    The entries 'tfidf_matrix' and 'vectorizer' are omitted from the output.
    The caller's dictionary is not modified (it used to be: the two keys were
    popped from it, which stripped them from the live config after saving).

    Args:
        dict_data (dict): Data to save.
        file_path (str): Destination path.

    Returns:
        bool: True on success. On failure the error is printed and False is
        returned.
    """
    excluded_fields = ('tfidf_matrix', 'vectorizer')
    try:
        serialisable = {
            key: value for key, value in dict_data.items() if key not in excluded_fields
        }
        # Serialise before opening the file so a failure cannot leave a
        # truncated or half-written JSON file behind.
        payload = json.dumps(serialisable, indent=4)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(payload)
        return True
    except Exception as e:
        # Best-effort by contract: report and signal failure instead of raising.
        print(f"An error occurred while saving the dictionary data to JSON file: {e}")
        return False

def save_evaluation_results(training_set, eval_matrix, res_df, config):
    """Persist one evaluation run: result sheet, config and metrics.

    A timestamped folder is created below ``config['results_folder']``. It
    receives an Excel file with the evaluated samples next to their results,
    plus JSON dumps of ``config`` and ``eval_matrix``.

    Args:
        training_set (DataFrame): The evaluated samples, in the order in which
            they were predicted.
        eval_matrix (dict): Metrics returned by ``evaluate_results``.
        res_df (DataFrame): Per-sample results returned by ``evaluate_results``.
        config (dict): Run config. Reads ``model_name``, ``evaluation_method``,
            ``shots_selects_method``, ``elf_filter`` and ``results_folder``.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Process evaluation results
    # concat(axis=1) aligns on the index. `res_df` is numbered 0..n-1 while a
    # sampled `training_set` may keep its original row labels, so both indexes
    # are reset to pair the rows by position.
    res_df = pd.concat(
        [training_set.reset_index(drop=True), res_df.reset_index(drop=True)],
        axis=1,
    )

    res_name = f"{config['model_name']}_{config['evaluation_method']}_{config['shots_selects_method']}_{config['elf_filter']}_{timestamp}"
    folder_path = os.path.join(config['results_folder'], res_name)
    os.makedirs(folder_path, exist_ok=True)

    file_path = os.path.join(folder_path, f'eval_results_{res_name}.xlsx')
    res_df.to_excel(file_path, index=False, engine='xlsxwriter')

    # Save config
    save_dict_to_json(config, os.path.join(folder_path, f'config_{res_name}.json'))

    # Save evaluation matrix
    save_dict_to_json(eval_matrix, os.path.join(folder_path, f'eval_matrix_{res_name}.json'))
    print(f'Evaluation results save to {folder_path}')


def save_evaluation_results_k_folds(folds, eval_matrix_list, eval_res_list, config):
    """Persist a k-fold evaluation: combined result sheet, config and metrics.

    Args:
        folds (list[DataFrame]): The evaluated samples of each fold.
        eval_matrix_list (list[dict]): Per-fold metrics. Either the dict
            returned by ``evaluate_results`` (rates under the 'result' key) or
            a flat dict of rates.
        eval_res_list (list): Per-fold results. Either the DataFrame returned
            by ``evaluate_results`` or raw rows with the legacy five columns
            Target, Pred, Processed_Pred, Result, Prompt.
        config (dict): Run config. Reads ``model_name``, ``evaluation_method``,
            ``shots_selects_method`` and ``results_folder``.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    folder_path = os.path.join(config['results_folder'], f"{config['model_name']}_{config['evaluation_method']}_{config['shots_selects_method']}_{timestamp}")
    os.makedirs(folder_path, exist_ok=True)

    fold_frames = []
    eval_matrix = {}
    for i in range(len(folds)):
        # Process evaluation results
        fold_eval = eval_res_list[i]
        if not isinstance(fold_eval, pd.DataFrame):
            fold_eval = pd.DataFrame(fold_eval, columns=['Target', 'Pred', 'Processed_Pred', 'Result', 'Prompt'])
        # A DataFrame from evaluate_results is used as-is: re-wrapping it with
        # the legacy column list dropped Target_Parent/Pred_Parent/... and left
        # all-NaN columns. Indexes are reset so rows pair up by position.
        fold_res_df = pd.concat(
            [folds[i].reset_index(drop=True), fold_eval.reset_index(drop=True)],
            axis=1,
        )
        fold_res_df['fold'] = i
        fold_frames.append(fold_res_df)
        eval_matrix[f'fold_{i}'] = eval_matrix_list[i]

    res_df = pd.concat(fold_frames, axis=0) if fold_frames else pd.DataFrame()

    # Compute the average acc
    # evaluate_results nests the rates under 'result'; reading them from the
    # top level raised KeyError. Flat dicts are still accepted.
    fold_rates = [matrix.get('result', matrix) for matrix in eval_matrix_list]
    rate_names = (
        'Perfect match rate (all)',
        'Partial match rate (all)',
        'Perfect match rate (valid)',
        'Partial match rate (valid)',
    )
    eval_matrix["average"] = {
        name: np.mean([rates[name] for rates in fold_rates]) for name in rate_names
    }

    file_path = os.path.join(folder_path, f'eval_results_{timestamp}.xlsx')
    res_df.to_excel(file_path, index=False, engine='xlsxwriter')

    # Save config
    save_dict_to_json(config, os.path.join(folder_path, f'config_{timestamp}.json'))

    # Save evaluation matrix
    save_dict_to_json(eval_matrix, os.path.join(folder_path, f'eval_matrix_{timestamp}.json'))
    print(f'Evaluation results save to {folder_path}')


# Start the Process

In [None]:

def inference_evaluation_process(samples_df, shots_df, config):
    """Predict every row of ``samples_df`` and pair it with its target labels.

    Uses the global ``elf_df`` as the failure-mode table. One progress line is
    printed per sample.

    Args:
        samples_df (DataFrame): Rows to label; needs ``Parent_Label`` and
            ``Child_Label`` columns for the targets.
        shots_df (DataFrame): Source of few-shot examples.
        config (dict): Inference config, passed to ``single_inference``.

    Returns:
        list[tuple]: One tuple per sample:
        ``(target_parent, target_child, pred_parent, pred_child, raw_prediction, prompt)``.
    """
    predictions = []
    for row_number in range(len(samples_df)):
        sample = samples_df.iloc[row_number]

        # single_inference reports failures through its return value, so no
        # extra exception handling is needed here.
        raw_prediction, prompt = single_inference(sample, elf_df, shots_df, config)
        pred_parent, pred_child = extract_prediction_labels(raw_prediction)

        target_parent = sample['Parent_Label']
        target_child = sample['Child_Label']
        print(row_number, target_parent, target_child, pred_parent, pred_child)
        predictions.append(
            (target_parent, target_child, pred_parent, pred_child, raw_prediction, prompt)
        )
    return predictions

def trigger_inference_evaluation_process_single(df, config):
    """Run one sample/predict/evaluate/save cycle on ``df``.

    Args:
        df (DataFrame): Full labelled dataset.
        config (dict): Run config. When ``shots_selects_method`` is 'similar',
            the fitted ``vectorizer`` and ``tfidf_matrix`` are stored in it.
    """
    # Split into the rows to evaluate and the rows left over as few-shot pool.
    training_set, remaining_df = select_samples_from_label_groups(df, config)

    print('Training Data Quality: ')
    print(evaluate_dataset_quality(training_set, config['input_fields']))

    # Similarity-based shot selection needs TF-IDF vectors of the shot pool.
    if config['shots_selects_method'] == 'similar':
        config['vectorizer'], config['tfidf_matrix'] = compute_tfidf_matrix(
            remaining_df, config['input_fields']
        )

    print('Start the Inference Process ')
    predictions = inference_evaluation_process(training_set, remaining_df, config)

    print('Evaluation Results: ')
    eval_matrix, eval_res = evaluate_results(predictions)

    save_evaluation_results(training_set, eval_matrix, eval_res, config)

def trigger_inference_evaluation_process_k_folds(df, config):
    """Run the predict/evaluate cycle on several disjoint folds and save all.

    Args:
        df (DataFrame): Full labelled dataset.
        config (dict): Run config. Reads ``fold_sample_size`` and ``folds_num``;
            when ``shots_selects_method`` is 'similar', the fitted
            ``vectorizer`` and ``tfidf_matrix`` are stored in it.
    """
    folds, remaining_df = create_non_intersecting_sample_sets(
        df, config['fold_sample_size'], config['folds_num']
    )

    # Similarity-based shot selection needs TF-IDF vectors of the shot pool.
    if config['shots_selects_method'] == 'similar':
        config['vectorizer'], config['tfidf_matrix'] = compute_tfidf_matrix(
            remaining_df, config['input_fields']
        )

    eval_matrix_list = []
    eval_res_list = []
    for fold_number, fold in enumerate(folds):
        print(f'##################### Fold {fold_number}##########################3')
        print('Training Data Quality: ')
        print(evaluate_dataset_quality(fold, config['input_fields']))

        print('Start the Inference Process ')
        predictions = inference_evaluation_process(fold, remaining_df, config)

        print('Evaluation Results: ')
        fold_matrix, fold_results = evaluate_results(predictions)
        eval_matrix_list.append(fold_matrix)
        eval_res_list.append(fold_results)

    save_evaluation_results_k_folds(folds, eval_matrix_list, eval_res_list, config)

def trigger_inference_evaluation_process(config):
    """Entry point: load the dataset and run the configured evaluation.

    Args:
        config (dict): Run config. ``evaluation_method == 'single'`` runs one
            evaluation; any other value runs the k-fold variant.
    """
    print(config)
    full_training_data = load_full_dataset(config)

    print('Training Data (Full Set) Quality: ')
    print(evaluate_dataset_quality(full_training_data, config['input_fields']))

    run_single = config['evaluation_method'] == 'single'
    runner = (
        trigger_inference_evaluation_process_single
        if run_single
        else trigger_inference_evaluation_process_k_folds
    )
    runner(full_training_data, config)


In [None]:
# luminous-base-control-20240215
# luminous-extended-control-20240215
# luminous-supreme-control-20240215


def _build_labelling_config(**overrides):
    """Return a fresh run config; keyword arguments replace individual settings.

    Every call creates new list objects, so the configs never share mutable
    values. Key order is fixed by the template below.
    """
    settings = {
        'training_data_folder': r'.\training_data',
        'training_data_name': 'preprocessed_training_data_20240521_134540.pkl',
        'training_data_size': 300,
        'training_data_random_state': 0,
        'evaluation_method': 'single',  # ['folds', 'single']
        'folds_num': 2,
        'fold_sample_size': 100,
        'model_type': 'mistral',
        'model_name': 'Mistral-7B-Instruct',
        'elf_filter': ['tech'],  # ['keywords', 'tech'], None
        'shots_num': 5,
        'shots_selects_method': 'similar',  # ['random', 'manual', 'similar']
        'shots_indexes': [0, 6, 20],
        'random_state': 1,
        'max_tokens': 20,
        'max_prompt_len': 8000,
        'temperature': 0.1,
        'input_fields': ['Processed_JobComment', 'Processed_JobSummary'],
        'target_field': 'ElPred',
        'results_folder': r'.\labelling_results',
    }
    settings.update(overrides)
    return settings


ALPHA_CONFIG = _build_labelling_config(
    training_data_size=10,
    folds_num=5,
    fold_sample_size=300,
    model_type='alpha',
    model_name='luminous-base',
    shots_num=3,
    max_prompt_len=800,
)

MISTRAL_CONFIG = _build_labelling_config()

# NOTE(review): apart from the sample size this equals MISTRAL_CONFIG,
# including model_type 'mistral' - confirm whether 'mixtral' was intended.
MIXTRAL_CONFIG = _build_labelling_config(training_data_size=150)

In [None]:
%%time
# Run the whole pipeline (load, sample, predict, evaluate, save) with MIXTRAL_CONFIG.
trigger_inference_evaluation_process(MIXTRAL_CONFIG)


# Evaluate Process

In [None]:
# Settings for the step-by-step evaluation in the cells below.
config = dict(
    training_data_folder=r'.\training_data',
    training_data_name='preprocessed_training_data_20240521_134540.pkl',
    training_data_size=150,
    training_data_random_state=0,
    evaluation_method='single',  # ['folds', 'single']
    folds_num=2,
    fold_sample_size=100,
    model_type='mixtral',
    model_name='llama3-70b',
    elf_filter=[],  # ['keywords', 'tech'], None
    shots_num=8,
    shots_selects_method='similar',  # ['random', 'manual', 'similar']
    shots_indexes=[0, 6, 20],
    random_state=1,
    max_tokens=20,
    max_prompt_len=12000,
    temperature=0.1,
    input_fields=['Processed_JobComment', 'Processed_JobSummary'],
    target_field='ElPred',
    results_folder=r'.\labelling_results',
)

In [None]:
# Load the dataset described by `config` and print its quality summary.
full_training_data = load_full_dataset(config)
quality_results = evaluate_dataset_quality(full_training_data, config['input_fields'])
print(quality_results)  

In [None]:
# Number of distinct values in the 'Formatted_Label' column.
len(full_training_data['Formatted_Label'].value_counts())

In [None]:
# Split the data: `training_set` is evaluated below, `remaining_df` is passed on as the few-shot pool.
training_set, remaining_df = select_samples_from_label_groups(full_training_data, config)


In [None]:

# Quality summary of the sampled rows only.
quality_results = evaluate_dataset_quality(training_set, config['input_fields'])
print(quality_results)

In [None]:
# Compute TF-IDF vectors for the combined text and existing samples
# and keep them in `config`, where single_inference reads them for 'similar' shot selection.
if config['shots_selects_method'] == 'similar':
    vectorizer, tfidf_matrix = compute_tfidf_matrix(remaining_df, config['input_fields'])
    config['vectorizer'] = vectorizer
    config['tfidf_matrix'] = tfidf_matrix

In [None]:
# Predict every row of `training_set`, drawing few-shot examples from `remaining_df`.
predictions = inference_evaluation_process(training_set, remaining_df, config)
    

In [None]:
# Score the predictions, then write the result sheet, config and metrics to disk.
eval_matrix, res_df = evaluate_results(predictions)
save_evaluation_results(training_set, eval_matrix, res_df, config)

In [None]:
# classification_report(
#     res_df['Target_Parent'], 
#     res_df['Pred_Parent'].fillna(value='not_applicable'), output_dict = True)


# def evaluation_matrix(y_true, y_pred):
#     matrix = precision_recall_fscore_support(
#     res_df['Target_Parent'], 
#     res_df['Pred_Parent'].fillna(value='not_applicable'),
#     labels=list(elf_parents_df[elf_parents_df['Parsed_Label'] != 'probing']['Parsed_Label']))
# print()

# Testing

In [None]:
# Load the pickled training set named in ALPHA_CONFIG.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
training_data_path = os.path.join(ALPHA_CONFIG['training_data_folder'], ALPHA_CONFIG['training_data_name'])
full_set = pd.read_pickle(training_data_path)

In [None]:
elf_df

In [None]:
# Column summary restricted to the TRAIN_SET_COLS columns.
full_set[TRAIN_SET_COLS].info()

In [None]:
full_set['Projectnr']

In [None]:
# Quality summary of the raw set; the third argument lists the language columns of the two text fields.
quality_results = evaluate_dataset_quality(full_set, ['Processed_JobSummary', 'Processed_JobComment'], ["Processed_JobSummary_Lang", "Processed_JobComment_Lang"])
quality_results

In [None]:
# Same summary after running process_dataset on the raw set.
processed_full_set = process_dataset(full_set, ALPHA_CONFIG)
quality_results = evaluate_dataset_quality(processed_full_set, ['Processed_JobSummary', 'Processed_JobComment'], ["Processed_JobSummary_Lang", "Processed_JobComment_Lang"])
quality_results

In [None]:
processed_full_set.info()

In [None]:
# Sum of the 'Is_Valid_Label' column (the number of valid rows if the column is boolean - TODO confirm).
sum(processed_full_set['Is_Valid_Label'])       

In [None]:
# Same quality summary for the pickled unlabelled data set.
unlabelled_df = pd.read_pickle(os.path.join(ALPHA_CONFIG['training_data_folder'], "preprocessed_unlabelled_data_mc_20240521_134540.pkl"))
quality_results = evaluate_dataset_quality(unlabelled_df, ['Processed_JobSummary', 'Processed_JobComment'], ["Processed_JobSummary_Lang", "Processed_JobComment_Lang"])
quality_results

In [None]:
unlabelled_df.info()

In [None]:
# NOTE(review): `combined_labels` and `label_counts` are assigned in cells further down;
# the cells below only work after those have been run.
combined_labels

In [None]:
# Zero-count rows for every combined label that does not occur in `label_counts`.
df_unlabelled = pd.DataFrame([[label, 0] for label in combined_labels if label not in label_counts.index]).set_index(0)
df_unlabelled

In [None]:
label_counts

In [None]:
# Give the zero-count rows the same column name as the counts, then stack both frames.
df_unlabelled.columns = pd.DataFrame(label_counts).columns
df_unlabelled
label_counts_df = pd.concat([pd.DataFrame(label_counts), df_unlabelled], axis=0)
label_counts_df

In [None]:
# Bar chart of how often each 'Formatted_Label' occurs in the sampled training set.
label_counts = training_set['Formatted_Label'].value_counts()

# Print the element counts
# print(label_counts)
print(len(label_counts))

plt.figure(figsize=(20, 6))
# value_counts() returns a Series indexed by label, so it is plotted directly;
# `label_counts['count']` looked up a label called 'count' and raised KeyError.
bars = label_counts.plot(kind='bar', color='skyblue')
plt.title('ElPred Categories and Counts (Formatted_Label)')
plt.xlabel('ElPred Categories')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
# Add annotations above each bar
for bar in bars.patches:
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), bar.get_height(), ha='center', va='bottom')
plt.show()

In [None]:
# Number of distinct 'Formatted_Label' values in the processed full set.
len(set(processed_full_set['Formatted_Label']))

In [None]:
# Bar chart of the 'Formatted_Label' frequencies in the sampled training set.
label_counts = training_set['Formatted_Label'].value_counts()

# Print the element counts
# print(label_counts)
print(len(label_counts))

plt.figure(figsize=(20, 6))
bars = label_counts.plot(kind='bar', color='skyblue')
plt.title('ElPred Categories and Counts (Formatted label)')
plt.xlabel('ElPred Categories')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
# Add annotations above each bar
for bar in bars.patches:
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), bar.get_height(), ha='center', va='bottom')
plt.show()

In [None]:
# Frequencies of the individual labels: explode() gives each element of 'ElPred_List' its own row before counting.
label_counts = processed_full_set['ElPred_List'].explode().value_counts()

# Print the element counts
# print(label_counts)
print(len(label_counts))


plt.figure(figsize=(20, 6))
bars = label_counts.plot(kind='bar', color='skyblue')
plt.title('ElPred Label Categories and Counts')
plt.xlabel('ElPred Label Categories')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
# Add annotations above each bar
for bar in bars.patches:
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), bar.get_height(), ha='center', va='bottom')
plt.show()

In [None]:
# Join the configured input fields of the first row into one text.
sample_text =  " ".join([full_set[field].iloc[0] for field in ALPHA_CONFIG['input_fields']])
sample_text


In [None]:
# NOTE(review): `remaining_set` is not assigned in the visible cells (the evaluation cells use `remaining_df`) - confirm.
vectorizer, tfidf_matrix = compute_tfidf_matrix(remaining_set, ALPHA_CONFIG['input_fields'])


In [None]:
# Pick the row with index label 150 (.loc selects by label, not by position).
sample = training_set[['Projectnr','Processed_JobComment','Processed_JobSummary', 'ElPred', 'TechGroup']].loc[150]
sample

In [None]:
sample['Processed_JobSummary']

In [None]:
# Build the query text the same way single_inference does.
input_text = ' '.join([sample[col] for col in ALPHA_CONFIG['input_fields']])
input_text

In [None]:
# The five most similar rows for the query text, with their 'Similarity' column.
select_similar_samples(input_text, remaining_set, tfidf_matrix, vectorizer, 5)[['Projectnr','Processed_JobComment','Processed_JobSummary', 'ElPred', 'Similarity']]


In [None]:

# df = pd.read_csv("data.csv")

In [None]:
get_all_tech_groups()

In [None]:
# print(create_prompt_labelling_alpha(processed_training_data.iloc[[0, 6, 15]], 'insert job comment here'))
# NOTE(review): the other call sites pass four arguments (shots, elf table, text, config);
# this call passes three and uses `training_config`, which is not assigned in the visible cells - likely stale.
print(create_labelling_prompt_alpha(select_random_samples(processed_full_set, training_config['shots_num'], training_config['random_state'] ),
                                    processed_full_set.iloc[0]['Processed_JobSummary'], 
                                    training_config))

In [None]:
# Rows whose 'ElPred' is exactly 'undefined;'.
full_set[full_set['ElPred']== 'undefined;']

In [None]:
# The same check on the output of validate_labels.
valid_full_set = validate_labels(full_set)
valid_full_set[valid_full_set['ElPred']== 'undefined;']


In [None]:
full_set.columns


In [None]:
full_set['TechGroup_2']

In [None]:
# full_set['TechGroup_2'].info()
# Summary of the rows that have a non-null 'TechGroup_2'.
full_set.dropna(subset=['TechGroup_2']).info()

In [None]:
get_all_tech_groups()


In [None]:
# Try the keyword filter on a hand-written text.
filter_elf_by_keywords('scan;scan test;	', elf_df)


In [None]:

    
# NOTE(review): this frame is named `elf_parent_df` (singular); other cells use `elf_parents_df` - confirm they are meant to differ.
elf_parent_df = find_elf_groups(graph)

In [None]:
elf_parent_df

In [None]:
# find_elf_children(graph, ['ElFaultPin'])['Name']
# 'Name' column of the children found for all parent names.
find_elf_children(graph,list(elf_parent_df['Name']))['Name']

In [None]:
find_all_elf(graph)

In [None]:
parent_df = find_elf_parent_nodes(graph)


In [None]:
def combine_elf_label(parent_df):
    """Build a table of parent/child label pairs and their combined label.

    For every name in ``parent_df['Name']`` the children are looked up with
    ``find_elf_children`` on the global ``graph``. Each child row gets the
    parent's ``Parsed_Label`` and a ``Combined_Label`` of the form
    ``"<parent>;<child>"``. All string values are lower-cased.

    Args:
        parent_df (DataFrame): Needs the columns ``Name`` and ``Parsed_Label``.

    Returns:
        DataFrame: Columns ``Parent_Label``, ``Children_Label`` and
        ``Combined_Label``. Empty (but with these columns) when there is
        nothing to combine; this case used to raise ``KeyError``.
    """
    output_columns = ['Parent_Label', 'Children_Label', 'Combined_Label']

    per_parent_frames = []
    for group in parent_df['Name']:
        parent_label = parent_df.loc[parent_df['Name'] == group, 'Parsed_Label'].iloc[0]

        # rename() returns a new frame, so the frame handed out by
        # find_elf_children is not modified.
        children_df = find_elf_children(graph, [group]).rename(columns={'Parsed_Label': 'Children_Label'})
        children_df['Parent_Label'] = parent_label
        # Column-wise concatenation gives the same text as the former row-wise
        # f-string and also works when a parent has no children.
        children_df['Combined_Label'] = str(parent_label) + ';' + children_df['Children_Label'].astype(str)
        per_parent_frames.append(children_df[output_columns])

    if not per_parent_frames:
        return pd.DataFrame(columns=output_columns)

    # Concatenate once instead of growing the frame inside the loop.
    combined_df = pd.concat(per_parent_frames, ignore_index=True)

    # Element-wise lower-casing without DataFrame.applymap, which is deprecated
    # in recent pandas versions.
    return combined_df.apply(
        lambda column: column.map(lambda x: x.lower() if isinstance(x, str) else x)
    )

# Build the parent/child label table.
# NOTE(review): the argument is `elf_parents_df`, while an earlier cell names its frame `elf_parent_df` - confirm which one is intended.
combined_df = combine_elf_label(elf_parents_df)


In [None]:


def find_elf_combined_labels(elf_df):
    """List, for every parent label, the parent and its "parent;child" labels.

    Args:
        elf_df (DataFrame): Needs the columns ``Type``, ``Parsed_Label`` and
            ``Children`` (an iterable of child labels per row).

    Returns:
        list[list[str]]: One list per row with ``Type == 'Parent'``: the parent
        label first, followed by ``"<parent>;<child>"`` for each child.
    """
    parent_rows = elf_df[elf_df['Type'] == 'Parent']
    return [
        [parent] + [parent + ';' + child for child in children]
        for parent, children in zip(parent_rows['Parsed_Label'], parent_rows['Children'])
    ]

In [None]:
# One list of labels per parent ...
combined_labels_groups = find_elf_combined_labels(elf_df)

# ... flattened into a single list.
combined_labels = [label for labels in combined_labels_groups for label in labels]
len(combined_labels)

In [None]:

# Load the full failure-mode table together with its parent-only and child-only views.
elf_df, elf_parents_df, elf_children_df = load_elf_df(graph)

In [None]:
elf_df.info()

In [None]:
elf_df.info()

In [None]:
elf_df

In [None]:
full_set.columns


In [None]:
# Render an example prompt: first five processed rows as shots, placeholder input text.
prompt = create_labelling_prompt_alpha(processed_full_set.head(), elf_df, 'input_text', ALPHA_CONFIG)


In [None]:
print(prompt)


In [None]:
import requests
import json

# Smoke test of the embedding endpoint: POST one fixed sentence and print the raw response body.
ALPHA_TOKEN = os.environ.get('ALPHA_TOKEN')
emb_url = os.environ.get('EMB_URL')
if not ALPHA_TOKEN or not emb_url:
    # Fail early with a clear message instead of a TypeError from 'token=' + None
    # or an invalid-URL error raised inside requests.
    raise RuntimeError(
        "Set the ALPHA_TOKEN and EMB_URL environment variables before running this cell."
    )

emb_model = "luminous-base"
payload = json.dumps({
    "model": emb_model,
    "prompt": "An apple a day keeps the doctor away.",
    "layers": [0, 1],
    "tokens": False,
    "pooling": ["max"],
    "type": "symmetric",
})
headers = {
    'Content-Type': 'application/json',
    'Accept': 'application/json',
    # The API token is sent as a cookie.
    'cookie': 'token=' + ALPHA_TOKEN,
}

# SECURITY NOTE(review): verify=False disables TLS certificate verification, so the
# token above can be exposed to a man-in-the-middle. Kept as in the original cell;
# prefer passing the CA bundle path via `verify=` if the endpoint uses an internal CA.
# The timeout keeps the cell from hanging indefinitely if the endpoint does not answer.
response = requests.post(emb_url, headers=headers, data=payload, verify=False, timeout=30)

print(response.text)

In [None]:
# Minimal settings for a quick completion call with a fixed dummy prompt.
model_config = {
    'model_name':'mixtral',
    'max_tokens':50,
    'temperature': 0.1
}
# NOTE(review): other cells in this notebook call `mixtral_completion`; confirm that
# `mistral_completion` is the intended function here.
mistral_completion('testing', model_config)

In [None]:
# Try the label parser on a hand-written two-line response whose child label is the text "None".
extract_prediction_labels('Parent label: currentconsumption\n\
Child label: None')

In [None]:
# Show the first row of elf_df whose Type is 'Child' (raises IndexError if there is none).
elf_df[elf_df['Type'] == 'Child'].iloc[0]

In [None]:
# Display the return value of get_all_tech_groups().
get_all_tech_groups()

In [None]:
# Look up the rows of full_set that belong to one specific project number.
full_set[full_set['Projectnr'] == 'MA23AP-02146']
# filter_elf_by_keywords()

In [None]:
# Display the result of filtering elf_df for the tech group 'TechGroupMixed'.
filter_elf_by_tech_group('TechGroupMixed', elf_df)


In [None]:
# Display elf_df as the cell output.
elf_df

# Demo

In [None]:
# Settings for the demo cells below; the dict is passed to the loading, sampling,
# prompt-building and inference helpers.
config = {
    # --- training data ---
    'training_data_folder': r'.\training_data',
    'training_data_name': 'preprocessed_training_data_20240521_134540.pkl',
    'training_data_size': 150,
    'training_data_random_state': 0,

    # --- evaluation ---
    'evaluation_method': 'single',  # options noted by the author: 'folds', 'single'
    'folds_num': 2,
    'fold_sample_size': 100,

    # --- model ---
    # NOTE(review): model_type is 'mixtral' while model_name is 'gpt-3.5-turbo';
    # confirm which of the two the helpers actually read.
    'model_type': 'mixtral',
    'model_name': 'gpt-3.5-turbo',

    # --- prompt construction ---
    'elf_filter': [],  # options noted by the author: 'keywords', 'tech', or None
    'shots_num': 5,
    'shots_selects_method': 'similar',  # options noted by the author: 'random', 'manual', 'similar'
    'shots_indexes': [0, 6, 20],
    'random_state': 1,
    'max_tokens': 20,
    'max_prompt_len': 8000,
    'temperature': 0.1,

    # --- data fields ---
    'input_fields': ['Processed_JobComment', 'Processed_JobSummary'],
    'target_field': 'ElPred',

    # --- output ---
    'results_folder': r'.\labelling_results',
}

In [None]:
# Load the dataset described by config and split it into the sampled training set
# and the remaining rows.
full_training_data = load_full_dataset(config)
training_set, remaining_df = select_samples_from_label_groups(full_training_data, config)
# Compute TF-IDF vectors for the combined text and existing samples
# (only needed when few-shot examples are selected by similarity). The fitted
# vectorizer and matrix are stored in config so later calls receiving config can use them.
if config['shots_selects_method'] == 'similar':
    vectorizer, tfidf_matrix = compute_tfidf_matrix(remaining_df, config['input_fields'])
    config['vectorizer'] = vectorizer
    config['tfidf_matrix'] = tfidf_matrix

In [None]:
# Pick the row whose index *label* is 80 (.loc is label-based, not positional;
# it raises KeyError if that label is not in training_set) and display it.
sample_input = training_set.loc[80]
sample_input


In [None]:
# Run one end-to-end inference for the selected sample. As indexed below,
# res[0] is the model response and res[1] is the prompt that was sent.
res = single_inference(sample_input, elf_df, remaining_df, config)
print('Model Response: ', res[0])
print('Predicted Labels: ', extract_prediction_labels(res[0]))
print('Prompt: ', res[1])

In [None]:
# filter elf
filtered_elf_df = filter_elf_by_tech_group(sample_input['Parsed_TechGroup'], elf_df)
filtered_elf_df


In [None]:
# Build a prompt against the filtered ELF table, then show the model completion next to the prompt.
# NOTE(review): the meaning of the trailing positional `True` is not visible here - confirm against
# create_labelling_prompt_mistral. An earlier cell calls `mistral_completion`; check which name is defined.
prompt = create_labelling_prompt_mistral(remaining_df, filtered_elf_df, 'input_text', config, True)
mixtral_completion(prompt, config), prompt

In [None]:
# Display full_set as the cell output.
full_set
