In [3]:
import os
import sys
import pandas as pd
import pyarrow.ipc as ipc
from dotenv import load_dotenv

# Load environment variables
load_dotenv("../.env")

# Load csv file and convert to pandas DataFrame
df = pd.read_csv(".\data\slm-labels.csv").head(100)

# Prepare test dataset
df_test_dataset = df.copy()
df_test_dataset.drop(columns=['image'], inplace=True)
df_test_dataset['user_input'] = df_test_dataset['analysis_gpt-4o-2024-11-20_v1class'].astype(str)
df_test_dataset['severity_correct'] = (df_test_dataset['severity'] == df_test_dataset['pred_severity_phi']).astype('int64')
df_test_dataset['classification_correct'] = (df_test_dataset['classification'] == df_test_dataset['pred_classification_phi']).astype('int64')
df_test_dataset.to_json('./test-dataset.json', orient='records', indent=4)

# Add source path and import Featurizer
sys.path.append("lmshap")
from genaishap import Featurizer
from openai import AzureOpenAI

# Create features using Featurizer
featurizer = Featurizer.from_pandas(df_test_dataset)

# Initialize Azure OpenAI client
client = AzureOpenAI(
    azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'],
    api_key=os.environ['OPENAI_API_KEY'],
    api_version=os.environ['OPENAI_API_VERSION'],
)

# Generate features using Azure OpenAI
featurizer.create_features_using_azure_openai(
    deployment_name=os.environ['DEPLOYMENT_NAME'],
    num_features=10
)
print(featurizer.features.model_dump_json(indent=4))

# Fill out features
featurizer.fill_out_features_using_azure_openai(
    deployment_name="gpt-4o",
    batch_size=5
)

# Export features to JSON
df_features = featurizer.to_pandas()
df_features.to_json('data/test-features.json', orient='records', indent=4)


  df = pd.read_csv(".\data\slm-labels.csv").head(100)


{
    "features": [
        {
            "feature": "there_is_any_train_station_identified_in_the_question",
            "ftype": "boolean"
        },
        {
            "feature": "list_of_train_station_features_identified_in_the_question",
            "ftype": "list_of_strings"
        },
        {
            "feature": "there_is_any_train_identified_in_the_question",
            "ftype": "boolean"
        },
        {
            "feature": "list_of_train_features_identified_in_the_question",
            "ftype": "list_of_strings"
        },
        {
            "feature": "there_is_any_platform_identified_in_the_question",
            "ftype": "boolean"
        },
        {
            "feature": "list_of_platform_features_identified_in_the_question",
            "ftype": "list_of_strings"
        },
        {
            "feature": "there_is_any_people_or_passengers_mentioned",
            "ftype": "boolean"
        },
        {
            "feature": "list_of_people_related

100%|██████████| 20/20 [01:02<00:00,  3.10s/it]
