In [59]:
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
import re

import numpy as np

import warnings
warnings.filterwarnings('ignore')


class Classifier():
    def __init__(
        self, 
        llm_model_name, 
        params
        ):
        self.llm_model_name = llm_model_name
        self.llm_model = OpenAI(temperature=0, model_name = self.llm_model_name)

        #self.llm_model = llm_model,
        self.columns_name = params['columns_name']

        self.model_code = None

    def fit(self, x, y, model_name, file_path=None):
        print("> Start of model creating.")
        df = x.copy()

        df['target'] = y

        # Determine whether binary or multivalued classification is used
        if len(df['target'].unique()) == 2:
            task_type = 'binary classification'
            output_code = 'y = 1 / (1 + np.exp(-y))'
        else:
            task_type = 'multi-class classification'

        # Obtaining data types
        data_type = ', '.join(df.dtypes.astype(str))



        # Create a string dataset
        dataset = []
        for index, row in df.iterrows():
            row_as_str = [str(item) for item in row.tolist()] 
            dataset.append(','.join(row_as_str))
        dataset_str = '\n'.join(dataset)


        # column name
        if self.columns_name:
            col_name = ', '.join(df.columns.astype(str))
            col_option = ''

        else:
            # serial number
            df.columns = range(df.shape[1])
            col_name = ', '.join(df.columns.astype(str))
            col_option = 'df.columns = range(df.shape[1])'



        create_prompt = """
        Please create your code in compliance with all of the following conditions. Output should be code only. Do not enclose the output in ``python ``` or the like.
        ・Analyze the large amount of data below and create a {task_type_} code to accurately predict "target".
        ------------------
        {dataset_str_}
        ------------------
        ・Each data type is as follows. If necessary, you can change the data type.
        ・Create code that can make predictions about new data based on logic from large amounts of input data without using machine learning models.
        ・If input is available, the column names below should also be used to help make decisions when creating the predictive model. Column Name:{col_name_}
        ・Create a code like the following. Do not change the input or output format.
        ・If {col_option_} is not blank, add it after 'df = x.copy()'.
        ・You do not need to provide examples.
        ------------------
        import numpy as np

        def predict(x):
            df = x.copy()

            output = []
            for index, row in df.iterrows():


                # Feature creation and data preprocessing


                {output_code_}
                output.append(y)

            output = np.array(output)
                
            return output
        """.format(
            task_type_ = task_type,
            dataset_str_ = dataset_str,
            model_name_ = model_name,
            col_name_ = col_name,
            col_option_ = col_option,
            output_code_ = output_code
            )

        #print(create_prompt)

        with get_openai_callback() as cb:
            model_code = self.llm_model(create_prompt)
            print(cb)


        # Save to File
        if file_path != None:
            with open(file_path + f'{model_name}.py', mode='w') as file:
                file.write(model_code)


        self.model_code = model_code

        return model_code

    def predict(self, x):
        if self.model_code is None:
            raise Exception("You must train the model before predicting!")

        code = self.model_code

        # = re.search(r'def (\w+)', function_string).group(1)
        #code = self.model_code + '\n'# + f'model = model({x})'
        exec(code, globals())

        #model = namespace["code"]
        
        y = predict(x)

        return y




    def interpret(self):
        if self.model_code is None:
            raise Exception("You must train the model before interpreting!")

        interpret_prompt = """
        Refer to the code below and explain how you are going to process the data and make predictions.
        The only part to explain is the part where the data is processed.
        Do not explain df = x.copy().
        Please output the data in bulleted form.
        Please tell us what you can say based on the whole process.
        ------------------
        {model_code_}
        """.format(
            model_code_ = self.model_code
        )

        with get_openai_callback() as cb:
            output = self.llm_model(interpret_prompt)
            print(cb)


        return output

# Training

In [60]:
import pandas as pd

from langchain.llms import OpenAI

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [61]:
df = pd.read_csv('../datasets/learning.csv')
x_train = df.drop('survived', axis=1)
y_train = df['survived']

In [62]:
llm_model_name = 'gpt-4'

params = {'columns_name': False}

ibl = Classifier(llm_model_name=llm_model_name, params=params)

In [63]:
model = ibl.fit(x_train, y_train, model_name = 'titanic', file_path='./model_code/')

> Start of model creating.
Tokens Used: 3698
	Prompt Tokens: 3436
	Completion Tokens: 262
Successful Requests: 1
Total Cost (USD): $0.1188


In [64]:
print(model)

import numpy as np
import pandas as pd

def predict(x):
    df = x.copy()
    df.columns = range(df.shape[1])

    output = []
    for index, row in df.iterrows():

        # Feature creation and data preprocessing
        pclass = row[7]
        sex = row[1]
        age = row[2]
        fare = row[5]
        embarked = row[6]
        alone = row[13]

        # Prediction logic
        y = 0

        if pclass == 'First':
            y += 0.3
        elif pclass == 'Second':
            y += 0.1

        if sex == 'female':
            y += 0.35

        if age <= 16:
            y += 0.2
        elif age > 16 and age <= 32:
            y += 0.1

        if fare > 50:
            y += 0.1

        if embarked == 'C':
            y += 0.1

        if alone == 'True':
            y -= 0.1

        y = 1 / (1 + np.exp(-y))
        output.append(y)

    output = np.array(output)

    return output


## Prediction

In [69]:
df = pd.read_csv('../datasets/pred.csv')
x_test = df.drop('survived', axis=1)
y_test = df['survived']

In [70]:
y_proba = ibl.predict(x_test)
y_pred = (y_proba > 0.5).astype(int)

In [71]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1}')

# ROC-AUC (you need prediction probabilities for this, not just class predictions)
# Here we just reuse y_pred for simplicity
roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.54
Precision: 0.43902439024390244
Recall: 1.0
F1 score: 0.6101694915254238
ROC-AUC: 0.9097222222222222


## Prediction from external files


In [68]:
import titanic

y_proba = titanic.predict(x_test)
y_pred = (y_proba > 0.5).astype(int)


In [56]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1}')

# ROC-AUC (you need prediction probabilities for this, not just class predictions)
# Here we just reuse y_pred for simplicity
roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.83
Precision: 0.7435897435897436
Recall: 0.8055555555555556
F1 score: 0.7733333333333334
ROC-AUC: 0.9266493055555556


## Interpretation of results

In [57]:
description = ibl.interpret()

Tokens Used: 881
	Prompt Tokens: 537
	Completion Tokens: 344
Successful Requests: 1
Total Cost (USD): $0.036750000000000005


In [58]:
print(description)

- Data preprocessing:
    - Fill missing 'age' values with the median age.
    - Fill missing 'fare' values with the median fare.
    - Fill missing 'embarked' values with the mode (most frequent) of the 'embarked' column.

- Feature creation:
    - Create a new binary feature 'is_female' based on the 'sex' column.
    - Create a new binary feature 'is_child' based on the 'age' column.
    - Create a new binary feature 'is_adult_male' based on the 'adult_male' column.
    - Create a new binary feature 'is_alone' based on the 'alone' column.
    - Create new binary features 'is_first_class', 'is_second_class', and 'is_third_class' based on the 'pclass' column.
    - Create new binary features 'embarked_C', 'embarked_Q', and 'embarked_S' based on the 'embarked' column.

- Prediction logic:
    - Initialize a variable 'y' to 0.
    - Add or subtract weights to 'y' based on the created binary features.
    - Apply the logistic function (sigmoid) to 'y' to get the final prediction.
    - Ap