In [12]:
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
import re

import numpy as np

import warnings
warnings.filterwarnings('ignore')


class Classification():
    def __init__(
        self, 
        llm_model_name, 
        params
        ):
        self.llm_model_name = llm_model_name
        self.llm_model = OpenAI(temperature=0, model_name = self.llm_model_name)

        #self.llm_model = llm_model,
        self.columns_name = params['columns_name']

        self.model_code = None

    def train(self, x, y, model_name, file_path=None):
        print("> Start of model training.")
        df = x.copy()

        # yのデータをtargetとしてxのdataframeに加える
        df['target'] = y

        # 二値分類か多値分類かを判定
        if len(df['target'].unique()) == 2:
            task_type = 'binary classification'
            output_code = 'y = 1 / (1 + np.exp(-y))'
        else:
            task_type = 'multi-class classification'

        # データ型の取得
        data_type = ', '.join(df.dtypes.astype(str))



        # 文字列のdatasetを作成
        dataset = []
        for index, row in df.iterrows():
            row_as_str = [str(item) for item in row.tolist()]  # 各要素を文字列に変換
            dataset.append(','.join(row_as_str))

        # リスト全体を改行文字で結合
        dataset_str = '\n'.join(dataset)


        # ハイパーパラメータの設定
        if self.columns_name:
            col_name = ', '.join(df.columns.astype(str))

        else:
            col_name = None

        create_prompt = """
        Please create your code in compliance with all of the following conditions. Output should be code only. Do not enclose the output in ``python ``` or the like.
        ・Analyze the large amount of data below and create a {task_type_} code to accurately predict "target".
        ------------------
        {dataset_str_}
        ------------------
        ・Each data type is as follows. If necessary, you can change the data type.
        ・Create code that can make predictions about new data based on logic from large amounts of input data without using machine learning models.
        ・If input is available, the column names below should also be used to help make decisions when creating the predictive model. Column Name:{col_name_}
        ・Create a code like the following. Do not change the input or output format.
        ・Use {model_name_} for the function name.
        ・You do not need to provide examples.
        ------------------
        import numpy as np

        def model(x):
            df = x.copy()

            output = []
            for index, row in df.iterrows():


                # Feature creation and data preprocessing


                {output_code_}
                output.append(y)
                
            return output
        """.format(
            task_type_ = task_type,
            dataset_str_ = dataset_str,
            model_name_ = model_name,
            col_name_ = col_name,
            output_code_ = output_code
            )

        #print(create_prompt)

        with get_openai_callback() as cb:
            model_code = self.llm_model(create_prompt)
            print(model_code)
            print(cb)


        # Save to File
        if file_path != None:
            with open(file_path + f'{model_name}.py', mode='w') as file:
                file.write(model_code)


        self.model_code = model_code

        return model_code

    def predict(self, x):
        if self.model_code is None:
            raise Exception("You must train the model before predicting!")


        #code = self.model_code + '\n' + f'model = model({x})'
        exec(self.model_code, globals())

        #model = namespace["code"]
        
        y = model(x)

        return np.array(y)




    def interpret(self):
        if self.model_code is None:
            raise Exception("You must train the model before interpreting!")

        interpret_prompt = """
        Refer to the code below and explain how you are going to process the data and make predictions.
        The only part to explain is the part where the data is processed.
        Do not explain df = x.copy().
        Please output the data in bulleted form.
        Please tell us what you can say based on the whole process.
        ------------------
        {model_code_}
        """.format(
            model_code_ = self.model_code
        )

        with get_openai_callback() as cb:
            output = self.llm_model(interpret_prompt)
            print(cb)


        return output

# Training

In [13]:
import pandas as pd

from langchain.llms import OpenAI

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [14]:
df = pd.read_csv('../data/learning.csv')
x_train = df.drop('survived', axis=1)
y_train = df['survived']

In [15]:
llm_model_name = 'gpt-4'

params = {'columns_name': True}

ibl = Classification(llm_model_name=llm_model_name, params=params)

In [16]:
model = ibl.train(x_train, y_train, model_name = 'titanic', file_path='./')

> Start of model training.
import numpy as np
import pandas as pd

def titanic(data):
    df = pd.DataFrame(data, columns=["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "Deck", "Title", "Alone", "Cabin", "Embark_town", "Alive", "IsAlone", "Target"])

    output = []
    for index, row in df.iterrows():

        # Feature creation and data preprocessing
        sex = 1 if row["Sex"] == "male" else 0
        age = row["Age"]
        if pd.isna(age):
            age = 30
        fare = row["Fare"]
        if pd.isna(fare):
            fare = 14
        pclass = row["Pclass"]
        sibsp = row["SibSp"]
        parch = row["Parch"]
        embarked = 0
        if row["Embarked"] == "C":
            embarked = 1
        elif row["Embarked"] == "Q":
            embarked = 2

        # Prediction logic
        y = -1.5 + 0.1 * sex - 0.02 * age + 0.15 * fare - 0.5 * pclass + 0.3 * sibsp - 0.1 * parch + 0.2 * embarked

        y = 1 / (1 + np.exp(-y))
        output.append(y)
 

## Prediction

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [18]:
df = pd.read_csv('../data/pred.csv')
x_test = df.drop('survived', axis=1)
y_test = df['survived']

In [19]:
y_proba = ibl.predict(x_test)
y_pred = (y_proba > 0.5).astype(int)

TypeError: 'str' object is not callable

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1}')

# ROC-AUC (you need prediction probabilities for this, not just class predictions)
# Here we just reuse y_pred for simplicity
roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.41
Precision: 0.37894736842105264
Recall: 1.0
F1 score: 0.549618320610687
ROC-AUC: 0.9309895833333334


## Prediction from external files


In [99]:
from model import model

y_proba = model(x_test)


## Interpretation of results

In [100]:
description = ibl.interpret()

Tokens Used: 1065
	Prompt Tokens: 552
	Completion Tokens: 513
Successful Requests: 1
Total Cost (USD): $0.04733999999999999


In [101]:
print(description)

The data processing and prediction steps in the given code can be explained as follows:

- Preprocessing:
    - Convert the 'sex' column to numerical values: male = 0, female = 1
    - Fill missing values in the 'age' column with the median age
    - Fill missing values in the 'fare' column with the median fare
    - Convert the 'embarked' column to numerical values: S = 0, C = 1, Q = 2
    - Convert the 'class' column to numerical values: First = 1, Second = 2, Third = 3
    - Convert the 'who' column to numerical values: man = 0, woman = 1, child = 2
    - Convert the 'adult_male' column to integer values
    - Convert the 'alone' column to integer values

- For each row in the DataFrame:
    - Extract the values of the following features: pclass, sex, age, sibsp, parch, fare, embarked, who, adult_male, alone
    - Initialize the prediction variable 'y' to 0
    - Apply the prediction logic:
        - If the passenger is female, add 1 to 'y'
        - If the passenger is in first cla