In [70]:
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
import re

import numpy as np

import warnings
warnings.filterwarnings('ignore')


class Classification():
    def __init__(
        self, 
        llm_model_name, 
        params
        ):
        self.llm_model_name = llm_model_name
        self.llm_model = OpenAI(temperature=0, model_name = self.llm_model_name)

        #self.llm_model = llm_model,
        self.columns_name = params['columns_name']

        self.model_code = None

    def train(self, x, y, model_name, file_path=None):
        print("> Start of model training.")
        df = x.copy()

        df['target'] = y

        # Determine whether binary or multivalued classification is used
        if len(df['target'].unique()) == 2:
            task_type = 'binary classification'
            output_code = 'y = 1 / (1 + np.exp(-y))'
        else:
            task_type = 'multi-class classification'

        # データ型の取得
        data_type = ', '.join(df.dtypes.astype(str))



        # Create a string dataset
        dataset = []
        for index, row in df.iterrows():
            row_as_str = [str(item) for item in row.tolist()] 
            dataset.append(','.join(row_as_str))
        dataset_str = '\n'.join(dataset)


        # column name
        if self.columns_name:
            col_name = ', '.join(df.columns.astype(str))

        else:
            # serial number
            df.columns = range(df.shape[1])
            col_name = ', '.join(df.columns.astype(str))

        create_prompt = """
        Please create your code in compliance with all of the following conditions. Output should be code only. Do not enclose the output in ``python ``` or the like.
        ・Analyze the large amount of data below and create a {task_type_} code to accurately predict "target".
        ------------------
        {dataset_str_}
        ------------------
        ・Each data type is as follows. If necessary, you can change the data type.
        ・Create code that can make predictions about new data based on logic from large amounts of input data without using machine learning models.
        ・If input is available, the column names below should also be used to help make decisions when creating the predictive model. Column Name:{col_name_}
        ・Create a code like the following. Do not change the input or output format.
        ・Use {model_name_} for the function name.
        ・You do not need to provide examples.
        ------------------
        import numpy as np

        def model(x):
            df = x.copy()

            output = []
            for index, row in df.iterrows():


                # Feature creation and data preprocessing


                {output_code_}
                output.append(y)

            output = np.array(output)
                
            return output
        """.format(
            task_type_ = task_type,
            dataset_str_ = dataset_str,
            model_name_ = model_name,
            col_name_ = col_name,
            output_code_ = output_code
            )

        #print(create_prompt)

        with get_openai_callback() as cb:
            model_code = self.llm_model(create_prompt)
            print(cb)


        # Save to File
        if file_path != None:
            with open(file_path + f'{model_name}.py', mode='w') as file:
                file.write(model_code)


        self.model_code = model_code

        return model_code

    def predict(self, x):
        if self.model_code is None:
            raise Exception("You must train the model before predicting!")


        code = self.model_code + '\n' + f'model = model({x})'
        exec(self.model_code, globals())

        #model = namespace["code"]
        
        y = model(x)

        return np.array(y)




    def interpret(self):
        if self.model_code is None:
            raise Exception("You must train the model before interpreting!")

        interpret_prompt = """
        Refer to the code below and explain how you are going to process the data and make predictions.
        The only part to explain is the part where the data is processed.
        Do not explain df = x.copy().
        Please output the data in bulleted form.
        Please tell us what you can say based on the whole process.
        ------------------
        {model_code_}
        """.format(
            model_code_ = self.model_code
        )

        with get_openai_callback() as cb:
            output = self.llm_model(interpret_prompt)
            print(cb)


        return output

# Training

In [71]:
import pandas as pd

from langchain.llms import OpenAI

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [72]:
df = pd.read_csv('../data/learning.csv')
x_train = df.drop('survived', axis=1)
y_train = df['survived']

In [73]:
llm_model_name = 'gpt-4'

params = {'columns_name': True}

ibl = Classification(llm_model_name=llm_model_name, params=params)

In [74]:
model = ibl.train(x_train, y_train, model_name = 'titanic', file_path='./')

> Start of model training.
Tokens Used: 4053
	Prompt Tokens: 3411
	Completion Tokens: 642
Successful Requests: 1
Total Cost (USD): $0.14085


In [75]:
print(model)

import numpy as np
import pandas as pd

def titanic(data):
    # Convert input data to DataFrame
    data = [row.split(',') for row in data.split('\n')]
    columns = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone', 'target']
    df = pd.DataFrame(data, columns=columns)
    
    # Convert data types
    df['pclass'] = df['pclass'].astype(int)
    df['age'] = pd.to_numeric(df['age'], errors='coerce')
    df['sibsp'] = df['sibsp'].astype(int)
    df['parch'] = df['parch'].astype(int)
    df['fare'] = pd.to_numeric(df['fare'], errors='coerce')
    df['target'] = df['target'].astype(int)
    
    # Feature creation and data preprocessing
    df['sex'] = df['sex'].map({'male': 0, 'female': 1})
    df['embarked'] = df['embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    df['class'] = df['class'].map({'First': 1, 'Second': 2, 'Third': 3})
    df['who'] = df['who'].map({'man': 0, 'woman': 1, 'child': 2})
    df['adul

## Prediction

In [66]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [67]:
df = pd.read_csv('../data/pred.csv')
x_test = df.drop('survived', axis=1)
y_test = df['survived']

In [76]:
y_proba = ibl.predict(x_test)
y_pred = (y_proba > 0.5).astype(int)

TypeError: 'str' object is not callable

In [54]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1}')

# ROC-AUC (you need prediction probabilities for this, not just class predictions)
# Here we just reuse y_pred for simplicity
roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.41
Precision: 0.37894736842105264
Recall: 1.0
F1 score: 0.549618320610687
ROC-AUC: 0.9309895833333334


## Prediction from external files


In [69]:
from titanic import titanic

y_proba = model(x_test)
y_pred = (y_proba > 0.5).astype(int)


TypeError: 'str' object is not callable

In [59]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1}')

# ROC-AUC (you need prediction probabilities for this, not just class predictions)
# Here we just reuse y_pred for simplicity
roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.41
Precision: 0.37894736842105264
Recall: 1.0
F1 score: 0.549618320610687
ROC-AUC: 0.9309895833333334


## Interpretation of results

In [100]:
description = ibl.interpret()

Tokens Used: 1065
	Prompt Tokens: 552
	Completion Tokens: 513
Successful Requests: 1
Total Cost (USD): $0.04733999999999999


In [101]:
print(description)

The data processing and prediction steps in the given code can be explained as follows:

- Preprocessing:
    - Convert the 'sex' column to numerical values: male = 0, female = 1
    - Fill missing values in the 'age' column with the median age
    - Fill missing values in the 'fare' column with the median fare
    - Convert the 'embarked' column to numerical values: S = 0, C = 1, Q = 2
    - Convert the 'class' column to numerical values: First = 1, Second = 2, Third = 3
    - Convert the 'who' column to numerical values: man = 0, woman = 1, child = 2
    - Convert the 'adult_male' column to integer values
    - Convert the 'alone' column to integer values

- For each row in the DataFrame:
    - Extract the values of the following features: pclass, sex, age, sibsp, parch, fare, embarked, who, adult_male, alone
    - Initialize the prediction variable 'y' to 0
    - Apply the prediction logic:
        - If the passenger is female, add 1 to 'y'
        - If the passenger is in first cla

In [60]:
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
})

# データフレームのカラム名を連続する整数に置き換えます。
df.columns = range(df.shape[1])

# 結果を表示します。
print(df)

   0  1  2
0  1  4  7
1  2  5  8
2  3  6  9
