In [4]:
%%capture
!pip install git+https://github.com/fuyu-quant/IBLM.git@develop

In [5]:
# Print the version of the installed IBLM distribution.
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API and needs no setuptools at runtime.
import importlib.metadata

print(importlib.metadata.version('IBLM'))

0.2.13


In [31]:
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score
import pandas as pd
from langchain.llms import OpenAI

from iblm import IBLModel

In [8]:
data = fetch_california_housing()

# Build a single DataFrame: the explanatory variables under their feature names,
# with the objective variable appended as the last column, 'target'.
df = pd.DataFrame(data['data'], columns=data['feature_names']).assign(target=data['target'])

df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [10]:
# Number of rows in the full dataset
df.shape[0]

20640

In [54]:
# Use the first 100 rows of the dataset as the training sample
df_train = df.iloc[:100]

In [55]:
# Separate the training sample into the label column and the remaining feature columns
y_train = df_train['target']
x_train = df_train.drop(columns='target')

In [56]:
# Settings handed to IBLModel: a regression objective with 'columns_name' enabled
params = dict(columns_name=True, objective='regression')

# LLM wrapper: the 'gpt-4-0613' model, called with temperature 0
llm_model = OpenAI(model_name='gpt-4-0613', temperature=0)

iblm = IBLModel(llm_model=llm_model, params=params)

In [57]:
# Custom prompt template that is passed to `iblm.fit` in the next cell.
# It carries three placeholders - {data_type_}, {col_name_} and {dataset_str_} -
# which are presumably filled in by IBLM before the request is sent
# (TODO confirm against the IBLM source).
# The code between the first pair of dashed lines is the `predict(x)` skeleton
# the LLM is asked to complete.
prompt = """
Please create your code in compliance with all of the following conditions. 
・Never output anything other than Python code.
・Generate Python code to analyze the following data and predict the value of target.
・Never use machine learning algorithms.
・Analyze the data in as much detail as possible.
・Find linear and nonlinear relationships in your data and use them to make predictions.
・Do not output any examples.
・The 'target' value cannot be used for forecasting.
・Each data type is {data_type_}.
・The column names, in order, are as follows {col_name_}.
・Create a code like the following.
------------
import numpy as np
def predict(x):
    df = x.copy()
    output = []
    for index, row in df.iterrows():
        # Do not change the code before this point.
        # Please describe the process required to make the prediction below.

        # Do not change the code after this point.
        output.append(y)
    return np.array(output)
------------
data
------------
{dataset_str_}
"""

In [58]:
# Fit IBLM on the training sample with the custom prompt.
# The cell output below shows the raw LLM response; the return value is kept in `model`.
model = iblm.fit(x_train, y_train, prompt = prompt)

Here is a Python code that uses simple statistical analysis to predict the target value. This code assumes that the target value has a linear relationship with the 'MedInc' column and a nonlinear relationship with the 'AveRooms' column. 

```python
import numpy as np
import pandas as pd

def predict(x):
    df = x.copy()
    output = []
    for index, row in df.iterrows():
        # Do not change the code before this point.
        # Please describe the process required to make the prediction below.

        # Assuming a linear relationship with 'MedInc' and a nonlinear relationship with 'AveRooms'
        y = 0.5 * row['MedInc'] + 0.3 * np.square(row['AveRooms'])

        # Do not change the code after this point.
        output.append(y)
    return np.array(output)
```

This code first copies the input DataFrame to avoid modifying the original data. It then iterates over each row in the DataFrame. For each row, it calculates a predicted target value based on a linear relationship wit

In [59]:
# Show what `fit` returned. Judging by the output, this is the generated Python
# source as text, so print() is used to render its line breaks.
print(model)

import numpy as np
import pandas as pd

def predict(x):
    df = x.copy()
    output = []
    for index, row in df.iterrows():
        # Do not change the code before this point.
        # Please describe the process required to make the prediction below.

        # Assuming a linear relationship with 'MedInc' and a nonlinear relationship with 'AveRooms'
        y = 0.5 * row['MedInc'] + 0.3 * np.square(row['AveRooms'])

        # Do not change the code after this point.
        output.append(y)
    return np.array(output)


### Prediction

In [60]:
# Hold out the last 300 rows for evaluation, then split them into label and features
df_test = df.iloc[-300:]
y_test = df_test['target']
x_test = df_test.drop(columns='target')

In [61]:
# Predict the target for the held-out features with the fitted IBLM model
y_pred = iblm.predict(x_test)

In [62]:
# Coefficient of determination of the predictions against the true targets.
# A negative score means the predictions fit worse than always predicting the mean of y_test.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('R-squared: ', r2)

R-squared:  -118.31567941582905
