In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

def train_myData(X, y):
    """a function to take X and y and train & test a linear regression model and return it
    parameters:
        X: data containing the features
        y: data containing the target
    Returns:
        trained model
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the linear regression model
    model = LinearRegression()

    # Train the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("Mean Squared Error:", mse)
    print("R-squared Score:", r2)

    return model


In [2]:
## import API keys from .env file in this folder
import os
from dotenv import load_dotenv
load_dotenv(".env")

openai_api_key    = os.getenv("openai_api_key")

In [3]:
#%pip install --upgrade --quiet  langchain langchain_experimental langchain-openai


In [4]:
from langchain_core.pydantic_v1 import BaseModel
class BMI(BaseModel):
    BMI: int
    Height: float
    Weight: float

In [5]:
examples = [
    {
        "example": """BMI: 26 Height: 6 Weight: 190"""
    },
        {
        "example": """BMI: 20 Height: 5 Weight: 100"""
    }
]

In [6]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX, SYNTHETIC_FEW_SHOT_SUFFIX
    )

OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

In [7]:
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator
from langchain_openai import ChatOpenAI

synthetic_data_generator = create_openai_data_generator(
    output_schema=BMI,
    llm=ChatOpenAI(
        temperature=1,openai_api_key=openai_api_key
    ),  # You'll need to replace with your actual Language Model instance
    prompt=prompt_template,
)

In [8]:
synthetic_results = synthetic_data_generator.generate(
    subject="BMI_Weight_Height",
    extra="""randomly make a few of BMI -1. A few of BMI must be -1.""", 
    # extra="", ## explain how the generated data should be
    runs=10, ## number of outputs
)

In [9]:
synthetic_results

[BMI(BMI=26, Height=6.0, Weight=190.0),
 BMI(BMI=20, Height=5.0, Weight=100.0),
 BMI(BMI=26, Height=6.0, Weight=190.0),
 BMI(BMI=-1, Height=5.5, Weight=150.0),
 BMI(BMI=26, Height=6.0, Weight=190.0),
 BMI(BMI=-1, Height=5.5, Weight=150.0),
 BMI(BMI=26, Height=6.0, Weight=190.0),
 BMI(BMI=-1, Height=5.5, Weight=150.0),
 BMI(BMI=26, Height=6.0, Weight=190.0),
 BMI(BMI=-1, Height=5.5, Weight=150.0)]

In [10]:
import pandas as pd
import numpy as np

In [11]:
bmi_arr = []
height_arr = []
weight_arr = []
for data in synthetic_results:
    if int(data.BMI) == -1:
        bmi_arr.append(np.nan)
    else:
        bmi_arr.append(data.BMI)
    height_arr.append(data.Height)
    weight_arr.append(data.Weight)

In [12]:
df = pd.DataFrame({
    "BMI": bmi_arr,
    "Height": height_arr,
    "Weight": weight_arr 
})

In [13]:
df

Unnamed: 0,BMI,Height,Weight
0,26.0,6.0,190.0
1,20.0,5.0,100.0
2,26.0,6.0,190.0
3,,5.5,150.0
4,26.0,6.0,190.0
5,,5.5,150.0
6,26.0,6.0,190.0
7,,5.5,150.0
8,26.0,6.0,190.0
9,,5.5,150.0


In [14]:
X = np.array(df[['Height', 'Weight']])
y = np.array(df['BMI'])

In [15]:
train_myData(X,y)

ValueError: Input y contains NaN.