In [None]:
import pandas as pd
import openai
import os

# Connect with OpenAI

To run this notebook, you need an OpenAI account and an API key generated from it. Store the API key in a file named `openai.key` in the notebook's working directory.

In [None]:
# Load the API key from the local key file rather than hard-coding it in the notebook.
# "utf-8-sig" reads plain ASCII/UTF-8 files and also drops a leading byte-order
# mark, which some editors prepend and which would otherwise end up in the key.
with open("openai.key", encoding="utf-8-sig") as f:
    api_key = f.read().strip()

# Fail here with a clear message instead of later with an opaque
# authentication error from the API.
if not api_key:
    raise ValueError("openai.key is empty; paste your OpenAI API key into it.")

openai.api_key = api_key

# Settings for the dataset

In [None]:
# Requested shape of the dataset: n is substituted into the prompt as the
# number of rows, p as the number of features.
n, p = 50, 5

# Topic of the dataset; it is substituted into the prompt as its first line.
dataset_name = "Light Bulb Changes"

# Putting Together the Prompt

By modifying the prompt, you can change the dataset that you get. Try, for example, adding "type: classification" or specifying a target variable.

In [None]:
# Prompt template. It describes the desired dataset and ends with an opening
# "---" marker, so the model's completion is the CSV body itself.
prompt_template = """{title}

Size: n={rows} rows
Number of features: {features}

This dataset is perfect for education, because it has a mix of
numerical and categorical features, there are no missing values,
and interesting correlations structures. Column names are all
lower case and without special characters. The dataset is very
realistic.

Raw CSV file (first column is id, showing all n rows,
just copy and paste everything between --- and ---):

---

"""

# Fill in the dataset topic, row count and feature count from the settings.
prompt = prompt_template.format(title=dataset_name, rows=n, features=p)

# Make The Request And Get CSV

In [None]:
# Request the completion. The original model, text-davinci-003, has been
# retired by OpenAI; gpt-3.5-turbo-instruct is the documented replacement and
# is served by the same legacy Completion endpoint, so the response is read
# the same way.
response = openai.Completion.create(
    model="gpt-3.5-turbo-instruct",
    prompt=prompt,
    temperature=0,  # keep sampling randomness as low as possible
    max_tokens=3000,  # upper bound on the length of the generated CSV
    stop="---",  # stop at the closing marker announced in the prompt
)

In [None]:
# Take the text of the first choice in the response; this is the generated CSV.
first_choice = response.choices[0]
result = first_choice.text

In [None]:
# Persist the generated CSV text to disk.
# The encoding is fixed to UTF-8 because pd.read_csv decodes as UTF-8 by
# default, whereas open() would otherwise use the platform's locale encoding;
# without this, non-ASCII characters in the model output can break the
# write/read round-trip on non-UTF-8 systems.
with open("data.csv", "w", encoding="utf-8") as f:
    f.write(result)

# Inspect the Results

In [None]:
# Load the generated CSV; its first column (the id) becomes the index.
dat = pd.read_csv("data.csv", index_col=0)

# Preview up to five random rows. The cap prevents DataFrame.sample from
# raising ValueError when the model returned fewer than five rows (small n or
# a truncated completion).
dat.sample(min(5, len(dat)))

# Visualize data

In [None]:
from dataprep.eda import create_report
from dataprep.eda import plot
from dataprep.eda import plot_correlation

# Columns to plot against each other.
# NOTE(review): assumes the generated CSV contains these two columns; the
# prompt does not fix the column names, so verify against the actual data.
x_col, y_col = "bulb_type", "hours_used"
plot(dat, x_col, y_col)

In [None]:
# Call dataprep's plot on the whole DataFrame, without selecting columns.
# NOTE(review): presumably this renders an overview of every column — confirm
# against the dataprep documentation.
plot(dat)

In [None]:
# Pass the whole DataFrame to dataprep's plot_correlation.
# NOTE(review): presumably only the numerical columns take part in the
# correlation analysis — confirm against the dataprep documentation.
plot_correlation(dat)