# Synthetic data capability

## Summary

The use case provided in this notebook creates synthetic training data sets for use in DataRobot models.

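This notebook outlines how to create a synthetic training data set in a CSV file with name, address, phone number, company, account number, credit score, and a good-loan-candidate flag. It then uploads the data set to DataRobot, runs Autopilot on it, and deploys the top-performing model.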

## Setup

### Import libraries

In [None]:
from io import StringIO

import datarobot as dr
import pandas as pd
from datarobot import Dataset as ds
from faker import Faker

## Generate synthetic data

In [None]:
# Build a CSV string of synthetic loan applicants.
# Each output column name is paired with the Faker format string that fills it:
#   Name                 fake full name
#   Address              fake address
#   Phone_Number         phone number
#   Company              company
#   Account_Number       fake account number
#   Credit_Score         random integer between 300 and 850
#   Good_Loan_Candidate  boolean (T/F)
# include_row_ids=True additionally requests a row-id column from Faker.
NUM_ROWS = 10000
COLUMN_FORMATS = {
    "Name": "{{name}}",
    "Address": "{{address}}",
    "Phone_Number": "{{phone_number}}",
    "Company": "{{company}}",
    "Account_Number": "{{bban}}",
    # "credit_score" names the argument group registered with set_arguments below
    "Credit_Score": "{{pyint:credit_score}}",
    "Good_Loan_Candidate": "{{boolean}}",
}

# Seed before instantiating so repeated runs generate the same data
Faker.seed(0)
fake = Faker()
fake.set_arguments("credit_score", {"min_value": 300, "max_value": 850})

people_csv = fake.csv(
    header=tuple(COLUMN_FORMATS),
    data_columns=tuple(COLUMN_FORMATS.values()),
    num_rows=NUM_ROWS,
    include_row_ids=True,
)

## Data Frame for Output

In [None]:
# Wrap the CSV text in a file-like object so pandas can read it without touching disk
csv_file = StringIO(people_csv)

# Read the CSV into a DataFrame (requires `import pandas as pd` in the imports cell)
df = pd.read_csv(csv_file)

# Preview the first rows with the notebook's rich display instead of printing the whole frame
df.head()

## Load CSV into AI Catalog

In [None]:
# Write the synthetic CSV text to a file on disk so it can be uploaded.
# newline="" stops Python from translating the line terminators already present
# in the generated CSV text; the explicit encoding keeps the file independent of
# the platform default.
with open("people.csv", "w", newline="", encoding="utf-8") as file:
    file.write(people_csv)

# Push the file to DataRobot. Dataset API reference:
# https://datarobot-public-api-client.readthedocs-hosted.com/en/latest-release/autodoc/api_reference.html#datasets
people_dataset = ds.upload("people.csv")

# Keep the dataset id for the cells that follow
people_dataset_id = people_dataset.id

## Load synthetic data into AutoML

## Initiate autopilot

In [None]:
# Create the modeling project from the uploaded dataset; `project` is used by
# every cell from here on.
project = dr.Project.create_from_dataset(
    people_dataset_id, project_name="Synthetic data test"
)

# NOTE(review): the target is a boolean column; confirm that "Tweedie Deviance"
# is an accepted metric for this target type, or drop `metric` to use the
# project's recommended one.
project.analyze_and_model(
    target="Good_Loan_Candidate", metric="Tweedie Deviance", mode=dr.AUTOPILOT_MODE.FULL_AUTO
)

# Block until Autopilot finishes so the next cell sees the trained models
project.wait_for_autopilot()

## Retrieve top performing model

In [None]:
def sorted_by_metric(models, test_set, metric):
    """Return the scored models ordered from lowest to highest score.

    Args:
        models: Iterable of objects exposing ``metrics[metric][test_set]``.
        test_set: Key of the partition to read the score from.
        metric: Key of the metric to rank by.

    Returns:
        A new list holding only the models whose score is not ``None``,
        sorted ascending by that score.
    """

    def score_of(candidate):
        return candidate.metrics[metric][test_set]

    scored = []
    for candidate in models:
        if score_of(candidate) is not None:
            scored.append(candidate)

    scored.sort(key=score_of)
    return scored


models = project.get_models()

metric = project.metric

# sorted_by_metric orders ascending, so index 0 is the model with the lowest
# cross-validation score for the project metric.
ranked_models = sorted_by_metric(models, "crossValidation", metric)
if not ranked_models:
    # Fail with an actionable message instead of a bare IndexError
    raise RuntimeError(
        f"No model has a crossValidation score for {metric}; "
        "make sure Autopilot has finished before selecting a model."
    )

# Get the top-performing model
model_top = ranked_models[0]

print(f"The top performing model is {model_top!s} using metric, {metric}")

## Deploy chosen model

In [None]:
# Use the first prediction server returned for this account as the deployment default
prediction_servers = dr.PredictionServer.list()
prediction_server = prediction_servers[0]

# Deploy the top-performing model
deployment = dr.Deployment.create_from_learning_model(
    model_id=model_top.id,
    label="Synthetic data test",
    description="Model trained on synthetic dataset with names, addresses, credit scores, etc.",
    default_prediction_server_id=prediction_server.id,
)

# Show the new deployment's id as the cell output
deployment.id