# End to End ML with GPT-3.5
Learn how to use GPT-3.5 to do the heavy lifting for data acquisition, preprocessing, model training, and deployment.

## Intro

## Init

In [8]:
import openai
import os

# Read the API key from the environment. Indexing os.environ raises KeyError
# when OPENAI_API_KEY is unset, whereas os.getenv would return None instead.
openai.api_key = os.environ["OPENAI_API_KEY"]

def get_api_result(prompt, model="gpt-3.5-turbo-0301"):
    """Send ``prompt`` to the chat completion API, print the reply and return it.

    The prompt is sent as the content of a single ``user`` message.

    Args:
        prompt: Text to send to the model.
        model: Name of the chat model to query. Defaults to the snapshot
            that was previously hard-coded.

    Returns:
        The message content of the first choice in the response.
    """
    # NOTE(review): openai.ChatCompletion looks like the pre-1.0 SDK
    # interface - confirm the installed openai version still provides it.
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    result = response['choices'][0]['message']['content']

    # Keep printing so existing notebook cells still display the reply;
    # returning it as well lets callers reuse the generated text.
    print(result)
    return result

## Extract, transform, load (ETL)

In [9]:
def extract(prompt):
    """Ask the model for Python code that fulfils a data-extraction request.

    ``prompt`` is placed after ``Input:`` in a fixed instruction template and
    the combined text is handed to ``get_api_result``. Nothing is returned.
    """
    request_text = f"""You are a ChatGPT language model that can generate Python code. 
    Please provide a natural language input text, and I will generate the corresponding Python code.
    \nInput: {prompt}
    \nPython code:"""

    get_api_result(request_text)

In [13]:
%%time
# Ask for code that loads the adult income dataset from OpenML into a single
# DataFrame; extract() wraps this request in its instruction template.
prompt = '''
Retrieve the adult income prediction dataset from openml using the sklearn fetch_openml function. 
Make sure to retrieve the data as a single dataframe "df" which includes the target in a column named “target”. 
'''

extract(prompt)
print('-------------')  # separator printed after the call, before the %%time report

import sklearn.datasets as datasets

# Fetch the adult income prediction dataset from openml using fetch_openml function
adult_income_dataset = datasets.fetch_openml(name='adult', version=2)

# Load the dataset into a pandas dataframe with target in a column named 'target'
df = pd.DataFrame(adult_income_dataset.data, columns=adult_income_dataset.feature_names)
df['target'] = adult_income_dataset.target
-------------
CPU times: total: 0 ns
Wall time: 3.28 s


In [18]:
%%time
prompt = '''
Retrieve the adult income prediction dataset from csv.
Make sure to retrieve the data as a single dataframe which includes the target in a column named “target”. 
Asign names to other columns too.
'''

extract(prompt)
print('-------------')

import pandas as pd

df = pd.read_csv('adult_income_prediction.csv')
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'target']
-------------
CPU times: total: 15.6 ms
Wall time: 3.29 s


In [22]:
def transform(prompt):
    """Ask the model for pandas code that preprocesses a DataFrame named ``df``.

    ``prompt`` is placed after ``Input:`` in a fixed instruction template and
    the combined text is handed to ``get_api_result``. Nothing is returned.

    Note: the template does not tell the model the DataFrame's columns or
    dtypes; an earlier draft accepted ``columns`` and ``column_types``
    parameters for that purpose.
    """
    request_text = f"""You are a ChatGPT language model that can generate Python code. 
    Please provide a natural language input text, 
    and I will generate the corresponding Python code using the Pandas to preprocess the given DataFrame df. 
    \nInput: {prompt}
    \nPython code:"""

    get_api_result(request_text)

In [23]:
%%time
prompt = '''
Preprocess the dataframe by converting all categorical columns to their one-hot encoded equivalents, and normalizing numerical columns. 
Drop rows which have an NA or NaN value in any column. Drop rows that have numeric column outliers as determined by their z score. 
A numeric column outlier is a value that is outside of the 1 to 99 inter-quantile range. 
The numerical columns should be normalized using StandardScaler from sklearn. 
The values in the target colummn should be converted to 0 or 1 and should be of type int.
'''
transform(prompt)
print('-----------------------')

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read in the dataframe
df = pd.read_csv('your_data.csv')

# Convert categorical columns to one-hot encoding
df = pd.get_dummies(df)

# Normalize numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Drop rows with NA or NaN values
df.dropna(inplace=True)

# Drop rows with numeric column outliers as determined by their z score
z_scores = pd.DataFrame()
for col in num_cols:
    z_scores[col] = (df[col] - df[col].mean()) / df[col].std(ddof=0)
    z_scores = z_scores.abs().lt(3)
df = df[z_scores.all(axis=1)]

# Convert target column values to 0 or 1 and change column type to int
df['target_column'] = df['target_column'].apply(lambda x: 0 if x == 'value_1' else 1).astype(int)
-----------------------
CPU times: total: 0 ns
Wall time: 8.06 s


In [24]:
def load(prompt):
    """Ask the model for Python code that fulfils a data-loading request.

    ``prompt`` is placed after ``Input:`` in a fixed instruction template and
    the combined text is handed to ``get_api_result``. Nothing is returned.
    """
    full_prompt = f"""You are a ChatGPT language model that can generate Python code. 
    Please provide a natural language input text, and I will generate the corresponding Python code.
    \nInput: {prompt}
    \nPython code:"""

    get_api_result(full_prompt)

In [25]:
%%time

# Ask for code that writes the DataFrame "df" into the "income" table of an
# SQLite database; load() wraps this request in its instruction template.
prompt = '''Connect to an sqlite database named “data”. 
Use pandas to insert data from a DataFrame named “df” into a table named “income”. 
Do not include the index column. 
Commit the changes before closing the connection.'''

load(prompt)
print('-----------------------')  # separator printed after the call, before the %%time report

To connect to an SQLite database named "data", you can use the following code:

```python
import sqlite3

conn = sqlite3.connect("data.db")
```

Next, to insert data from a pandas DataFrame named "df" into a table named "income" without including the index column, you can use the following code:

```python
import pandas as pd

df.to_sql("income", conn, if_exists="replace", index=False)
```

Here, `df` is the pandas DataFrame that you want to insert, `if_exists` is set to "replace" to ensure that any existing table with the same name is dropped and recreated, and `index` is set to `False` to exclude the index column from the table.

Finally, you can commit the changes and close the connection using the following code:

```python
conn.commit()
conn.close()
```
-----------------------
CPU times: total: 0 ns
Wall time: 6.41 s


## Model Training

In [26]:
def train(prompt):
    """Ask the model for Python training code, steering it toward scikit-learn.

    ``prompt`` is placed after ``Input:`` in a fixed instruction template and
    the combined text is handed to ``get_api_result``. Nothing is returned.
    """
    training_request = f"""You are a ChatGPT language model that can generate Python code. 
    Focus on using scikit-learn when applicable. 
    Please provide a natural language input text, and I will generate the corresponding Python code.
    \nInput: {prompt}
    \nPython code:"""

    get_api_result(training_request)

In [28]:
%%time

# Describe the model-selection task (5-fold cross validation, accuracy as the
# selection metric, a single MLflow run); train() wraps this request in its
# scikit-learn oriented instruction template.
prompt = '''
Train a variety of classification models to predict the “target” column using all other columns. 
Do so using 5-fold cross validation to choose the best model and corresponding set of hyperparameters.
Return the best overall model and corresponding hyperparameter settings. 
Choose the best model based on accuracy. 
Assume a dataframe named “df” exists which is to be used for training. 
Log the entire process using MLFlow. 
Start logging with mlflow before training any models so only a single run is stored. 
Make sure that the model is logged using the sklearn module of mlflow. 
Make sure that only the best overall model is logged, but log metrics for all model types. 
The mean value of the following metrics on all cross validation folds should be logged: accuracy, AUC, F1 score.
'''

train(prompt)
print('-----------------------')  # separator printed after the call, before the %%time report

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import mlflow
from mlflow.sklearn import log_model

# Load dataframe
df = pd.read_csv('data.csv')

# Define X and y
X = df.drop('target', axis=1)
y = df['target']

# Define classification models
models = [RandomForestClassifier(), GradientBoostingClassifier(), LogisticRegression()]

# Define hyperparameters to tune for each model
params = [{'n_estimators': [50, 100, 150], 'max_depth': [5, 10, 15]},
          {'n_estimators': [50, 100, 150], 'max_depth': [5, 10, 15], 'learning_rate': [0.01, 0.1]},
          {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']}]

# Define scoring metrics to log
scoring = ['accuracy', 'roc_auc', 'f1']

# Start mlflow run
mlflow.start_run

## Model Serving

In [29]:
def serve_model(model_path, prompt):
    """Ask the model for a shell command that deploys a model with MLflow.

    ``model_path`` and ``prompt`` are interpolated into a fixed instruction
    template and the combined text is handed to ``get_api_result``. Nothing
    is returned.
    """
    deploy_request = f"""You are a ChatGPT language model that can generate shell code for deploying models using MLFlow. 
    Please provide a natural language input text, and I will generate the corresponding command to deploy the model. 
    The model is located in the file {model_path}.
    \nInput: {prompt}
    \nShell command:"""

    get_api_result(deploy_request)

In [30]:
%%time

# Ask for a shell command that serves the model on port 1111;
# serve_model() wraps this request in its deployment instruction template.
prompt = '''Serve the model using port number 1111, and use the local environment manager'''
model_path = '<model path here>'  # placeholder text - presumably to be replaced with a real model location

serve_model(model_path, prompt)

print('-------------------------')  # separator printed after the call, before the %%time report


mlflow models serve -m <model path here> -p 1111 --no-conda
-------------------------
CPU times: total: 0 ns
Wall time: 1 s


In [31]:
def send_request(prompt):
    """Ask the model for a command that sends data to a deployed MLflow model.

    ``prompt`` is placed after ``Input:`` in a fixed instruction template and
    the combined text is handed to ``get_api_result``. Nothing is returned.
    """
    command_request = f"""You are a ChatGPT language model that can generate code for sending data to deployed MLFlow models. 
    Please provide a natural language input text, and I will generate the corresponding command. 
    \nInput: {prompt}
    \nCommand:"""

    get_api_result(command_request)

In [32]:
%%time

# Ask for a one-line curl command that posts JSON to the model on
# localhost:1111; send_request() wraps this request in its instruction template.
prompt = '''Use the “curl” command to send data “<data here>” to an mlflow model hosted at port 1111 on localhost. 
Make sure that the content type is "application/json".
Write the command in one line'''

send_request(prompt)
print('-------------------------')  # separator printed after the call, before the %%time report

curl -X POST -H "Content-Type: application/json" -d '{"data here"}' http://localhost:1111/invocations
-------------------------
CPU times: total: 31.2 ms
Wall time: 1.4 s


In [33]:
def modify_request(prompt):
    """Ask the model to rewrite a ``curl`` command according to an instruction.

    ``prompt`` (the instruction together with the command to change) is placed
    after ``Input:`` in a fixed instruction template and the combined text is
    handed to ``get_api_result``. Nothing is returned.
    """
    rewrite_request = f"""You are a ChatGPT language model that can modify commands for sending data using "curl". 
    Please provide a natural language instruction, corresponding command, and I will generate the modified command. 
    \nInput: {prompt}
    \nCommand:"""

    get_api_result(rewrite_request)

In [34]:
%%time
# The curl command to be rewritten, followed by the instruction that embeds it;
# modify_request() wraps the combined text in its instruction template.
code = """curl -X POST -H "Content-Type: application/json" -d '<data here>' http://localhost:1111/invocations"""
prompt = f"""Please modify the following by placing the url before the "-X POST" argument:\n{code}"""

modify_request(prompt)
print('-------------------------')  # separator printed after the call, before the %%time report

"Add the url http://example.com before the -X POST argument"

Modified command: curl http://example.com -X POST -H "Content-Type: application/json" -d '<data here>' http://localhost:1111/invocations
-------------------------
CPU times: total: 0 ns
Wall time: 2.1 s


In [35]:
def create_payload(prompt):
    """Ask the model for Python code, using the MLflow data-sending template.

    ``prompt`` is placed after ``Input:`` in a fixed instruction template and
    the combined text is handed to ``get_api_result``. Nothing is returned.
    """
    payload_request = f"""You are a ChatGPT language model that can generate code for sending data to deployed MLFlow models. 
    Please provide a natural language input text, and I will generate the corresponding command. 
    \nInput: {prompt}
    \nPython code:"""

    get_api_result(payload_request)

In [36]:
%%time

prompt = '''
Convert the DataFrame “df” to json format that can be received by a deployed MLFlow model. 
Wrap the resulting json in an object called “dataframe_split”. 
Also, “dataframe_split” should be surrounded by doubles quotes instead of single quotes. 
Do not include the “target” column. 
Use the split “orient” argument
'''
# The resulting string should not have newlines, and it should not escape quotes. 

create_payload(prompt)
print('-------------------------')

import pandas as pd
import json

df = pd.DataFrame() # Replace with actual data

# Drop the target column
df = df.drop(['target'], axis=1)

# Convert DataFrame to JSON format
json_data = df.to_json(orient='split')

# Wrap JSON data in an object and rename it
new_json_data = {"dataframe_split": json.loads(json_data)}

# Replace all single quotes with double quotes
json_string = json.dumps(new_json_data).replace("'", '"')

# Finally, send the JSON-formatted data to the deployed MLFlow model
# with the appropriate API call.
-------------------------
CPU times: total: 0 ns
Wall time: 4.6 s


## Conclusions