# Finetuning Experiments 1

In [None]:
from dotenv import load_dotenv

# Load variables from the .env file into the environment so that the
# os.getenv() lookups in later cells (API_KEY, DB_URL) can find them.
load_dotenv()



In [None]:
import os


In [None]:
!pip3 install openai
!pip3 install psycopg2


## Test using Anyscale

In [None]:
import openai

# NOTE(review): `query` is never sent to the model; the request below uses the
# hard-coded system/user prompts instead.
query = "Write a program to load data from S3 with Ray and train using PyTorch."

# OpenAI-compatible client pointed at the Anyscale endpoint; the key is read
# from the API_KEY environment variable.
client = openai.OpenAI(
    base_url="https://api.endpoints.anyscale.com/v1",
    api_key=os.getenv("API_KEY"),
)

# Prompt pair used for this smoke test of the fine-tuned model.
system_prompt = "You are an excellent technical product manager that loves reading code and describing it to non-technical people.\n\nYour task is to write a spec for a component given its implementation.\n\nYou will also be provided with a original spec that resulted in the implementation that you are considering. This should be used to influence language and style of the spec you provide.\n\nFor instance, you might be given a component that implements a button, and you will be provided with a spec that describes how the button was used, its layout in a original page or more complex matters.\n\nYour goal is to write a spec that describes the component in a way that is consistent with the original spec.\n\nRules:\n- Your spec should be written in a way that is easy to understand by non-technical people, though it should match the tone, language, and style of the original spec. If that original spec is technical, you may break this rule\n- The spec must also match the same language that is used in the original spec\n- You must respond only with the spec, and nothing else\n- You must not refer to the code directly, but instead refer to the features and styles that you would propose\n- You can be detailed and specific, but keep the spec simple and terse. Do not use generalities, do not use abstract language\n- Do not mention the name of the component\n- Do NOT describe the nature or essence of the component (e.g. \"this is a visually appealing component\"). Instead be specific about the features and styles that describe this component\n- Do NOT summarize the spec at the beginning or end\n- Do NOT reference specific tailwind classes, css, or other implementation details\n- Keep it short and to the point\n- You should never generate more than 50 words\n"
user_prompt = "\n    Can you write a spec for this component given its implementation?\n\nComponent implementation:\n###\n\u003cimg\n                    src={`dashboard-thumbnail${index + startIndex}.jpg`}\n                    alt={`Digital signage thumbnail for ${client.id}`}\n                    className=\"w-full h-[60px] object-cover\"\n                  /\u003e\n###\n\n"

# Note: not all arguments are currently supported and will be ignored by the backend.
chat_completion = client.chat.completions.create(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1:carl:7umGKyN",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
    temperature=0.1,
    stream=True,
)

# Print the reply as it streams in. A streamed chunk may have an empty
# `choices` list or a delta whose `content` is None; skip those so the output
# is not polluted with the literal text "None" and no IndexError is raised.
for message in chat_completion:
    if not message.choices:
        continue
    content = message.choices[0].delta.content
    if content:
        print(content, end="", flush=True)

In [None]:
import os
from dotenv import load_dotenv
import psycopg2

# Load environment variables from .env file
load_dotenv()

# Retrieve the database URL from the environment variable and fail early with
# a clear message instead of an opaque connection error when it is missing.
db_url = os.getenv("DB_URL")
if not db_url:
    raise RuntimeError("DB_URL is not set; define it in the environment or the .env file.")

# Value of the api_key column whose rows are exported.
# NOTE(review): this looks like a credential committed to source - consider
# moving it to the .env file.
target_api_key = "53cbdc5f-0ea9-4f6d-b466-d2127241f77c"

# The value is bound as a query parameter rather than spliced into the SQL text.
query = "SELECT * FROM query_records WHERE api_key = %s"

# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(db_url)

try:
    # The cursor is managed by `with`, so it is closed even when the query
    # fails, and a failure in cursor() itself can no longer trigger a
    # NameError in the cleanup path.
    with conn.cursor() as cur:
        cur.execute(query, (target_api_key,))

        # Fetch all the records and store them in a variable
        records = cur.fetchall()
finally:
    # psycopg2's `with conn:` only ends the transaction, so close explicitly.
    conn.close()

# Print the number of records retrieved
print(f"Retrieved {len(records)} records from the database.")

# Preview the first row, guarding against an empty result set.
if records:
    print(records[0])
else:
    print("No records matched; nothing to preview.")

In [None]:
import json
import re

class CustomDecoder(json.JSONDecoder):
    """JSON decoder that turns backslash-underscore pairs in string values into "_".

    Only string values are rewritten, at any nesting depth inside dicts and
    lists. Dictionary keys and non-string values are left untouched.
    """

    def decode(self, s):
        """Decode *s* with the stock decoder, then clean the resulting tree."""
        return self._decode(super().decode(s))

    def _decode(self, o):
        """Return *o* with escaped underscores removed; containers are updated in place."""
        if isinstance(o, str):
            return o.replace('\\_', '_')
        if isinstance(o, dict):
            # Only existing keys are reassigned, so the dict's size never
            # changes while it is being iterated.
            for key in o:
                o[key] = self._decode(o[key])
        elif isinstance(o, list):
            o[:] = [self._decode(item) for item in o]
        return o

def _extract_assistant_content(stream_text):
    """Concatenate the delta contents found in a streamed chat response.

    *stream_text* is split into lines; each line may carry a ``data:`` prefix
    followed by a JSON chunk. Collection stops at the ``[DONE]`` marker.
    Returns an empty string when *stream_text* is not a string or when no
    ``[DONE]`` marker is present, so truncated streams are not exported.
    """
    if not isinstance(stream_text, str):
        return ""

    pieces = []
    finished = False
    for raw_line in stream_text.split("\n"):
        line = raw_line.strip()
        # Strip the prefix only at the start of the line; a global replace
        # would also delete "data: " occurring inside the model's own text.
        if line.startswith("data:"):
            line = line[len("data:"):].strip()
        if not line:
            continue
        if line == "[DONE]":
            finished = True
            break
        try:
            chunk = json.loads(line, cls=CustomDecoder)
            piece = chunk["choices"][0]["delta"]["content"]
        except (KeyError, IndexError, TypeError, json.JSONDecodeError):
            # Not a content-bearing chunk (or not JSON at all): ignore it.
            continue
        # `content` can be null in a delta; only real text is kept.
        if isinstance(piece, str):
            pieces.append(piece)

    return "".join(pieces) if finished else ""


def parse_records(records):
    """Convert database rows into JSON-encoded chat fine-tuning examples.

    For each row, ``record[6]`` is decoded as JSON with a ``"messages"`` list
    whose first two entries supply the system and user contents, and
    ``record[7]`` is parsed as the streamed response that supplies the
    assistant content.
    NOTE(review): the meaning of columns 6 and 7 is inferred from how they are
    used here - confirm against the query_records schema.

    Rows that do not fit this shape are skipped after printing the error.
    Rows where any of the three contents is empty are skipped silently.

    Returns:
        A list of JSON strings, each of the form
        ``{"messages": [system, user, assistant]}``.
    """
    formatted_records = []

    for record in records:
        try:
            data = json.loads(record[6], cls=CustomDecoder)
            messages = data["messages"]

            system_content = messages[0]["content"].strip()
            user_content = messages[1]["content"].strip()
            assistant_content = _extract_assistant_content(record[7])
        except (KeyError, IndexError, TypeError, AttributeError, json.JSONDecodeError) as e:
            # Skip the record if it doesn't fit the expected format. TypeError
            # and AttributeError cover None or non-dict payloads, which would
            # otherwise abort the whole export.
            print(e)
            continue

        # Skip the record if any of the content fields are empty
        if not (system_content and user_content and assistant_content):
            continue

        formatted_record = {
            "messages": [
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content},
            ]
        }
        formatted_records.append(json.dumps(formatted_record))

    return formatted_records

# Turn the rows held in `records` into JSON-encoded training examples.
formatted_records = parse_records(records)

# Write one example per line (JSON Lines), separated by newlines.
jsonl_text = "\n".join(formatted_records)
with open("create_finetune_data.jsonl", mode="w") as file:
    file.write(jsonl_text)

In [None]:
import openai

# OpenAI-compatible client pointed at the Anyscale endpoint; the key is read
# from the API_KEY environment variable.
client = openai.OpenAI(
    base_url="https://api.endpoints.anyscale.com/v1",
    api_key=os.getenv("API_KEY"),
)

# Upload the training data. The handle is opened in a `with` block so it is
# closed once the upload returns instead of being leaked.
with open("create_finetune_data.jsonl", "rb") as training_data:
    file = client.files.create(
        file=training_data,
        purpose="fine-tune",
    )

In [None]:
# Show the current value of `file`; when the upload cell ran last, this is the
# object returned by client.files.create().
print(file)

In [None]:
# Base model to fine-tune and the identifier of the training file to use.
base_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
training_file_id = "file_4am6fv7lzbh52ca5vuv65p58dl"

# Kept as the cell's final expression so the notebook displays the created job.
client.fine_tuning.jobs.create(model=base_model, training_file=training_file_id)