In [258]:
!pip install pypandoc
!pip install -qU \
    datasets==2.14.4 \
    langchain==0.0.274 \
    pinecone-client==2.2.2 \
    openai==0.27.9
!pip install pyrate-limiter
!pip install jsonlines



In [276]:
import pypandoc
import re
from pyrate_limiter import Duration, Limiter, Rate
import time
import openai
import os

In [314]:
openai.organization = ""
openai.api_key = ""

import os
os.environ["OPENAI_API_KEY"] = openai.api_key

PREPARE THE DATA

In [6]:
journal = open("2015.txt","r")
journal_txt = journal.read()

In [7]:
splits = ['\n','|','--','+','=']
for s in splits:
  journal_txt = journal_txt.replace(s, "")
journal_txt = journal_txt.replace("  ", " ")
journal_list= journal_txt.split(" ")

In [8]:
def raw_text_to_entries(journal_list):

  months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
  dates = ['(st)', '(nd)', '(rd)', '(th)']
  month_index = {months[i]: i+1 for i in range(len(months)) }
  year=2015

  month=0
  date=0
  day=""
  date_new="0-0-0"
  location=""
  entries = []
  entry = []

  for i in range(len(journal_list)):

      if journal_list[i] in months:
        if journal_list[i-1][-4:] in dates:

          entries.append({
              'date' : date_new,
              'location' : location,
              'entry' : " ".join(entry[:-3])
          })
          entry=[]

          month = journal_list[i]
          date = journal_list[i-1][:-5]
          day = journal_list[i-2][:-1]
          l = 3
          while not len(journal_list[i-l])>0:
            l+=1
          location = journal_list[i-l]
          date_new = f"{month_index[month]:02}-{int(date):02}-{year}"
      else:
        entry.append(journal_list[i])

  for l in range(i,len(journal_list)):
    entry.append(journal_list[l])

  entries.append({
      'date' : date_new,
      'location' : location,
      'entry' : " ".join(entry)
  })
  entries = entries[1:]
  return entries

In [356]:
entries = raw_text_to_entries(journal_list)
entries[279]["date"]

'10-07-2015'

GENERATE PROMPTS

In [316]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI

In [317]:
response_schemas = [
    ResponseSchema(name="prompt", description="question from the question-answer pair"),
    ResponseSchema(name="completion", description="answer from the question-answer pair")
]

In [318]:
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
print(output_parser)

response_schemas=[ResponseSchema(name='prompt', description='question from the question-answer pair', type='string'), ResponseSchema(name='completion', description='answer from the question-answer pair', type='string')]


In [319]:
format_instructions = output_parser.get_format_instructions()
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"prompt": string  // question from the question-answer pair
	"completion": string  // answer from the question-answer pair
}
```


In [320]:
template1 = """
  I am the narrator of the given text. You are a compassionate psychologist who wants to get to know me by asking
  insightful, thought-provoking, meaningful questions about my day.
  Generate atleast 3 interesting thought-provoking question-answer pairs from the given text.
  The answers should be expressive with as many details as possible. Strictly follow the output formatting given below\n
  Date: {entry_date}\n
  Location: {entry_location}\n
  Journal entry: {entry_text}\n\n
"""

In [321]:

prompt = PromptTemplate(
    template=template1 + "\n{format_instructions}",
    input_variables=["entry_date", "entry_location", "entry_text"],
    partial_variables={"format_instructions": format_instructions}
)

In [323]:
llm = OpenAI(temperature=0.1, model="text-davinci-003", max_tokens=1024)

In [None]:
results = []

for entry in entries:
  prompt_ = prompt.format(
      entry_date = entry["date"],
      entry_location = entry['location'],
      entry_text = entry["entry"]
  )
  outputs = llm(prompt_)
  outputs = outputs.split("\n\n")[1:]
  while len(output) > 0 and not output[0].startswith("```json"):
    output = output[1:]

  print(outputs)
  for output in outputs[1:]:
    out = output_parser.parse(output)
    results.append(out)
    # print(results)
  print(entry['date'])



In [360]:
len(results)

849

CONVERT PROMPT TEXT FILE TO JSONL FILE

In [361]:
import json

with open("data.jsonl", 'a') as f:
    for item in results:
        f.write(json.dumps(item) + "\n")

PREPARE FOR FINE-TUNING JOB

In [362]:
res = openai.File.create(
    file=open("data.jsonl", "r"),
    purpose='fine-tune'
)
res

<File file id=file-FRR5gOUo1kMTOrGpX5GmU7Ge at 0x79c1f5020e50> JSON: {
  "object": "file",
  "id": "file-FRR5gOUo1kMTOrGpX5GmU7Ge",
  "purpose": "fine-tune",
  "filename": "file",
  "bytes": 225670,
  "created_at": 1693993204,
  "status": "uploaded",
  "status_details": null
}

In [363]:
file_id = res["id"]
file_id

'file-FRR5gOUo1kMTOrGpX5GmU7Ge'

In [372]:
res = openai.FineTuningJob.create(training_file=file_id, model="davinci-002")
res

<FineTuningJob fine_tuning.job id=ftjob-Lu3fVNuA16G8CUxaryAWQbuv at 0x79c1f5005530> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-Lu3fVNuA16G8CUxaryAWQbuv",
  "model": "davinci-002",
  "created_at": 1693993609,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-NvHPb9ySxZah9npepkQbdNAd",
  "result_files": [],
  "status": "created",
  "validation_file": null,
  "training_file": "file-FRR5gOUo1kMTOrGpX5GmU7Ge",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": null
}

In [373]:
job_id = res["id"]
job_id

'ftjob-Lu3fVNuA16G8CUxaryAWQbuv'

In [374]:
openai.FineTuningJob.retrieve(job_id)

<FineTuningJob fine_tuning.job id=ftjob-Lu3fVNuA16G8CUxaryAWQbuv at 0x79c1f4e9a2a0> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-Lu3fVNuA16G8CUxaryAWQbuv",
  "model": "davinci-002",
  "created_at": 1693993609,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-NvHPb9ySxZah9npepkQbdNAd",
  "result_files": [],
  "status": "running",
  "validation_file": null,
  "training_file": "file-FRR5gOUo1kMTOrGpX5GmU7Ge",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": null
}

In [375]:
openai.FineTuningJob.list_events(id=job_id)

<OpenAIObject list at 0x79c1f4e98ae0> JSON: {
  "object": "list",
  "data": [
    {
      "object": "fine_tuning.job.event",
      "id": "ftevent-h0g5VQk3hthNH5kGz51rNtM5",
      "created_at": 1693993614,
      "level": "info",
      "message": "Fine tuning job started",
      "data": null,
      "type": "message"
    },
    {
      "object": "fine_tuning.job.event",
      "id": "ftevent-HwgZt89CHrwKyJCfePq1qHxw",
      "created_at": 1693993609,
      "level": "info",
      "message": "Created fine-tune: ftjob-Lu3fVNuA16G8CUxaryAWQbuv",
      "data": null,
      "type": "message"
    }
  ],
  "has_more": false
}

In [376]:
from time import sleep

while True:
    res = openai.FineTuningJob.retrieve(job_id)
    if res["finished_at"] != None:
        break
    else:
        print(".", end="")
        sleep(100)

........

In [377]:
res

<FineTuningJob fine_tuning.job id=ftjob-Lu3fVNuA16G8CUxaryAWQbuv at 0x79c1f50f32e0> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-Lu3fVNuA16G8CUxaryAWQbuv",
  "model": "davinci-002",
  "created_at": 1693993609,
  "finished_at": 1693994337,
  "fine_tuned_model": "ft:davinci-002:personal::7vjbGMEI",
  "organization_id": "org-NvHPb9ySxZah9npepkQbdNAd",
  "result_files": [
    "file-GguTHpRmedLfirxNQVHhyNKK"
  ],
  "status": "succeeded",
  "validation_file": null,
  "training_file": "file-FRR5gOUo1kMTOrGpX5GmU7Ge",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": 141018
}

In [378]:
ft_model = res["fine_tuned_model"]
ft_model

'ft:davinci-002:personal::7vjbGMEI'

In [379]:
ft_model = 'ft:davinci-002:personal::7vjbGMEI'

FEED JOURNAL DATA TO PINECONE

In [None]:
# import requests

# res = requests.get('https://raw.githubusercontent.com/pinecone-io/examples/master/learn/generation/openai/fine-tuning/gpt-3.5-agent-training/chains.py')
# with open("chains.py", 'w') as fp:
#     fp.write(res.text)

INITIALIZE LANGCHAIN AGENT FOR CHAT SESSION

In [413]:
from langchain.agents import Tool
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferWindowMemory
# from chains import VectorDBChain

llm = OpenAI(
    temperature=0.0,
    model_name=ft_model
)

# llm = ChatOpenAI(
#     temperature=0.5,
#     model_name=ft_model
# )

# memory = ConversationBufferWindowMemory(
#     memory_key="chat_history",
#     k=5,
#     return_messages=True,
#     output_key="output"
# )
# # app.pinecone.io
# vdb = VectorDBChain(
#     index_name="llama-2-arxiv-papers",
#     environment=os.getenv("PINECONE_ENV") or "YOUR_ENV",
#     pinecone_api_key=os.getenv("PINECONE_API_KEY") or "YOUR_KEY"
# )

# vdb_tool = Tool(
#     name=vdb.name,
#     func=vdb.query,
#     description="This tool allows you to get research information about LLMs."
# )

In [420]:
output = llm("What is the name of your roommate?")

In [421]:
output

'My roommate is Chetana. She is a very nice person and I am glad to have her as a roommate. We have a lot of fun together and I am looking forward to spending the next few days with her. She is also very supportive and understanding, which makes her a great roommate. I hope we can stay together for a long time. I miss her already. I wish I could spend more time with her. I miss her already. I wish I could spend more time with her. I miss her already. I wish I could spend more time with her. I miss her already. I wish I could spend more time with her. I miss her already. I wish I could spend more time with her. I miss her already. I wish I could spend more time with her. I miss her already. I wish I could spend more time with her. I miss her already. I wish I could spend more time with her. I miss her already. I wish I could spend more time with her. I miss her already. I wish I could spend more time with her. I miss her already. I wish I could spend more time with her. I miss her alrea

In [313]:
output

'I felt bad. I felt like I had let the team down. I felt like I had let myself down. I felt like I had let my family down. I felt like I had let my country down. I felt like I had let my coach down. I felt like I had let my fans down. I felt like I had let my teammates down. I felt like I had let my country down. I felt like I had let my coach down. I felt like I had let my fans down. I felt like I had let my teammates down. I felt like I had let my country down. I felt like I had let my coach down. I felt like I had let my fans down. I felt like I had let my teammates down. I felt like I had let my country down. I felt like I had let my coach down. I felt like I had let my fans down. I felt like I had let my teammates down. I felt like I had let my country down. I felt like I had let my coach down. I felt like I had let my fans down. I felt like I had let my teammates down. I felt like I had let my country down. I felt like I had let my coach down. I felt like'

In [None]:
# from langchain.agents import AgentType, initialize_agent

# agent = initialize_agent(
#     agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
#     tools=[vdb_tool],
#     llm=llm,
#     verbose=True,
#     max_iterations=3,
#     early_stopping_method="generate",
#     memory=memory,
#     return_intermediate_steps=True
# )

In [367]:
openai.Model.delete(ft_model)

<Model model id=ft:davinci-002:personal::7vi4Ysy1 at 0x79c208949850> JSON: {
  "id": "ft:davinci-002:personal::7vi4Ysy1",
  "object": "model",
  "deleted": true
}