In [2]:
!pip install openai
!pip install tiktoken
!pip install transformers

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8
Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0
Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-no

In [3]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
from scipy import spatial  # for calculating vector similarities for search


# models
# Embedding model used for both the report texts and the search queries.
EMBEDDING_MODEL = "text-embedding-ada-002"
# Chat model used by `ask`; also the default model for token counting in `num_tokens`.
GPT_MODEL = "gpt-3.5-turbo-16k"

In [4]:
# Load the reports; the 'clean_gpt3.5' column is the text that is tokenised and embedded below.
df = pd.read_csv('/content/clean_context_gpt3_5.csv')

### 0. Cleaning data with GPT-3.5 Turbo



---



In [None]:
def get_clean_version(context, temp=0):
    """Ask gpt-3.5-turbo to rewrite one raw drilling-report text as clean prose.

    Args:
        context: Raw report text; it is interpolated into the cleaning prompt.
        temp: Sampling temperature forwarded to the chat completion request.

    Returns:
        The message content of the first returned choice, or "" if the request
        (or reading its response) raised. The exception is printed rather than
        re-raised, so callers must treat an empty string as a failed cleaning.
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": f"Clean the following document from an oil drilling process report to feed a large language model considering punctuations, grammar, lowercase and uppercase, eliminate repetitive information, and add english connector words where necessary.\nText: {context}"}],
            # Previously `temp` was accepted but never sent, so the API default was used.
            temperature=temp,
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        # Best-effort: report the failure and signal it with an empty result.
        print(e)
        return ""

### 1. Embedding the context

---



In [9]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding for `model`."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [10]:
# Token count of each cleaned report, using num_tokens' default model (GPT_MODEL).
df['tokens'] = df['clean_gpt3.5'].apply(num_tokens)

In [11]:
df['tokens'].describe()

count      74.000000
mean      739.324324
std       234.513960
min       428.000000
25%       563.750000
50%       704.500000
75%       858.250000
max      1464.000000
Name: tokens, dtype: float64

In [12]:
# calculate embeddings
EMBEDDING_MODEL = "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023 (same value as in the config cell)
BATCH_SIZE = 10  # you can submit up to 2048 embedding inputs per request

texts_to_embed = df['clean_gpt3.5'].to_list()
embeddings = []
for batch_start in range(0, len(texts_to_embed), BATCH_SIZE):
    # Clamp the end so the progress line names only rows that exist
    # (the last batch is usually shorter than BATCH_SIZE).
    batch_end = min(batch_start + BATCH_SIZE, len(texts_to_embed))
    batch = texts_to_embed[batch_start:batch_end]
    print(f"Batch {batch_start} to {batch_end-1}")
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
    assert len(response["data"]) == len(batch)  # one embedding per submitted text
    for i, be in enumerate(response["data"]):
        assert i == be["index"]  # double check embeddings are in same order as input
    batch_embeddings = [e["embedding"] for e in response["data"]]
    embeddings.extend(batch_embeddings)

# Every report must have exactly one embedding before the two are paired up.
assert len(embeddings) == len(texts_to_embed)

Batch 0 to 9
Batch 10 to 19
Batch 20 to 29
Batch 30 to 39
Batch 40 to 49
Batch 50 to 59
Batch 60 to 69
Batch 70 to 79


In [13]:
# Pair each cleaned report with its embedding; `embeddings` was filled in the row order of `df`.
df_embeddings_clean = pd.DataFrame({"text": df['clean_gpt3.5'], "embedding": embeddings})

In [14]:
# Persist texts and embeddings so they can be reloaded later.
df_embeddings_clean.to_csv('/content/embeddings_clean.csv')
# When this CSV is read back, the 'embedding' column holds strings; convert it with:
#   df['embedding'] = df['embedding'].apply(ast.literal_eval)

In [15]:
df_embeddings_clean

Unnamed: 0,text,embedding
0,"On day January 1st, 2021, the well name is ""Fo...","[-0.02839006297290325, 3.726646173163317e-05, ..."
1,elevation of 5415.1 on the drill floor square....,"[-0.01966121792793274, 0.005241795442998409, -..."
2,"On March 1st, 2021, the well named FORGE 16A [...","[-0.024614760652184486, -0.0023899213410913944..."
3,"On day 10/22/2020, the well name is FORGE 16A ...","[-0.005067897029221058, -0.011108830571174622,..."
4,"On Day 10/23/2020, the well name is FORGE 16A ...","[0.001314230146817863, -0.0017932062037289143,..."
...,...,...
69,"On December 27, 2020, the oil drilling process...","[-0.028744708746671677, 0.0017169336788356304,..."
70,"On day 12/28/2020, the well named Forge 16A [7...","[-0.023209940642118454, -0.002023211447522044,..."
71,"On Day 12/29/2020, the Well Name is FORGE 16A ...","[-0.030478468164801598, -0.00784224458038807, ..."
72,"On December 30, 2020, the well named ""FORGE 16...","[-0.022721389308571815, 0.001812267117202282, ..."


In [16]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 10
) -> tuple[tuple[str, ...], tuple[float, ...]]:
    """Rank the texts in `df` by relatedness to `query`, most related first.

    Args:
        query: Text to search for; it is embedded with EMBEDDING_MODEL.
        df: Frame with a "text" column and an "embedding" column. The embeddings
            are passed to `relatedness_fn` as-is, so they must be numeric
            sequences (not the string form they take after a CSV round trip).
        relatedness_fn: Scores (query_embedding, row_embedding); higher means
            more related. Defaults to cosine similarity.
        top_n: Maximum number of results to return.

    Returns:
        Two parallel tuples `(strings, relatednesses)` holding at most `top_n`
        entries. Both are empty when `df` has no rows.
    """
    # Nothing to rank: skip the API call and avoid unpacking an empty zip(),
    # which would raise ValueError.
    if len(df) == 0:
        return (), ()

    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]

    # Walk the two columns directly instead of materialising a Series per row.
    scored = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    best = scored[:top_n]
    if not best:
        return (), ()
    strings, relatednesses = zip(*best)
    return strings, relatednesses

In [19]:
# Sanity-check retrieval: show the 5 most related reports, with their scores, for a sample question.
strings, relatednesses = strings_ranked_by_relatedness('What is the name of the well on 11/23/2020?', df_embeddings_clean, top_n=5)
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")
    display(string)

relatedness=0.823


"On Day 11/29/2020, the Well Name is FORGE 16A [78]-32 and the Rig supervisor is Leroy S, Paul S, Bob F, Duane W. The current operation is drilling at an elevation of 7,294', with an elevation of 5415.1 on the Drill Floor Square. The Directional Drilling is 40 and the Measured Depth is nan. The planned activity is to drill tangent, TOOH, and run reaming assembly from 06:00 to 06:30 for 0.5 hours. The phase is Production Drilling and the code is Other. The operation is to complete laying down BHA #22. \n\nThen, from 06:30 to 08:30 for 2.0 hours, the operation is to pick up BHA #23 to drill the curve. After that, from 08:30 to 10:00 for 1.5 hours, the operation is to trip in the hole with BHA #23 to 3,000'. From 10:00 to 10:30 for 0.5 hours, the operation is to circulate for temperature at 3,000'. \n\nNext, from 10:30 to 12:00 for 1.5 hours, the operation is to trip in the hole with BHA #23 to 5,206'. From 12:00 to 12:30 for 0.5 hours, the operation is to circulate for temperature at 5,2

relatedness=0.816


'On day 11/23/2020, the well name is Forge 16A [78]-32. The rig supervisor is Duane Winkler, Leroy Swearingen, Paul Stoud. The current operation is coring, with an elevation of 5415.1. The drill floor square is 24.0. The directional drilling is at 34. The measured depth is not available.\n\nThe planned activity is to complete coring, run wireline seismic shot, and pick up curve assembly from 06:00 to 06:30 for 0.5 hours. This phase is part of the production drilling code, specifically coring. Before the operation begins, there will be a pre-job safety meeting (PJSM).\n\nDuring the coring operation, the well will be drilled from 5,855\' to 5,856\' with a weight on bit (wob) of 8-9K, a rate of penetration (rom) of 40, and a gallons per minute (gpm) of 350. This will take place from 06:30 to 11:30 for 5.0 hours.\n\nAfter retrieving 8\' of core from drilling 10\' (5,846\' to 5,856\'), the total core recovered is 26\'.\n\nFrom 11:30 to 13:30 for 2.0 hours, a rig service will be conducted as

relatedness=0.815


'On Day 11/22/2020, the well name is FORGE 16A [78]-32. The rig supervisor is Duane Winkler, Leroy Swearingen, and Paul Stoud. The current operation is coring, with an elevation of 5415.1 and a drill floor square of 23.0. The directional drilling is at 33, and the measured depth is unknown. The planned activity for the day is coring. \n\nFrom 06:00 to 16:00, there is a ten-hour phase of production drilling with the code "Drilling." The operation during this time is PJSM (pre job safety meeting). \n\nThe next step is to drill ahead from 5,793\' to 5,846\' using BHA #14, with a length of 53\' and a speed of 5.3 FPH. \n\nAt 5,800\', a survey check shot is taken, with an inclination of 1.89°. \n\nAfter making a connection at 5,846\', the rate of penetration (ROP) dropped to 1.6 FPH. Therefore, it was decided to trip out of the hole to conduct coring. \n\nDuring this process, the parameters are set at 30K WOB, 35 RPM rotary, 145 psi differential, SPP 2,770 psi, 602 gpm, and 3,900-4,100 torq

relatedness=0.815


'On Day 11/12/2020, the Well Name is FORGE 16A [78]-32. The Rig supervisor is Bob Frank, Duane Winkler, and Virgil Welch. The current operation is circulating and rig down casing running equipment. The elevation is 5415.1, and the Drill Floor square is 13.0. The Directional Drilling is 23, but the Measured Depth is not specified.\n\nThe planned activities include circulating, running cementers, cementing the well, draining the stack, and washing out the BOP. Additionally, the rig will be taking down the flow line to prepare for the installation of the well head. This will be done from 06:00 to 10:30, which will take approximately 4.5 hours. The phase for this operation is Intermediate Drilling, and the code is Other.\n\nAfter that, from 10:30 to 14:00 (approximately 3.5 hours), the operation will involve completing the installation of pump #2 module. The phase remains Intermediate Drilling, but the code changes to Trips.\n\nFrom 14:00 to 15:00 (1.0 hour), the operation involves picking

relatedness=0.814


"On Day 11/28/2020, the well name is Forge 16A [78]-32. The rig supervisor is Leroy S, Paul S, Bob F, and Duane W. The current operation is a trip out of the hole with BHA 22. The elevation is 5415.1 and the drill floor square is 29.0. The directional drilling is 39. The measured depth is not available. The planned activity is to pick BHA 23 and then trip in the hole to drill the cure. This activity will take place from 06:00 to 17:30, for a total of 11.5 hours. The phase of the operation is production drilling, and the code is drilling. Before starting the operation, a pre-job safety meeting (PJSM) will be conducted.\n\nDuring the operation, the drill will rotate and slide from 6,724' to 6,913' with a distance of 189'. The footage per hour (FPH) is 16.4. The weight on bit (WOB) will range from 40K-60K, the rotation per minute (rpm) will be 50, and the gallons per minute (gpm) will be 558.\n\nAfter completing the trip out of the hole, from 17:30 to 18:00, a phase change will occur. The

In [20]:
# NOTE(review): this re-declares `num_tokens` with the same behaviour as the
# definition in the embedding section; the later definition shadows the earlier one.
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding for `model`."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the GPT prompt: instructions, the related reports that fit, then the question.

    Reports come from `strings_ranked_by_relatedness(query, df)` in ranked order.
    Each one is appended as a "Context" block as long as the whole prompt
    (instructions + contexts + question) stays within `token_budget` tokens,
    as counted by `num_tokens` for `model`.
    """
    ranked_texts, _ = strings_ranked_by_relatedness(query, df)
    closing_question = f"\n\nQuestion: {query}"
    prompt = 'Use the below reports on oil drilling to answer the subsequent question. If the answer cannot be found in the reports, write "I could not find an answer. Rephrase your question."'
    for text in ranked_texts:
        context_block = f'\n\nContext:\n"""\n{text}\n"""'
        candidate = prompt + context_block + closing_question
        if num_tokens(candidate, model=model) > token_budget:
            # Stop at the first report that does not fit; lower-ranked ones are not tried.
            break
        prompt += context_block
    return prompt + closing_question


def ask(
    query: str,
    df: pd.DataFrame = df_embeddings_clean,
    model: str = GPT_MODEL,
    token_budget: int = 15000 - 500,
    print_message: bool = False,
) -> str:
    """Answer `query` with GPT, grounding it in the most related reports from `df`.

    The user prompt is assembled by `query_message` under `token_budget`, sent
    together with a fixed system message at temperature 0, and the content of
    the first returned choice is handed back. Set `print_message` to print the
    assembled prompt before it is sent.
    """
    user_prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(user_prompt)
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You answer questions about the oil drilling reports."},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

### Just checking

#### 1.

##### New experiment

In [None]:
df_embeddings_clean['text'][14]

'On Day 11/02/2020, the Well Name is FORGE 16A [78]-32 and the Rig supervisor is Leroy Swearingen, Bob Frank, and Duane C Winkler. The current operation is Nipple UP BOPE, and the Elevation is 5415.1. The Drill Floor Square is 3.0 and the Directional Drilling is 13. The Measured Depth is not available (nan). The planned activity is to Complete Nipple UP BOPE and Install Rotating Head from 06:00 to 07:00 for 1.0 hour. The phase for this activity is Surface Casing and the Code is Other. Prior to the operation, a PJSM (pre job safety meeting) will be conducted with FORGE DSM\'s and Frontier Drilling.\n\nThe next task is to prepare the cellar for wellhead installation. This will be done from 07:00 to 08:30 for 1.5 hours. The phase for this task is Surface Casing and the Code is Nipple Up B.O.P.. The operation for this task is to Cut conductor and make the initial cut on the 13 3/8" casing and remove it from the sub base.\n\nFollowing that, from 08:30 to 14:00 for 5.5 hours, the operation i

In [None]:
ask('Who are the supervisors on 11/02/2020?')

'The supervisors on 11/02/2020 are Leroy Swearingen, Bob Frank, and Duane C Winkler.'

In [None]:
ask('Tell me the activity planned on 11/02/2020.')

'The planned activity on 11/02/2020 is to complete the Nipple UP BOPE and install the rotating head. This activity is scheduled to take place from 06:00 to 07:00 for a duration of 1.0 hour. The phase for this activity is Surface Casing, and the code is Other.'

In [None]:
ask('Tell me the name of the well on 11/02/2020.')

'The well name on 11/02/2020 is FORGE 16A [78]-32.'

##### Old experiment (raw, uncleaned text from `df_embeddings`):

In [None]:
df_embeddings['text'][14]

'On Day  11/02/2020 Well Name is  FORGE 16A [78]-32 Rig supervisor is  Leroy Swearingen , Bob Frank, Duane C Winkler. PRESENT operation is  Nipple UP BOPE ELEVAtion is 5415.1 Drill Floor Square is 3.0 Directional Drilling is 13 Measured Depth is nan. Activity planned is  Complete Nipple UP BOPE, Install Rotating Head, From 06:00 to 07:00 For 1.0 Hours Phase is Surface Casing Code is Other. Operation is PJSM (pre job safety meeting) with FORGE DSM\'s and Frontier Drilling.\nPrepare cellar to install wellhead. And From 07:00 to 08:30 For 1.5 Hours Phase is Surface Casing Code is Nipple Up B.O.P.. Operation is Cut conductor and initial cut on 13 3/8" casing and remove from sub base. And From 08:30 to 14:00 For 5.5 Hours Phase is Surface Casing Code is Nipple Up B.O.P.. Operation is Make final cut on 13 3/8" casing and set on wellhead.\nWeld on wellhead performing preheat and post heat requirements as procedure.\nTest to 1,500 psi for 10 minutes and 30 minutes.  Tests were good. And From 1

In [None]:
ask('Who are the supervisors on 11/02/2020?')

'On 11/02/2020, the supervisors are Leroy Swearingen, Bob Frank, and Duane Winkler.'

In [None]:
ask('What was the activity planned on 11/02/2020?')

'I could not find an answer.'

In [None]:
ask('What is the name of the well on 11/02/2020?')

'I could not find an answer.'

#### 2.

###### New experiment:

In [None]:
df_embeddings_clean['text'][30]

'On November 18, 2020, the well name is FORGE 16A [78]-32. The rig supervisor is Duane Winkler, Leroy Swearingen, and Virgil Welch. The current operation is tripping in the hole with BHA #12 core assembly. The elevation is 5415.1, and the drill floor square is 19.0. The directional drilling is 29, and the measured depth is not available.\nThe planned activity is to cut the core from 5,495\' from 06:00 to 07:00 for 1.0 hour. This phase is production drilling, and the code is other. The operation is PJSM, which stands for pre-job safety meeting.\nNext, we need to prepare the rig floor for core tools and stage the core tools to the rig floor. This will be done from 07:00 to 09:00 for 2.0 hours. Again, this is a production drilling phase with the code other. The operation is to pick up and make up a 7" core barrel to cut a 4" core.\nFrom 09:00 to 11:00, we will be performing a trip in the hole with BHA #11. This will take 2.0 hours and is classified as production drilling with the code tri

In [None]:
ask('What is the present operation on 11/18/2020?')

'On November 18, 2020, the present operation is tripping in the hole with BHA #12 core assembly.'

In [None]:
ask('What is the activity planned on 11/18/2020?')

"The activity planned on 11/18/2020 is to cut the core from 5,495' from 06:00 to 07:00 for 1.0 hour. This phase is classified as production drilling, and the code is other."

In [None]:
ask('What does PJSM stand for?')

'PJSM stands for Pre Job Safety Meeting.'

###### Old experiment (raw, uncleaned text from `df_embeddings`):

In [None]:
df_embeddings['text'][30]

'On Day  11/18/2020 Well Name is  FORGE 16A [78]-32 Rig supervisor is  Duane Winkler, Leroy Swearingen, Virgil Welch. PRESENT operation is  TRIPPING IN HOLE WITH BHA #12 CORE ASSEMBLY ELEVAtion is 5415.1 Drill Floor Square is 19.0 Directional Drilling is 29 Measured Depth is nan. Activity planned is  CUT CORE FROM 5,495\' From 06:00 to 07:00 For 1.0 Hours Phase is Production Drilling Code is Other. Operation is PJSM, pre job safety meeting\nPrepare rig floor for core tools\nStage core tools to rig floor And From 07:00 to 09:00 For 2.0 Hours Phase is Production Drilling Code is Other. Operation is Pick up make up 7" core barrel to cut 4" core And From 09:00 to 11:00 For 2.0 Hours Phase is Production Drilling Code is Trips. Operation is Trip in hole BHA  # 11 And From 11:00 to 11:30 For 0.5 Hours Phase is Production Drilling Code is Cond Mud & Circ. Operation is Fill pipe and circulate one bottom up And From 11:30 to 12:30 For 1.0 Hours Phase is Production Drilling Code is Rig Service. O

In [None]:
ask('What is the present operation on 11/18/2020?')

'I could not find an answer.'

In [None]:
ask('What is the activity planned on 11/18/2020?')

"On 11/18/2020, the activity planned is to cut the core from 5,495'."

In [None]:
ask('What does PJSM stand for?')

'PJSM stands for Pre Job Safety Meeting.'

#### 3.

In [None]:
df_embeddings_clean['text'][35]

'On day 11/23/2020, the well name is Forge 16A [78]-32. The rig supervisor is Duane Winkler, Leroy Swearingen, Paul Stoud. The current operation is coring, with an elevation of 5415.1. The drill floor square is 24.0. The directional drilling is at 34. The measured depth is not available.\n\nThe planned activity is to complete coring, run wireline seismic shot, and pick up curve assembly from 06:00 to 06:30 for 0.5 hours. This phase is part of the production drilling code, specifically coring. Before the operation begins, there will be a pre-job safety meeting (PJSM).\n\nDuring the coring operation, the well will be drilled from 5,855\' to 5,856\' with a weight on bit (wob) of 8-9K, a rate of penetration (rom) of 40, and a gallons per minute (gpm) of 350. This will take place from 06:30 to 11:30 for 5.0 hours.\n\nAfter retrieving 8\' of core from drilling 10\' (5,846\' to 5,856\'), the total core recovered is 26\'.\n\nFrom 11:30 to 13:30 for 2.0 hours, a rig service will be conducted as

In [None]:
ask('What is the measured depth available on 11/23/2020?')

'The measured depth is not available on 11/23/2020.'

In [None]:
ask('What is the activity planned on 11/23/2020?')

'On 11/23/2020, the planned activity is to trip out of the hole and run wireline seismic shot.'

In [None]:
ask('Who are the rig supervisors on 11/23/2020?')

'The rig supervisors on 11/23/2020 are Duane Winkler, Leroy Swearingen, and Paul Stoud.'

In [None]:
df_embeddings['text'][35]

'On Day  11/23/2020 Well Name is  FORGE 16A [78]-32 Rig supervisor is  Duane Winkler, Leroy Swearingen, Paul Stoud. PRESENT operation is  CORING ELEVAtion is 5415.1 Drill Floor Square is 24.0 Directional Drilling is 34 Measured Depth is nan. Activity planned is  COMPLETE CORING, RUN WIRELINE SEISMIC SHOT, PICK UP CURVE ASSEMBLY From 06:00 to 06:30 For 0.5 Hours Phase is Production Drilling Code is Coring. Operation is PJSM, pre job safety meeting \nCore from 5,855\' to 5,856\'.  8-9K wob, 40 rom, 350 gpm. And From 06:30 to 11:30 For 5.0 Hours Phase is Production Drilling Code is Trips. Operation is Trip out of hole with BHA #15, core BHA due to slow penetration.\nLay down core barrels.  Drilled 10\' (5,846\' to 5,856\') and recovered 8\' of core.\nTotal core recovered is 26\'. And From 11:30 to 13:30 For 2.0 Hours Phase is Production Drilling Code is Other. Operation is 1 hour rig service.\nPreparing BHA for clean out run. And From 13:30 to 18:00 For 4.5 Hours Phase is Production Drill

In [None]:
ask('What is the activity planned on 11/23/2020?')

'I could not find an answer.'

In [None]:
ask('Who are the rig supervisors on 11/23/2020?')

'I could not find an answer.'

#### New

In [None]:
df_embeddings_clean['text'][0]

'On day January 1st, 2021, the well name is "Forge 16A [78]-32". The rig supervisor is Leroy S, Paul S, Bob F, and Duane W. The current operation is rigging up Wyoming casing with an elevation of 5415.1 and a drill floor square of 63.0. The directional drilling is at 73 degrees with a measured depth of 10987.0. \n\nThe planned activity is to break off the hanger assembly and check for the setting ball. The next step is to continue running casing to a depth of 10,787\'. From 06:00 to 07:30, for 1.5 hours, the phase is production casing, and the code is to run casing and cement. The operation is to run 126 joints of 7\' x 38 ppf, T95 JFELION casing to a depth of 5,936\'. The casing should be rotated at 10 rpm with a torque of 6K. \n\nFrom 07:30 to 08:30, for 1.0 hour, the phase is production casing, and the code is to run casing and cement. The operation is to rig down CRT and casing equipment, pick up/slack off 220K. \n\nFrom 08:30 to 09:30, for 1.0 hour, the phase is production casing,

In [None]:
ask('What is the name of the well on January 1st, 2021?')

'The name of the well on January 1st, 2021, is "Forge 16A [78]-32".'

In [None]:
ask('How much should the casing be rotated on January 1st, 2021?')

'On January 1st, 2021, the casing should be rotated at 10 rpm.'

In [None]:
ask('What is the planned activity on January 1st, 2021?')

"The planned activity on January 1st, 2021, is to break off the hanger assembly and check for the setting ball. The next step is to continue running casing to a depth of 10,787'."

In [None]:
ask('What was the phase from 06:00 to 07:30 on January 1st, 2021?')

'The phase from 06:00 to 07:30 on January 1st, 2021, was "production drilling" with the code "trips".'