<a href="https://colab.research.google.com/github/darshanja/shell_hackathon_2023/blob/main/shell_hackathon_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import openai
import pandas as pd
import numpy as np
import time

#### Read JSON data

In [None]:
# Load the dataset from the JSON file in the working directory into a DataFrame.
df = pd.read_json('datafinal.json')
# Show (rows, columns) as the cell output.
df.shape

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Preview the first few rows of the loaded data.
df.head()

### Set up Azure OpenAI GPT-4

In [None]:
# Azure OpenAI connection settings.
# The values are read from environment variables so that credentials never
# have to be typed into (or committed with) the notebook. Each setting falls
# back to the original empty placeholder when its variable is not set.
openai.api_type = os.environ.get("OPENAI_API_TYPE", "")
openai.api_base = os.environ.get("OPENAI_API_BASE", "")
openai.api_version = os.environ.get("OPENAI_API_VERSION", "")
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
def use_gpt4(system_prompts, paragraph, temperature=0):
    """
    Send one system prompt and one user paragraph to the GPT-4 chat model and
    return the text of the reply. It takes in three parameters:

    system_prompts: str
        The system message that tells the model how to handle the paragraph.

    paragraph: str
        The text sent as the user message.

    temperature: float, optional
        Sampling temperature forwarded to the API. A higher temperature gives
        more varied responses, a lower one more predictable responses.
        The default value is 0.

    Returns:
    str
        The message content of the first choice in the completion.
    """
    # Build the conversation from the arguments only, so the function does not
    # depend on a notebook-level `system_prompt` variable being defined.
    messages = [
        {'role': 'system', 'content': system_prompts},
        {'role': 'user', 'content': paragraph},
    ]

    completion = openai.ChatCompletion.create(
        # NOTE(review): presumably the name of the Azure deployment - confirm.
        engine='gpt-4-32k',
        temperature=temperature,
        messages=messages,
    )

    return completion['choices'][0]['message']['content']

In [None]:
def run_gpt4(df, system_prompts, temperature=0):
    """
    Query the GPT-4 model once for every row of a DataFrame and collect the
    replies. It takes in three parameters:

    df: pandas.DataFrame
        The DataFrame whose 'Text' column holds the paragraphs to send.

    system_prompts: str
        The system prompt passed to use_gpt4 with every paragraph.

    temperature: float, optional
        Sampling temperature passed to use_gpt4 for every request.
        The default value is 0.

    Returns:
    list
        The replies returned by use_gpt4, one per row, in row order.
    """
    pred_lst = []
    # Only the 'Text' column is used, so iterate it directly instead of
    # building a full row object for every record with iterrows().
    for paragraph in df['Text']:
        pred = use_gpt4(
            system_prompts=system_prompts,
            paragraph=paragraph,
            temperature=temperature,
        )
        pred_lst.append(pred)
        # Pause after each request (presumably to stay under API rate limits).
        time.sleep(2)

    return pred_lst

In [None]:
# System prompt sent with every paragraph: it asks the model to return any
# code it finds, or the literal sentinel "No code found" otherwise. A later
# cell replaces that exact sentinel with an empty string, so the wording of
# the sentinel must stay in sync with that replacement.
system_prompt = """Check if there is any code snippet of any language in the given paragraph, if you find any code,
just return the code else return No code found"""

In [None]:
%%time
# Makes one GPT-4 request per row of df. run_gpt4 also sleeps 2 seconds after
# every request, so this cell takes at least 2 seconds per row.
code_pred_lst = run_gpt4(df=df, system_prompts=system_prompt, temperature = 0)

In [None]:
# Attach the model replies as a new column.
df['pred_gpt4'] = code_pred_lst
# Keep any existing CodeList column under CodeList_old and promote the
# predictions to CodeList, both in a single rename.
df.rename(
    columns={'CodeList': 'CodeList_old', 'pred_gpt4': 'CodeList'},
    inplace=True,
)
# Rows where the model reported no code become empty strings.
df['CodeList'] = df['CodeList'].replace('No code found', '')
df.head()

In [None]:
# In case the column contains any NaN values, replace them with empty strings.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place call can operate on a temporary
# object and leave df unchanged (pandas Copy-on-Write behaviour).
df["CodeList"] = df["CodeList"].fillna("")

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# Binarize the CodeList column into an indicator matrix.
# NOTE(review): the entries of df["CodeList"] appear to be plain strings
# (model replies or ""), so MultiLabelBinarizer would treat each string as a
# collection of characters - confirm this is the intended label set.
mlb = MultiLabelBinarizer()
s1 = df["CodeList"]
t1 = mlb.fit_transform(s1)

In [None]:
# Wrap the binarized matrix in a DataFrame and keep only its first 93 columns.
submission = pd.DataFrame(t1).iloc[:, :93]
# Write the result without the index column, then display it as the cell output.
submission.to_csv("submission.csv", index=False)
submission