## Installing Required Dependencies

In [None]:
!pip install --user openai

In [None]:
!pip install --user python-dotenv

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, roc_curve, auc, classification_report

import os
import dotenv
import time

import openai

## Loading OpenAI API Key

In [2]:
dotenv.load_dotenv('../env/.env')

True

In [3]:
openai.organization = os.getenv("OPENAI_ORG_ID")
openai.api_key = os.getenv("OPENAI_API_KEY")

## Prompt Engineering and Getting API Result

In [7]:
def ask_gpt(q1, q2, max_retries=2):
    retries = 0
    response = None

    while retries < max_retries:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                    "role": "system",
                    "content": "You are tasked to look at question pairs from the platform Quora and decide if they are duplicates or not."
                    },
                    {
                    "role": "user",
                    "content": "The following are two questions from Quora, the social question-and-answer website." +
                                f"\n\nQuestion 1:\n\"{q1}\"\n\nQuestion 2:\n\"{q2}\"\n\nThink through both the questions above, " +
                                 "understand the semantics of these two questions and decide if they are semantically similar to each other or not, " +
                                "i.e. decide if they are duplicates. You do not need to be very strict;  it is fine for Question 1 and Question 2 to contain " +
                                "slight differences in semantics and be considered as not duplicates.\n\nIf they are duplicates, your response should be a \"1\"." +
                                "If they are not duplicates, your response should be a \"0\". \n\nIt suffices to just respond with a single value \"1\" or \"0\"."
                    }
                ],
                temperature=1,
                max_tokens=1,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0
            )
            response = int(response['choices'][0]["message"]["content"])
            break
        except Exception as e:
            print(e)
            retries += 1
            if retries >= max_retries:
                response = None
            else:
                delay = 2 ** retries
                print(f"Retrying in {delay} seconds")
                time.sleep(delay)
    return response

## Loading Dataset

In [8]:
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


### Null removals

In [9]:
print(f"initial dataframe has {len(df)} rows.")
null_rows = df[df.isnull().values.any(1)]
print(f"dataframe has {len(null_rows)} null rows.")
df.dropna(how='any', inplace=True)
print(f"dataframe has {len(df)} rows after removing null values.")

initial dataframe has 404290 rows.
dataframe has 3 null rows.
dataframe has 404287 rows after removing null values.


In [10]:
num_samples = len(df)
num_samples

404287

### Splitting

In [12]:
X = df.iloc[:,:-1]
y = df.iloc[:, -1]

In [13]:
# split into train/val, test sets
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# split into train and val sets
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

In [14]:
X_train.shape, X_val.shape, X_test.shape

((258743, 5), (64686, 5), (80858, 5))

## Asking GPT

In [15]:
gpt_responses = []

for index, row in tqdm(X_test.iterrows()):
    q1 = row['question1']
    q2 = row['question2']

    response = ask_gpt(q1, q2)
    gpt_responses.append(response)
    
    time.sleep(1)

0it [00:00, ?it/s]

Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Retrying in 2 seconds
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Retrying in 2 seconds
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Retrying in 2 seconds
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Retrying in 2 seconds
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Retrying in 2 seconds
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Retrying in 2 seconds
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Retrying in 2 seconds
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (

KeyboardInterrupt: 

## Evaluation (Benchmark)

In [41]:
labels = y_test.tolist()
accuracy = accuracy_score(gpt_responses, labels[:385])
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 72.47%
