<a href="https://colab.research.google.com/github/fazlicodes/ADNOC_NLP_QuestionAndAnswering_System/blob/main/BardAPI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initializations

In [None]:
pip install -q google-generativeai

In [None]:
!pip install openai
%env OPENAI_API_KEY= #Your API Key
import pandas as pd
import openai

In [None]:
import pprint
import google.generativeai as palm
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive; later cells save and load the
# de-duplicated question files under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
from getpass import getpass

# The API key is never written into the notebook: it is taken from the
# PALM_API_KEY environment variable when set, otherwise prompted for
# interactively (getpass does not echo the input into the cell output).
palm.configure(api_key=os.environ.get("PALM_API_KEY") or getpass("PaLM API key: "))

In [None]:
# Keep only the models that advertise the 'generateText' method and use the
# name of the first one for every generate_text call below.
models = list(
    filter(
        lambda candidate: 'generateText' in candidate.supported_generation_methods,
        palm.list_models(),
    )
)
model = models[0].name
print(model)

# Q&A Generation

In [None]:
# Load the context table; the 'clean_gpt3.5' column is what the question
# generation cell below feeds to get_questions.
df = pd.read_csv('/content/clean_context_gpt3_5.csv')


In [None]:
def get_questions(context, temp=0):
    """Ask the PaLM text model for 30 questions about one drill report.

    The prompt ends with "1." so that the model continues a numbered list;
    the returned text therefore starts right after that first index.

    Args:
        context: Drill-report text the questions should be based on.
        temp: Sampling temperature forwarded to ``palm.generate_text``.

    Returns:
        The generated text (``completion.result``), or "" when the request
        raised an exception.
    """
    try:
        prompt = f"in different question styles (how, where, when, etc.) Write 30 of informative questions (without answers) based on the drill report below\n\nText: {context}\n\nQuestions:\n1."
        completion = palm.generate_text(
            model=model,
            prompt=prompt,
            temperature=temp,
            # The maximum length of the response
            max_output_tokens=4000,
        )
        return completion.result
    except Exception as e:
        # Best effort: one failed request must not abort a whole DataFrame
        # apply, but the reason is reported instead of being silently dropped.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during long runs.
        print(e)
        return ""

In [None]:
# Generate questions for every context at temperature 1, then put back the
# "1." index that the prompt itself supplied so each list is fully numbered.
generated_questions = df['clean_gpt3.5'].apply(get_questions, temp=1)
df['questions2'] = "1." + generated_questions
# Show the questions produced for the first row as a sanity check.
print(df['questions2'].iloc[0])

In [None]:
def get_answers(df, temp=0):
    """Ask the PaLM text model to answer one row's questions.

    Args:
        df: A single row-like object (despite the name) exposing the
            attributes ``clean_gpt`` (report text) and ``questions2``
            (numbered questions).
        temp: Sampling temperature forwarded to ``palm.generate_text``.

    Returns:
        The generated text (``completion.result``); "" if anything raised,
        in which case the exception is printed.
    """
    try:
        prompt_text = (
            "Write the answer of the following questions based on the drill report below\n\n"
            f"Text: {df.clean_gpt}\n\n"
            f"Questions:\n{df.questions2}\n\n"
            "Answers:\n1."
        )
        response = palm.generate_text(
            model=model,
            prompt=prompt_text,
            temperature=temp,
            max_output_tokens=4000,  # upper bound on the response length
        )
        answer_text = response.result
    except Exception as err:
        print(err)
        answer_text = ""
    return answer_text

In [None]:
# Copy the context column to a name usable with attribute access:
# get_answers reads it as `df.clean_gpt`, which "clean_gpt3.5" cannot provide.
df["clean_gpt"] = df["clean_gpt3.5"]

In [None]:
# Row-wise apply (axis=1): each row is passed to get_answers, which reads its
# clean_gpt and questions2 fields; answers are requested at temperature 0.
df['answers2']= df.apply(get_answers, axis=1, temp=0)

In [None]:
# Persist context, questions and answers; the question-filtering section
# below reloads this same CSV path.
df.loc[:,["clean_gpt","questions2","answers2"]].to_csv('/content/q&a_exp6_temp1_cleancontext.csv', index=False)

In [None]:
# Ad-hoc spot check. get_answers takes ONE row-like object exposing
# `clean_gpt` and `questions2` (plus an optional temp), so the hand-written
# question is wrapped in a Series together with the first report's context
# rather than being passed as separate positional arguments.
spot_check_row = pd.Series({
    "clean_gpt": df["clean_gpt"].iloc[0],
    "questions2": "1. what was going on at hour 21:00? and what does that mean?",
})
print(get_answers(spot_check_row, temp=0))

# Question filtering

In [None]:
pip install datasketch


In [None]:
!pip install nltk

In [None]:
# Reload the saved Q&A table and flatten every non-null questions cell into a
# single newline-separated string.
qa_csv_path = '/content/q&a_exp6_temp1_cleancontext.csv'
df = pd.read_csv(qa_csv_path)
non_null_questions = df['questions2'].dropna()
questions_combined = '\n'.join(non_null_questions)

In [None]:
import re

# Leading list index such as "12. " or "12." at the start of a line.
_INDEX_PREFIX = re.compile(r'^\s*\d+\.\s*')

# Strip the index from every generated line. Lines that carry no index are
# kept as they are (instead of raising IndexError) and blank lines are dropped.
qList = []
for raw_line in questions_combined.splitlines():
    question = _INDEX_PREFIX.sub('', raw_line, count=1).strip()
    if question:
        qList.append(question)

# Remove exact duplicates while keeping first-seen order, so the positional
# indices used by the LSH step are the same on every run.
qList = list(dict.fromkeys(qList))


In [None]:
from datasketch import MinHash, MinHashLSH
from nltk.corpus import stopwords
import nltk

# Fetch NLTK's stopword corpus, then build the filter set from its English
# list plus a few extra words.
nltk.download('stopwords')
additional_stopwords = {"is", "the", "what", "are", "how"}
stopwords_list = set(stopwords.words('english')) | additional_stopwords

def compute_minhash(text):
    """Build a 128-permutation MinHash signature from the words of `text`.

    Each whitespace-separated word is lower-cased and stripped of leading and
    trailing punctuation before both the stopword check and the hashing, so
    "Depth?", "depth" and "DEPTH" all contribute the same token. Words that
    are stopwords, or that are empty after stripping, are skipped.

    Args:
        text: The question to fingerprint.

    Returns:
        A ``MinHash`` updated with the UTF-8 bytes of every kept token.
    """
    import string

    m = MinHash(num_perm=128)
    for word in text.split():
        token = word.lower().strip(string.punctuation)
        if token and token not in stopwords_list:
            m.update(token.encode('utf8'))
    return m

# One (index, signature) pair per question; the index is the position in qList.
minhashes = [(position, compute_minhash(text)) for position, text in enumerate(qList)]

# Index every signature in the LSH structure under its stringified position.
lsh = MinHashLSH(threshold=0.7, num_perm=128)
for idx, minhash in minhashes:
    lsh.insert(str(idx), minhash)

# Walk the questions in order; each one not yet assigned to a group starts a
# new group made of its LSH matches that are also still unassigned.
similar_question_groups = []
seen = set()
for idx, minhash in minhashes:
    if str(idx) in seen:
        continue
    similar_questions = [key for key in lsh.query(minhash) if key not in seen]
    seen.update(similar_questions)
    similar_question_groups.append(similar_questions)


In [None]:
def isSimilar(Q1,Q2):
  """Ask the PaLM text model whether two questions mean the same thing.

  Args:
    Q1: First question.
    Q2: Second question.

  Returns:
    True when the model's reply, ignoring case and surrounding whitespace,
    starts with "yes"; False for any other reply or when there is no reply.
  """
  prompt = f"does these two questions have the same meaning: \n Q1 \n {Q1} \n Q2 \n {Q2}?"
  completion = palm.generate_text(
      model=model,
      prompt=prompt,
      temperature=0,
      # The maximum length of the response
      max_output_tokens=400,
  )
  # Guard against a missing result (None) and accept replies such as "Yes."
  # or " yes\n" that an exact comparison with "yes" would reject.
  verdict = (completion.result or "").strip().lower()
  return verdict.startswith("yes")


In [None]:

def saveQuestionsToFile(questions, filename):
    """Write every question to `filename`, one per line, replacing the file."""
    with open(filename, "w") as out_file:
        out_file.writelines(question + "\n" for question in questions)

def loadQuestionsFromFile(filename):
    """Read `filename` and return its lines as a list, without line endings."""
    with open(filename, "r") as in_file:
        contents = in_file.read()
    return contents.splitlines()


#/content/drive/MyDrive/

In [None]:
def isSimilar2(Q1,Q2):
  """Ask gpt-3.5-turbo whether two questions mean the same thing.

  Args:
    Q1: First question.
    Q2: Second question.

  Returns:
    True when the model's reply, ignoring case and surrounding whitespace,
    starts with "yes"; False otherwise.
  """
  response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-16k-0613",
    # Temperature 0 keeps the yes/no verdict as repeatable as the API allows.
    temperature=0,
    messages=[
      {"role": "system", "content": "You are a similarity detector, if two questions are similar you print 'yes' and if they are different you print 'no'."},
      {"role": "user", "content": f"does these two questions have the same meaning: \n Q1 \n {Q1} \n Q2 \n {Q2}?"},
    ])
  # Accept "Yes", "yes." and similar variants that an exact comparison with
  # "yes" would count as a "no".
  verdict = (response['choices'][0]["message"]["content"] or "").strip().lower()
  return verdict.startswith("yes")


In [None]:
def getUniqueQuestions(qList):
    """Drop questions that have a similar question later in the list.

    A question is kept only if ``isSimilar2`` reports no match against any
    question that follows it, so the last member of each run of similar
    questions is the one that survives. The comparison stops at the first
    match to avoid paying for API calls whose outcome can no longer matter.

    Args:
        qList: Questions to de-duplicate.

    Returns:
        The kept questions in their original order, with exact duplicate
        strings collapsed to one entry.
    """
    uniqueQuestions = []
    for i, question in enumerate(qList):
        # any() short-circuits on the first similar later question.
        has_later_match = any(
            isSimilar2(question, later) for later in qList[i + 1:]
        )
        if not has_later_match:
            uniqueQuestions.append(question)

    # dict.fromkeys removes exact duplicates but, unlike set(), preserves order.
    return list(dict.fromkeys(uniqueQuestions))


In [None]:
import time

#uniqueList = getUniqueQuestions(qList2)
# Run the pairwise similarity filter inside each LSH group only, collecting
# the survivors of every group into one flat list.
unique_questions = []
for group in similar_question_groups:
    # Group members are stringified positions into qList.
    group_questions = [qList[int(key)] for key in group]
    print(group_questions)
    unique_questions += getUniqueQuestions(group_questions)
# Write the surviving questions to Drive, one per line.
saveQuestionsToFile(unique_questions, "/content/drive/MyDrive/unique_questions2.txt")

In [None]:
# Load the questions back from the file
# NOTE(review): this reads "unique_questions.txt", while the cell above writes
# "unique_questions2.txt" — confirm the older file is the one intended here.
loaded_questions = loadQuestionsFromFile("/content/drive/MyDrive/unique_questions.txt")

In [None]:
# Display the full list of loaded questions as the cell output.
loaded_questions