In [3]:
import pandas as pd
import parse_xml

In [88]:
path_posts = 'data/Posts.xml'

In [89]:
posts_df = parse_xml.posts_to_df(path_posts)

In [90]:
# keep only the columns we need

posts_df = posts_df[['Id', 'PostTypeId', 'ParentId', 'Body', 'AcceptedAnswerId']]

In [108]:
questions_df = posts_df[posts_df['PostTypeId'] == 1]
answers_df = posts_df[posts_df['PostTypeId'] == 2]

In [110]:
# remove all tags from the body of the posts using regex

questions_df['Body'] = questions_df.loc[:, 'Body'].str.replace(r'<[^>]*>', '', regex=True)
answers_df['Body'] = answers_df.loc[:, 'Body'].str.replace(r'<[^>]*>', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions_df['Body'] = questions_df.loc[:, 'Body'].str.replace(r'<[^>]*>', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  answers_df['Body'] = answers_df.loc[:, 'Body'].str.replace(r'<[^>]*>', '', regex=True)


In [111]:
# remove the questions that have accepted answer id not 0

questions_df = questions_df[questions_df['AcceptedAnswerId'] != 0].set_index('Id')
answers_df = answers_df[answers_df['Id'].isin(questions_df['AcceptedAnswerId'])].set_index('Id')
print(questions_df.shape[0], answers_df.shape[0])

12014 12014


In [112]:
# make Id the index of the dataframes
questions_df = questions_df.drop(columns=['PostTypeId', 'ParentId'])
answers_df = answers_df.drop(columns=['PostTypeId', 'AcceptedAnswerId'])

In [247]:
# export the dataframes to csv files

questions_df.to_csv('data/questions.csv')
answers_df.to_csv('data/answers.csv')

In [None]:
# import the dataframes from the csv files

questions_df = pd.read_csv('data/questions.csv', index_col='Id')
answers_df = pd.read_csv('data/answers.csv', index_col='Id')

In [114]:
# use sententence-transformers to encode the questions

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

questions_df['embedding'] = questions_df.loc[:, 'Body'].apply(lambda x: model.encode(x))

In [115]:
 # save all the embeddings in faiss index

import faiss
import numpy as np

embeddings = np.array(questions_df['embedding'].tolist())
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [248]:
# save the index to a file

faiss.write_index(index, 'faiss_index')

In [249]:
# read the index from the file

index = faiss.read_index('faiss_index')

In [256]:
def answer_question(question, n=5, index=index, questions=questions_df, answers=answers_df, model=model):
    # Encode the question to get its embedding, ensuring it's in the correct shape
    question_embedding = model.encode([question]).reshape(1, -1)
    # Search the index for the n closest questions
    D, I = index.search(question_embedding, n)
    
    # Get the IDs of the similar questions
    question_ids = questions.iloc[I[0]].index
    
    # Prepare lists to hold the similar questions and their answers
    similar_questions = []
    similar_answers = pd.DataFrame()
    
    # Loop through each similar question ID to get the question text and its answers
    for q_id in question_ids:
        similar_questions.append(questions.loc[q_id].Body)  # Assuming there's a QuestionText column
        q_answers = answers[answers['ParentId'] == q_id]
        similar_answers = pd.concat([similar_answers, q_answers], ignore_index=True)
    
    return question, similar_questions, similar_answers

In [257]:
new_question, similar_questions, similar_answers = answer_question('Are are the methods of imputation in statistics?')

In [211]:
chatgpt_key = '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'

In [246]:
content = "Answer to the question shortly by using the information given in these 5 answers. If the information is not there, say that you dont know"
for i, answer in enumerate(similar_answers['Body']): # Assumes a list of answers
    content += f"Start of Answer {i+1}: {answer}\n End of Answer {i+1}\n\n\n"

In [244]:
from openai import OpenAI

client = OpenAI(api_key=chatgpt_key)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "assistant",
            "content": content,
        }
    ],
    model="gpt-3.5-turbo",
    temperature=0.0,
    max_tokens=150,
)


In [253]:
chat_completion.choices[0].message.content

'To handle missing data, you can consider visualizing the missing values in your dataset using libraries like missingno. Depending on the nature of your data and the missingness pattern (MCAR, MAR, MNAR), you can choose between deletion methods (listwise or pairwise deletion), single imputation methods (mean/median/mode substitution, regression imputation, LOCF), or model-based methods (maximum likelihood, multiple imputation). Understanding the reasons for missing data and the context of your dataset can help you decide on the most appropriate imputation technique.'