In [1]:
# ! pip install -U openai

In [1]:
import pandas as pd

In [10]:
# Load the links table; a later cell slices its "link" column to get the URLs to query.
df = pd.read_csv("top_1000_so_links.csv")

In [13]:
import requests
import time
from tqdm import tqdm
import os
import openai
import concurrent.futures

# Read the OpenAI key from the OPENAI_API_KEY environment variable so the
# secret is never stored in the notebook; falls back to an empty string.
api_key = os.environ.get('OPENAI_API_KEY', '')

def get_qa(prompt):
    """Send `prompt` to the chat completion endpoint and return the reply text.

    Args:
        prompt: Text sent as a single user message to the "gpt-3.5-turbo" model.

    Returns:
        The content of the first returned choice, or an empty string when the
        request fails (the failure is printed rather than raised).
    """
    # Default result so a failed request still returns a string instead of
    # hitting an unbound local at the `return` below.
    qa = ""
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo", api_key=api_key,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )

        qa = completion.choices[0].message.content

    except (requests.exceptions.RequestException, openai.error.OpenAIError) as e:
        # The pre-1.0 openai client (the one exposing ChatCompletion) reports
        # API failures through openai.error.OpenAIError subclasses, so catch
        # those alongside raw requests errors.
        print(f'Request failed with error: {str(e)}.')
    return qa

def get_qa_for_url(url):
    """Ask the model for the question/answer pair behind `url`.

    Wraps the URL in the extraction prompt, passes it to `get_qa`, and
    returns whatever `get_qa` returns.
    """
    return get_qa(f"extract question and answer from the url - {url}")


In [14]:

num_rows_start = 0
num_rows_end = 100

# `df` is the links table loaded in an earlier cell
urls = df["link"][num_rows_start:num_rows_end].tolist()

# Define the maximum number of concurrent requests to make
max_workers = 8

# Pre-size the result list so each response lands at the position of its URL:
# as_completed() yields futures in completion order, not submission order,
# which would otherwise make `resp` differ from run to run.
resp = [None] * len(urls)
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit a job for each URL, remembering which position it belongs to
    futures = {
        executor.submit(get_qa_for_url, url): position
        for position, url in enumerate(urls)
    }

    # Store each result as it finishes; tqdm counts the completed futures
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
        resp[futures[future]] = future.result()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:21<00:00,  1.42s/it]


In [1]:
# resp

In [15]:
def _split_qa(text):
    """Split one model reply into its (question, answer) parts.

    The reply is cut at the FIRST "Answer:" marker only, so an answer that
    itself contains the text "Answer:" is kept whole. The question is whatever
    follows the last "Question:" marker in the part before the answer. When
    no "Answer:" marker is present, the whole reply is used as the answer.
    """
    parts = text.split("Answer:", 1)
    question = parts[0].split("Question:")[-1]
    answer = parts[-1]
    return question, answer


question_list = []
answer_list = []
for text in resp:
    question, answer = _split_qa(text)
    question_list.append(question)
    answer_list.append(answer)


In [16]:
# Two-column frame pairing each parsed question with its answer.
out_df = pd.DataFrame(dict(question=question_list, answer=answer_list))

In [17]:
# Display the parsed question/answer frame.
out_df

Unnamed: 0,question,answer
0,Why do we need to call zero_grad() in PyTorch?\n,"In PyTorch, backpropagation on a tensor that ..."
1,How do I save a trained model in PyTorch?\n\n,You can save a trained PyTorch model by using...
2,What's the difference between reshape and vie...,The reshape() and view() functions in PyTorch...
3,What does model.eval() do in PyTorch?\n,The `model.eval()` method sets the model to e...
4,What does view() do in PyTorch?\n,The view() method in PyTorch is used to resha...
...,...,...
95,CUDA error: CUBLAS_STATUS_ALLOC_FAILED when c...,This error indicates that there was not enoug...
96,How to display custom images in TensorBoard (...,You can display custom images in TensorBoard ...
97,"Issues installing PyTorch 1.4: ""No matching d...",This error message usually appears when you a...
98,Reshaping a tensor with padding in PyTorch\n\n,Here's one way to do it:\n\n```\nimport torch...


In [171]:
# Name the file after the row range that was fetched, so processing a different
# num_rows_start/num_rows_end slice does not overwrite an earlier batch.
os.makedirs("data", exist_ok=True)
out_df.to_csv(f"data/qa-{num_rows_start}to{num_rows_end}.csv", index=False)

### Concat all data

In [172]:
# Keep only CSV files (os.listdir also returns sub-directories and hidden
# entries such as .ipynb_checkpoints) and sort the names, because os.listdir
# returns them in arbitrary order.
csv_files = sorted(name for name in os.listdir("data") if name.endswith(".csv"))

In [173]:
# Read every batch file into its own frame. The loop variable is deliberately
# not named `df`, so the links table loaded at the top of the notebook is not
# overwritten and the earlier cells can still be re-run.
list_of_dataframes = [
    pd.read_csv(os.path.join("data", file)) for file in csv_files
]

# Stack all batches under one fresh 0..n-1 index.
concatenated_df = pd.concat(list_of_dataframes, ignore_index=True)


In [174]:
# (rows, columns) of the combined frame.
concatenated_df.shape

(1001, 2)

In [175]:
# Write the combined frame to disk without the index column.
concatenated_df.to_csv("concatenated_df.csv", index=False)

## Concatenate Stack Overflow and forum data

In [273]:
# df1: the combined file written by the concatenation step above.
# df2: the forum export, with its 'Unnamed: 0' column discarded.
df1 = pd.read_csv('concatenated_df.csv')
df2 = pd.read_csv('pytorch_discuss_first_1000.csv').drop(columns='Unnamed: 0')

In [274]:
# Stack both frames row-wise under one fresh 0..n-1 index, then display.
df_merged = pd.concat([df1, df2], axis=0, ignore_index=True)
df_merged

Unnamed: 0,question,answer
0,\n\nPytorch Dataloader fails when the number o...,\n\nThis is expected behavior because when the...
1,"""Multi-dimensional inputs in PyTorch linear m...","""The PyTorch nn.Linear module expects a 2D in..."
2,Fixing the seed for torch.random.split()\n\n,"To fix the seed for torch.random.split(), you..."
3,What is the best way of tqdm for data loader?...,This question is subjective as there can be d...
4,How do I add some Gaussian noise to a tensor ...,There are multiple ways to add Gaussian noise...
...,...,...
1678,How can I reset the weights of a PyTorch model?,You can reset the weights of a PyTorch model b...
1679,How to change the weights of a PyTorch model?,There are a few ways to change the weights of ...
1680,There is no specific question mentioned in the...,There is no answer possible without a specific...
1681,Q: Can't call numpy on variable that requires ...,Q: Can't call numpy on variable that requires ...


## Data cleaning

In [275]:
# Count missing values per column.
df_merged.isna().sum()

question    0
answer      0
dtype: int64

In [276]:
# Rows that exactly repeat an earlier row (every column equal).
duplicates = df_merged.loc[df_merged.duplicated()]
duplicates

Unnamed: 0,question,answer


In [277]:
# Collapse runs of consecutive newlines into a single '\n', then trim any mix
# of spaces, newlines and double quotes from both ends of each value.
#
# One strip(' \n"') call is used because chained
# strip(' ').strip('\n').strip('"') calls remove each character only in that
# fixed order, leaving e.g. the leading space of '\n text' in place.
for column in ('question', 'answer'):
    df_merged[column] = (
        df_merged[column]
        .str.replace(r'\n{2,}', '\n', regex=True)
        .str.strip(' \n"')
    )

In [278]:
# Remove placeholder answers ('None yet', 'N/A', ...) and answers shorter
# than MIN_ANSWER_CHARS characters.

PLACEHOLDER_ANSWERS = ['None yet', 'None yet.', 'None', 'N/A']
MIN_ANSWER_CHARS = 90

# Vectorised masks replace the row-by-row iterrows()/drop() loop, which
# rebuilt the whole frame once per dropped row.
has_real_answer = ~df_merged['answer'].isin(PLACEHOLDER_ANSWERS)
is_long_enough = df_merged['answer'].str.len() >= MIN_ANSWER_CHARS
df_merged = df_merged[has_real_answer & is_long_enough]

In [279]:
# Drop rows whose columns all hold one and the same value.

# number of distinct values found in each row
row_counts = df_merged.nunique(axis=1)

# keep every row whose distinct-value count is not 1, renumbering from 0
df_merged = df_merged.loc[row_counts.ne(1)].reset_index(drop=True)
df_merged

Unnamed: 0,question,answer
0,Pytorch Dataloader fails when the number of ex...,This is expected behavior because when the num...
1,Multi-dimensional inputs in PyTorch linear met...,The PyTorch nn.Linear module expects a 2D inpu...
2,Fixing the seed for torch.random.split(),"To fix the seed for torch.random.split(), you ..."
3,What is the best way of tqdm for data loader?,This question is subjective as there can be di...
4,How do I add some Gaussian noise to a tensor i...,There are multiple ways to add Gaussian noise ...
...,...,...
1520,How to create batches of a list of varying dim...,This can be done using the `nn.utils.rnn.pad_s...
1521,How to extract probabilities?,There are a few different ways to extract prob...
1522,How can I reset the weights of a PyTorch model?,You can reset the weights of a PyTorch model b...
1523,How to change the weights of a PyTorch model?,There are a few ways to change the weights of ...


In [281]:
# Save the cleaned, merged frame without the index column.
df_merged.to_csv("df_sa_forum.csv", index=False)