In [1]:
import pandas as pd
import re
import concurrent.futures
import requests
import openai
from tqdm import tqdm
from langchain.vectorstores.faiss import FAISS
from langchain.text_splitter import CharacterTextSplitter

In [2]:
OPENAI_API_KEY = ''

In [3]:
## clean and process so

def cleanhtml(raw_html):
    CLEANR = re.compile('<.*?>')
    cleantext = re.sub(CLEANR, '', raw_html)
    return cleantext
    
def clean_data(df):
    df["pt_answer"] = df["pt_answer"].apply(lambda x: cleanhtml(x))

    df["question"] = df["pt_title"].str.lower() + "\n" + df["pt_body"]
    df["question"] = df["question"].apply(lambda x: cleanhtml(x))
    df["answer"] = df["pt_answer"].str.lower()

    df = df[['pt_post_id','question', 'answer']]
    return df
    

## get qa and link to post
def get_url(df):
    url = []
    for index, row in df.iterrows():
        url.append(f"https://stackoverflow.com/questions/{row['pt_post_id']}/")
    
    df['source'] = url

    return df


In [4]:

df = pd.read_csv('pt_question_answers_updated.csv')
df.shape

(10763, 12)

In [5]:
df = clean_data(df)
df = get_url(df)
df = df[['question','answer','source']]

In [8]:
df = df[:5000]

df.shape

(5000, 3)

## using openai summarize accepted answer

In [9]:
def get_qa_openai(context, index):
    try:
        completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo", api_key = OPENAI_API_KEY,
                messages=[
                    {"role": "user", "content": context}
                  ]
                )

        qa = completion.choices[0].message.content

    except requests.exceptions.RequestException as e:
        print(f'Request failed with error: {str(e)}.')
        print(f'Waiting for 3 minutes before trying again...')
        time.sleep(180)
    
    return (qa, index)

In [10]:
questions_ans = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    
    futures = []
    for index,i in df.iterrows():

        context = f"Given a pytorch question and answer given below, How will an expert PyTorch engineer answer this question? Include code as appropriate and do not mentioned your role in the answer \
                    question: {i['question']}, answer:{i['answer']}"
        
        futures.append(executor.submit(get_qa_openai, context, index))

    for future, (_, row) in tqdm(zip(concurrent.futures.as_completed(futures), df.iterrows()), total=len(df)):
        try:
            qa, ind = future.result()
            questions_ans.append((ind,qa))
        except Exception as exc:
            print(f'generated an exception: {exc}')

  1%|██▎                                                                                                                                                                  | 71/5000 [00:50<28:41,  2.86it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4728 tokens. Please reduce the length of the messages.


 17%|███████████████████████████▌                                                                                                                                        | 839/5000 [10:25<57:44,  1.20it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 30eae6c29b6ef042f1d954015dd11a2f in your message.)


 18%|█████████████████████████████▉                                                                                                                                      | 914/5000 [11:18<21:56,  3.10it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4112 tokens. Please reduce the length of the messages.


 21%|█████████████████████████████████▋                                                                                                                                 | 1032/5000 [12:41<19:42,  3.36it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 5185 tokens. Please reduce the length of the messages.


 21%|█████████████████████████████████▊                                                                                                                                 | 1039/5000 [12:44<22:04,  2.99it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4320 tokens. Please reduce the length of the messages.


 23%|█████████████████████████████████████▏                                                                                                                             | 1141/5000 [13:58<37:40,  1.71it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 902652e2bc423abc323234985d055bac in your message.)


 23%|█████████████████████████████████████▎                                                                                                                           | 1157/5000 [14:13<1:08:00,  1.06s/it]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 3ed6807e81774dc34ec60b708bf3cd35 in your message.)


 24%|██████████████████████████████████████▊                                                                                                                            | 1190/5000 [14:38<37:25,  1.70it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 4fc427cd0fec50ff52a9f482f85a40bb in your message.)


 24%|███████████████████████████████████████▎                                                                                                                           | 1207/5000 [14:48<33:45,  1.87it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4450 tokens. Please reduce the length of the messages.


 31%|██████████████████████████████████████████████████▌                                                                                                                | 1552/5000 [19:11<21:27,  2.68it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 7319 tokens. Please reduce the length of the messages.


 35%|█████████████████████████████████████████████████████████▍                                                                                                         | 1761/5000 [21:39<39:35,  1.36it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 7088 tokens. Please reduce the length of the messages.


 37%|████████████████████████████████████████████████████████████▌                                                                                                      | 1857/5000 [22:55<20:28,  2.56it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4474 tokens. Please reduce the length of the messages.


 39%|██████████████████████████████████████████████████████████████▊                                                                                                    | 1928/5000 [23:49<33:49,  1.51it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID b12f7dd20645d5ba56af9630e1aaf80d in your message.)


 40%|████████████████████████████████████████████████████████████████▊                                                                                                | 2011/5000 [24:58<1:05:05,  1.31s/it]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 18abe07195fd4aa2947087d0a54ba017 in your message.)


 42%|███████████████████████████████████████████████████████████████████▉                                                                                               | 2083/5000 [25:55<19:59,  2.43it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 5134 tokens. Please reduce the length of the messages.


 42%|████████████████████████████████████████████████████████████████████▍                                                                                              | 2098/5000 [26:07<26:44,  1.81it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4299 tokens. Please reduce the length of the messages.


 43%|██████████████████████████████████████████████████████████████████████▌                                                                                            | 2163/5000 [26:54<23:36,  2.00it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 5475 tokens. Please reduce the length of the messages.


 47%|████████████████████████████████████████████████████████████████████████████                                                                                       | 2333/5000 [29:07<19:24,  2.29it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 9153 tokens. Please reduce the length of the messages.


 48%|██████████████████████████████████████████████████████████████████████████████▏                                                                                    | 2397/5000 [29:56<13:58,  3.11it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 5342 tokens. Please reduce the length of the messages.


 48%|██████████████████████████████████████████████████████████████████████████████▎                                                                                    | 2401/5000 [30:00<33:14,  1.30it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 38349496a0fbbbe70d68c3c37932761f in your message.)


 52%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                             | 2615/5000 [32:35<34:53,  1.14it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 5132 tokens. Please reduce the length of the messages.


 53%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                            | 2652/5000 [33:01<25:02,  1.56it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4105 tokens. Please reduce the length of the messages.


 55%|█████████████████████████████████████████████████████████████████████████████████████████▎                                                                         | 2740/5000 [34:15<24:48,  1.52it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 5127 tokens. Please reduce the length of the messages.


 57%|████████████████████████████████████████████████████████████████████████████████████████████▋                                                                      | 2842/5000 [35:38<18:57,  1.90it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4944 tokens. Please reduce the length of the messages.


 58%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 2882/5000 [36:09<14:10,  2.49it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 5889 tokens. Please reduce the length of the messages.


 59%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                   | 2931/5000 [36:51<07:49,  4.41it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4609 tokens. Please reduce the length of the messages.


 59%|████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                  | 2958/5000 [37:08<09:40,  3.52it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 5344 tokens. Please reduce the length of the messages.


 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                             | 3108/5000 [39:08<16:29,  1.91it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4827 tokens. Please reduce the length of the messages.


 65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                                                         | 3255/5000 [41:07<31:49,  1.09s/it]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 5006 tokens. Please reduce the length of the messages.


 65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                        | 3267/5000 [41:17<18:09,  1.59it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 5763 tokens. Please reduce the length of the messages.


 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                       | 3311/5000 [41:48<10:31,  2.67it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 4fb86db03d54dce0f4fd85d6458a9372 in your message.)


 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 3450/5000 [43:45<10:01,  2.58it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 8480 tokens. Please reduce the length of the messages.


 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 3473/5000 [44:03<22:19,  1.14it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 11498 tokens. Please reduce the length of the messages.


 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                | 3521/5000 [44:38<09:28,  2.60it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4543 tokens. Please reduce the length of the messages.
generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4448 tokens. Please reduce the length of the messages.


 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 3537/5000 [44:50<12:40,  1.92it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4731 tokens. Please reduce the length of the messages.


 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                           | 3656/5000 [46:18<19:19,  1.16it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4204 tokens. Please reduce the length of the messages.


 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 3665/5000 [46:25<15:26,  1.44it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 0fb5d3103dc2bb53e79a0ffb99490db3 in your message.)


 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 3691/5000 [46:46<14:26,  1.51it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 6000 tokens. Please reduce the length of the messages.


 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 3747/5000 [47:33<18:06,  1.15it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID b28d856566637b3d2acea6f4019c9747 in your message.)


 76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                       | 3779/5000 [47:59<11:13,  1.81it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 13301 tokens. Please reduce the length of the messages.


 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                   | 3929/5000 [49:46<11:14,  1.59it/s]

generated an exception: HTTP code 502 from API (<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>
)


 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 3935/5000 [49:49<07:46,  2.28it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 5507 tokens. Please reduce the length of the messages.


 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 3943/5000 [49:55<11:41,  1.51it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4151 tokens. Please reduce the length of the messages.


 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 3949/5000 [50:01<15:45,  1.11it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 8978 tokens. Please reduce the length of the messages.


 79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                  | 3954/5000 [50:05<10:36,  1.64it/s]

generated an exception: HTTP code 502 from API (<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>
)


 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 4135/5000 [52:37<09:23,  1.54it/s]

generated an exception: HTTP code 502 from API (<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>
)


 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                           | 4162/5000 [52:52<05:03,  2.77it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4124 tokens. Please reduce the length of the messages.


 84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 4175/5000 [53:04<09:08,  1.50it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4174 tokens. Please reduce the length of the messages.


 86%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 4284/5000 [54:27<07:13,  1.65it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 5287 tokens. Please reduce the length of the messages.


 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 4335/5000 [55:05<05:15,  2.11it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4487 tokens. Please reduce the length of the messages.


 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋              | 4562/5000 [57:50<02:01,  3.61it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 7159 tokens. Please reduce the length of the messages.


 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 4644/5000 [58:55<04:53,  1.21it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4497 tokens. Please reduce the length of the messages.


 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 4714/5000 [59:53<05:31,  1.16s/it]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 9785098eec624cf61051ce4e64ecbf30 in your message.)


 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 4717/5000 [59:53<02:26,  1.94it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 6869 tokens. Please reduce the length of the messages.


 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉         | 4722/5000 [59:55<01:19,  3.51it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 10830 tokens. Please reduce the length of the messages.


 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 4804/5000 [1:01:00<03:11,  1.02it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4647 tokens. Please reduce the length of the messages.


 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 4847/5000 [1:01:36<01:25,  1.79it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID b872cab101ebfd425b86960b7a2ce36b in your message.)


 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 4878/5000 [1:02:03<01:01,  1.97it/s]

generated an exception: This model's maximum context length is 4097 tokens. However, your messages resulted in 4360 tokens. Please reduce the length of the messages.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [1:05:08<00:00,  1.28it/s]

generated an exception: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Thu, 18 May 2023 03:38:59 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9109fad996c561-SEA', 'alt-svc': 'h3=":443"; ma=86400, h3-29=":443"; ma=86400'}





In [11]:
for index, qa in questions_ans:
    df.at[index, 'short_answer'] = qa

In [12]:
df

Unnamed: 0,question,answer,source,short_answer
0,extracting the top-k value-indices from a 1-d ...,as of pull request #496 torch now includes a b...,https://stackoverflow.com/questions/34750268/,An expert PyTorch engineer would suggest using...
1,how to display custom images in tensorboard (e...,it is quite easy to do if you have the image i...,https://stackoverflow.com/questions/38543850/,"As an expert PyTorch engineer, I can suggest a..."
2,python wheels: cp27mu not supported\nI'm tryin...,this is exactly that. \nrecompile python under...,https://stackoverflow.com/questions/41767005/,Code to recompile Python with --enable-unicode...
3,loading torch7 trained models (.t7) in pytorch...,as of pytorch 1.0 torch.utils.serialization is...,https://stackoverflow.com/questions/41861354/,An expert PyTorch engineer would first acknowl...
4,pytorch: how to use dataloaders for custom dat...,"yes, that is possible. just create the objects...",https://stackoverflow.com/questions/41924453/,An expert PyTorch engineer would answer by pro...
...,...,...,...,...
4995,compare two segmentation maps predictions\nI a...,"yes, this is a valid way to implement consiste...",https://stackoverflow.com/questions/67682106/,Regarding the computation of the total_supervi...
4996,difference between dataset and tensordataset i...,the dataset class is an abstract class that is...,https://stackoverflow.com/questions/67683406/,An expert PyTorch engineer would explain that ...
4997,define nn.parameters with a for loop\nI am int...,best way to accomplish this you can accomplish...,https://stackoverflow.com/questions/67689104/,An expert PyTorch engineer would suggest using...
4998,how to integrate a pytorch model into a dynami...,tensorflow or pytorch models can't be directly...,https://stackoverflow.com/questions/67693181/,"As of now, direct integration of PyTorch model..."


In [13]:
df.to_csv('so_5k_with_short_answer.csv')

## data cleaning

In [14]:
df.isna().sum()

question         0
answer           0
source           0
short_answer    60
dtype: int64

In [15]:
df = df.dropna()

In [16]:
df

Unnamed: 0,question,answer,source,short_answer
0,extracting the top-k value-indices from a 1-d ...,as of pull request #496 torch now includes a b...,https://stackoverflow.com/questions/34750268/,An expert PyTorch engineer would suggest using...
1,how to display custom images in tensorboard (e...,it is quite easy to do if you have the image i...,https://stackoverflow.com/questions/38543850/,"As an expert PyTorch engineer, I can suggest a..."
2,python wheels: cp27mu not supported\nI'm tryin...,this is exactly that. \nrecompile python under...,https://stackoverflow.com/questions/41767005/,Code to recompile Python with --enable-unicode...
3,loading torch7 trained models (.t7) in pytorch...,as of pytorch 1.0 torch.utils.serialization is...,https://stackoverflow.com/questions/41861354/,An expert PyTorch engineer would first acknowl...
4,pytorch: how to use dataloaders for custom dat...,"yes, that is possible. just create the objects...",https://stackoverflow.com/questions/41924453/,An expert PyTorch engineer would answer by pro...
...,...,...,...,...
4995,compare two segmentation maps predictions\nI a...,"yes, this is a valid way to implement consiste...",https://stackoverflow.com/questions/67682106/,Regarding the computation of the total_supervi...
4996,difference between dataset and tensordataset i...,the dataset class is an abstract class that is...,https://stackoverflow.com/questions/67683406/,An expert PyTorch engineer would explain that ...
4997,define nn.parameters with a for loop\nI am int...,best way to accomplish this you can accomplish...,https://stackoverflow.com/questions/67689104/,An expert PyTorch engineer would suggest using...
4998,how to integrate a pytorch model into a dynami...,tensorflow or pytorch models can't be directly...,https://stackoverflow.com/questions/67693181/,"As of now, direct integration of PyTorch model..."


In [18]:
df['q_len'] = df['question'].str.len()
df['c_len'] = df['answer'].str.len()
df['a_len'] = df['short_answer'].str.len()
df["total"] = df['q_len'] + df['c_len'] + df['a_len']

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['q_len'] = df['question'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['c_len'] = df['answer'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['a_len'] = df['short_answer'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

Unnamed: 0,question,answer,source,short_answer,q_len,c_len,a_len,total
0,extracting the top-k value-indices from a 1-d ...,as of pull request #496 torch now includes a b...,https://stackoverflow.com/questions/34750268/,An expert PyTorch engineer would suggest using...,359,788,473,1620
1,how to display custom images in tensorboard (e...,it is quite easy to do if you have the image i...,https://stackoverflow.com/questions/38543850/,"As an expert PyTorch engineer, I can suggest a...",599,1020,1095,2714
2,python wheels: cp27mu not supported\nI'm tryin...,this is exactly that. \nrecompile python under...,https://stackoverflow.com/questions/41767005/,Code to recompile Python with --enable-unicode...,2110,113,687,2910
3,loading torch7 trained models (.t7) in pytorch...,as of pytorch 1.0 torch.utils.serialization is...,https://stackoverflow.com/questions/41861354/,An expert PyTorch engineer would first acknowl...,676,516,1112,2304
4,pytorch: how to use dataloaders for custom dat...,"yes, that is possible. just create the objects...",https://stackoverflow.com/questions/41924453/,An expert PyTorch engineer would answer by pro...,298,1443,2517,4258
...,...,...,...,...,...,...,...,...
4995,compare two segmentation maps predictions\nI a...,"yes, this is a valid way to implement consiste...",https://stackoverflow.com/questions/67682106/,Regarding the computation of the total_supervi...,2830,421,194,3445
4996,difference between dataset and tensordataset i...,the dataset class is an abstract class that is...,https://stackoverflow.com/questions/67683406/,An expert PyTorch engineer would explain that ...,251,854,600,1705
4997,define nn.parameters with a for loop\nI am int...,best way to accomplish this you can accomplish...,https://stackoverflow.com/questions/67689104/,An expert PyTorch engineer would suggest using...,1300,723,1019,3042
4998,how to integrate a pytorch model into a dynami...,tensorflow or pytorch models can't be directly...,https://stackoverflow.com/questions/67693181/,"As of now, direct integration of PyTorch model...",1410,914,930,3254


In [34]:
## checking sample data

for i in range(5):
    print("*" * 100)
    print("!!!! Question: ", df.loc[i, "question"])
    print("!!!! Context: ", df.loc[i, "answer"])
    print("!!!! Answer: ", df.loc[i, "short_answer"])


****************************************************************************************************
!!!! Question:  extracting the top-k value-indices from a 1-d tensor
Given a 1-D tensor in Torch (torch.Tensor), containing values which can be compared (say floating point), how can we extract the indices of the top-k values in that tensor?
Apart from the brute-force method, I am looking for some API call, that Torch/lua provides, which can perform this task efficiently.

!!!! Context:  as of pull request #496 torch now includes a built-in api named torch.topk. example:

&gt; t = torch.tensor{9, 1, 8, 2, 7, 3, 6, 4, 5}

-- obtain the 3 smallest elements
&gt; res = t:topk(3)
&gt; print(res)
 1
 2
 3
[torch.doubletensor of size 3]

-- you can also get the indices in addition
&gt; res, ind = t:topk(3)
&gt; print(ind)
 2
 4
 6
[torch.longtensor of size 3]

-- alternatively you can obtain the k largest elements as follow
-- (see the api documentation for more details)
&gt; res = t:topk(3, t

## preparing alpaca lora dataset format

In [29]:

import json
data = []

for index, row in df.iterrows():
    d = { "instruction": row['question'],
          "input": row['answer'],
          "output": row['short_answer']
        }
    
    data.append(d)
    

with open('so_5k_with_short_answer.json', 'w') as f:
    json.dump(data, f)