In [1]:

import pandas as pd
import re
from tqdm import tqdm
from langchain.vectorstores.faiss import FAISS
from langchain.text_splitter import CharacterTextSplitter


In [2]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

EMBED = "hf"
embeddings = HuggingFaceEmbeddings()


In [58]:

def cleanhtml(raw_html):
    CLEANR = re.compile('<.*?>')
    cleantext = re.sub(CLEANR, '', raw_html)
    return cleantext
    
def clean_data(df):
    df["pt_answer"] = df["pt_answer"].apply(lambda x: cleanhtml(x))

    df["question"] = df["pt_title"].str.lower()
    df["answer"] = df["pt_answer"].str.lower()

    df = df[['pt_post_id','question', 'answer']]
    return df
    

## get qa and link to post
def get_url(df):
    url = []
    for index, row in df.iterrows():
        url.append(f"https://stackoverflow.com/questions/{row['pt_post_id']}/")
    
    df['source'] = url

    return df


In [59]:

df = pd.read_csv('/home/ubuntu/pt_question_answers_updated.csv')


In [60]:
df = clean_data(df)

In [61]:
df = get_url(df)

In [62]:
df = df[['question','answer','source']]

In [63]:
df

Unnamed: 0,question,answer,source
0,extracting the top-k value-indices from a 1-d ...,as of pull request #496 torch now includes a b...,https://stackoverflow.com/questions/34750268/
1,how to display custom images in tensorboard (e...,it is quite easy to do if you have the image i...,https://stackoverflow.com/questions/38543850/
2,python wheels: cp27mu not supported,this is exactly that. \nrecompile python under...,https://stackoverflow.com/questions/41767005/
3,loading torch7 trained models (.t7) in pytorch,as of pytorch 1.0 torch.utils.serialization is...,https://stackoverflow.com/questions/41861354/
4,pytorch: how to use dataloaders for custom dat...,"yes, that is possible. just create the objects...",https://stackoverflow.com/questions/41924453/
...,...,...,...
10758,is it possible to perform quantization on dens...,here's how to do this on densenet169 from torc...,https://stackoverflow.com/questions/74612146/
10759,"why when the batch size increased, the epoch t...","as you already noticed, there are many factors...",https://stackoverflow.com/questions/74637151/
10760,why does stablediffusionpipeline return black ...,apparently it is indeed an apple silicon (m1/m...,https://stackoverflow.com/questions/74642594/
10761,locating tags in a string in php (with respect...,i think i've got something. how about this:\nf...,https://stackoverflow.com/questions/74671399/


In [66]:
def split_pages(df):
    splitter = CharacterTextSplitter(separator="\n", chunk_size=2048)
    print('chunking pages into smaller sub-pages')
            
    pages = []

    for index, i in df.iterrows():
        texts = "QUESTION: " + i['question'] + "\nANSWER: " + i['answer']
        meta = {'source':i['source']}
        pages.extend(splitter.create_documents([texts], [meta]))
    pickle.dump(pages, open('so_pages.pkl', 'wb'))

In [67]:
split_pages(df)

Created a chunk of size 2710, which is longer than the specified 2048
Created a chunk of size 8837, which is longer than the specified 2048
Created a chunk of size 2105, which is longer than the specified 2048


chunking pages into smaller sub-pages


Created a chunk of size 2466, which is longer than the specified 2048
Created a chunk of size 2467, which is longer than the specified 2048


In [69]:
pages = pickle.load(open('so_pages.pkl', 'rb'))

docsearch = FAISS.from_documents([pages.pop(0)], embeddings)
i, step = 0, 50
while i<len(pages):
    if i%500==0:
        print(i,'pages done')
    texts = [d.page_content for d in pages[i:i+step]]
    meta = [d.metadata for d in pages[i:i+step]]
    docsearch.add_texts(texts, meta)
    i += step
print(len(pages),'pages done')

0 pages done
500 pages done
1000 pages done
1500 pages done
2000 pages done
2500 pages done
3000 pages done
3500 pages done
4000 pages done
4500 pages done
5000 pages done
5500 pages done
6000 pages done
6500 pages done
7000 pages done
7500 pages done
8000 pages done
8500 pages done
9000 pages done
9500 pages done
10000 pages done
10500 pages done
11000 pages done
11500 pages done
12000 pages done
12002 pages done


In [9]:
docsearch = FAISS.load_local("so_faiss_index", embeddings)

In [10]:
df

Unnamed: 0,question,answer,source
0,extracting the top-k value-indices from a 1-d ...,as of pull request #496 torch now includes a b...,https://stackoverflow.com/questions/34750268/
1,how to display custom images in tensorboard (e...,it is quite easy to do if you have the image i...,https://stackoverflow.com/questions/38543850/
2,python wheels: cp27mu not supported,this is exactly that. \nrecompile python under...,https://stackoverflow.com/questions/41767005/
3,loading torch7 trained models (.t7) in pytorch,as of pytorch 1.0 torch.utils.serialization is...,https://stackoverflow.com/questions/41861354/
4,pytorch: how to use dataloaders for custom dat...,"yes, that is possible. just create the objects...",https://stackoverflow.com/questions/41924453/
...,...,...,...
10758,is it possible to perform quantization on dens...,here's how to do this on densenet169 from torc...,https://stackoverflow.com/questions/74612146/
10759,"why when the batch size increased, the epoch t...","as you already noticed, there are many factors...",https://stackoverflow.com/questions/74637151/
10760,why does stablediffusionpipeline return black ...,apparently it is indeed an apple silicon (m1/m...,https://stackoverflow.com/questions/74642594/
10761,locating tags in a string in php (with respect...,i think i've got something. how about this:\nf...,https://stackoverflow.com/questions/74671399/


In [31]:
context = []

for index, i in tqdm(df.iterrows(),total=len(df)):
    docs = docsearch.similarity_search_with_score(i['question'], k = 2)
    ans = []
    for doc in docs:
        text = doc[0].page_content.split('ANSWER:')[-1].lstrip()
        ans.append("Answer: " + doc[0].page_content.split('ANSWER:')[-1] + "\nmetadata = " + str(doc[0].metadata))
    for item in ans:
        if i['answer'][:20] in item:
            pass
        else:
            ans[0] = 'Answer: ' + i['answer'] + "\nmetadata = {'source': " + str(i['source']) + '}'
    context.append(', '.join(ans))

100%|█████████████████████████████████████| 10763/10763 [03:38<00:00, 49.35it/s]


In [64]:
df['context'] = context

In [65]:
df

Unnamed: 0,question,answer,source,context
0,extracting the top-k value-indices from a 1-d ...,as of pull request #496 torch now includes a b...,https://stackoverflow.com/questions/34750268/,Answer: as of pull request #496 torch now incl...
1,how to display custom images in tensorboard (e...,it is quite easy to do if you have the image i...,https://stackoverflow.com/questions/38543850/,Answer: it is quite easy to do if you have the...
2,python wheels: cp27mu not supported,this is exactly that. \nrecompile python under...,https://stackoverflow.com/questions/41767005/,Answer: this is exactly that. \nrecompile pyth...
3,loading torch7 trained models (.t7) in pytorch,as of pytorch 1.0 torch.utils.serialization is...,https://stackoverflow.com/questions/41861354/,Answer: as of pytorch 1.0 torch.utils.serializ...
4,pytorch: how to use dataloaders for custom dat...,"yes, that is possible. just create the objects...",https://stackoverflow.com/questions/41924453/,"Answer: yes, that is possible. just create the..."
...,...,...,...,...
10758,is it possible to perform quantization on dens...,here's how to do this on densenet169 from torc...,https://stackoverflow.com/questions/74612146/,Answer: here's how to do this on densenet169 f...
10759,"why when the batch size increased, the epoch t...","as you already noticed, there are many factors...",https://stackoverflow.com/questions/74637151/,"Answer: as you already noticed, there are many..."
10760,why does stablediffusionpipeline return black ...,apparently it is indeed an apple silicon (m1/m...,https://stackoverflow.com/questions/74642594/,Answer: apparently it is indeed an apple silic...
10761,locating tags in a string in php (with respect...,i think i've got something. how about this:\nf...,https://stackoverflow.com/questions/74671399/,Answer: i think i've got something. how about ...


In [66]:
df['answer_with_source'] = df['answer'] + "\nmetadata = {'source': " + df['source'] + "}"

In [67]:
df

Unnamed: 0,question,answer,source,context,answer_with_source
0,extracting the top-k value-indices from a 1-d ...,as of pull request #496 torch now includes a b...,https://stackoverflow.com/questions/34750268/,Answer: as of pull request #496 torch now incl...,as of pull request #496 torch now includes a b...
1,how to display custom images in tensorboard (e...,it is quite easy to do if you have the image i...,https://stackoverflow.com/questions/38543850/,Answer: it is quite easy to do if you have the...,it is quite easy to do if you have the image i...
2,python wheels: cp27mu not supported,this is exactly that. \nrecompile python under...,https://stackoverflow.com/questions/41767005/,Answer: this is exactly that. \nrecompile pyth...,this is exactly that. \nrecompile python under...
3,loading torch7 trained models (.t7) in pytorch,as of pytorch 1.0 torch.utils.serialization is...,https://stackoverflow.com/questions/41861354/,Answer: as of pytorch 1.0 torch.utils.serializ...,as of pytorch 1.0 torch.utils.serialization is...
4,pytorch: how to use dataloaders for custom dat...,"yes, that is possible. just create the objects...",https://stackoverflow.com/questions/41924453/,"Answer: yes, that is possible. just create the...","yes, that is possible. just create the objects..."
...,...,...,...,...,...
10758,is it possible to perform quantization on dens...,here's how to do this on densenet169 from torc...,https://stackoverflow.com/questions/74612146/,Answer: here's how to do this on densenet169 f...,here's how to do this on densenet169 from torc...
10759,"why when the batch size increased, the epoch t...","as you already noticed, there are many factors...",https://stackoverflow.com/questions/74637151/,"Answer: as you already noticed, there are many...","as you already noticed, there are many factors..."
10760,why does stablediffusionpipeline return black ...,apparently it is indeed an apple silicon (m1/m...,https://stackoverflow.com/questions/74642594/,Answer: apparently it is indeed an apple silic...,apparently it is indeed an apple silicon (m1/m...
10761,locating tags in a string in php (with respect...,i think i've got something. how about this:\nf...,https://stackoverflow.com/questions/74671399/,Answer: i think i've got something. how about ...,i think i've got something. how about this:\nf...


In [68]:
df.to_csv('so_data_with_context.csv')

In [70]:

import json
data = []

for index, row in df.iterrows():
    d = { "instruction": row['question'][0:2048],
          "input": row['context'][0:1024],
          "output": row['answer_with_source'][0:2048]
        }
    
    data.append(d)
    

with open('so_data_with_context.json', 'w') as f:
    json.dump(data, f)