In [1]:
!pip install requests beautifulsoup4 scrapy pandas tqdm openai nltk spacy
!python -m spacy download en_core_web_sm


Collecting openai
  Downloading openai-1.57.1-py3-none-any.whl.metadata (24 kB)
Collecting spacy
  Downloading spacy-3.8.2-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.2-cp311-cp311-win_amd64.whl.metadata (5.3 kB)
Collecting typing-extensions<5,>=4.11 (from openai)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.11-cp311-cp311-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.10-cp311-cp311-win_amd64.whl.metadata (8.6 kB)
Collec

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.30.0 requires protobuf<5,>=3.20, but you have protobuf 5.29.1 which is incompatible.


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 330.3 kB/s eta 0:00:39
     --------------------------------------- 0.1/12.8 MB 469.7 kB/s eta 0:00:28
     --------------------------------------- 0.1/12.8 MB 731.4 kB/s eta 0:00:18
      --------------------------------------- 0.2/12.8 MB 1.0 MB/s eta 0:00:13
     - -------------------------------------- 0.3/12.8 MB 1.5 MB/s eta 0:00:09
     - -------------------------------------- 0.5/12.8 MB 2.0 MB/s eta 0:00:07
     -- ------------------------------------- 0.9/12.8 MB 2.9 MB/s eta 0:00:05
     ---- ----------------------------------- 1.3/12

In [2]:
import requests
import pandas as pd
from tqdm import tqdm

In [None]:
def fetch_wikipedia_articles(category, limit=1000):
    S = requests.Session()
    URL = "https://en.wikipedia.org/w/api.php"

    PARAMS = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": 500,
        "format": "json"
    }

    articles = []
    while True:
        response = S.get(url=URL, params=PARAMS)
        data = response.json()
        members = data.get('query', {}).get('categorymembers', [])
        for member in members:
            if member['title'].startswith("Category:"):
                continue  # Skip subcategories
            articles.append({
                "pid": member['pageid'],
                "title": member['title']
            })
            if len(articles) >= limit:
                break
        if 'continue' in data and len(articles) < limit:
            PARAMS.update(data['continue'])
        else:
            break
        if len(articles) >= limit:
            break

    # Fetch content for each article
    collection = []
    for article in tqdm(articles, desc=f"Fetching articles on {category}"):
        page_id = article['pid']
        title = article['title']
        content = fetch_article_content(page_id)
        if content:
            collection.append({
                "pid": page_id,
                "title": title,
                "passage_text": content
            })
    return collection

def fetch_article_content(page_id):
    S = requests.Session()
    URL = "https://en.wikipedia.org/w/api.php"

    PARAMS = {
        "action": "query",
        "pageids": page_id,
        "prop": "extracts",
        "explaintext": True,
        "format": "json"
    }

    response = S.get(url=URL, params=PARAMS)
    data = response.json()
    pages = data.get('query', {}).get('pages', {})
    page = pages.get(str(page_id), {})
    extract = page.get('extract', "")
    return extract

if __name__ == "__main__":
    categories = ["Neuroscience", "Renaissance", "Deep_learning", "Artificial_intelligence", "Machine_learning"]
    collection = [] 
    for category in categories:
        collection = collection + fetch_wikipedia_articles(category, limit=1000)
    df_collection = pd.DataFrame(collection)
    df_collection.to_csv("collection.tsv", sep="\t", index=False, header=False)
    print("Đã lưu collection.tsv")


Fetching articles on Neuroscience: 100%|██████████| 166/166 [02:10<00:00,  1.27it/s]
Fetching articles on Renaissance: 100%|██████████| 34/34 [00:30<00:00,  1.13it/s]
Fetching articles on Deep_learning: 100%|██████████| 48/48 [00:40<00:00,  1.18it/s]
Fetching articles on Artificial_intelligence: 100%|██████████| 174/174 [02:12<00:00,  1.31it/s]
Fetching articles on Machine_learning: 100%|██████████| 240/240 [03:10<00:00,  1.26it/s]

Đã lưu collection.tsv





In [24]:
df = pd.read_csv('collection.tsv', sep='\t', encoding='utf-8', header=None)
df.columns = ['pid', 'title', 'content']
df2 = pd.read_csv('collection1.tsv', sep='\t', encoding='utf-8', header=None)
df2.columns = ['pid', 'title', 'content']
final_df = pd.concat([df, df2]).reset_index(drop=True).drop_duplicates()
final_df.to_csv('final_collection.tsv', sep='\t')

In [22]:
final_df

Unnamed: 0,pid,title,content
0,61541925,History of artificial neural networks,Artificial neural networks (ANNs) are models c...
1,61547718,Mathematics of artificial neural networks,An artificial neural network (ANN) combines bi...
2,21523,Neural network (machine learning),"In machine learning, a neural network (also ar..."
3,78113392,A logical calculus of the ideas immanent in ne...,A logical calculus of the ideas immanent to ne...
4,65434605,Ablation (artificial intelligence),"In artificial intelligence (AI), particularly ..."
...,...,...,...
4193,1634778,Rough set,"In computer science, a rough set, first descri..."
4194,71867310,Safety and liveness properties,Properties of an execution of a computer progr...
4195,2058995,Scientific community metaphor,"In computer science, the scientific community ..."
4196,77060756,Selman's theorem,"In computability theory, Selman's theorem is a..."


In [39]:
import google.generativeai as genai
import ast

def generate_queries(passage, model_name):
    prompt = f"Generate 5 diverse and meaningful questions based on the following passage:\n\n{passage}\n\n"
    constraint = """ Each question should be no longer than 30 words.
    Return the answer in JSON form
    {'question': str}
    Return list[Question]"""
    final_prompt = prompt + constraint
    model = genai.GenerativeModel(model_name)
    try:
        response = model.generate_content(final_prompt).text
        return response[8:-4]
    except Exception as e:
        print(f"Error generating queries: {e}")
        return []

In [None]:
generate_queries(final_df['content'][5])

In [41]:
import pandas as pd
from tqdm import tqdm
import time

def create_queries(df_collection, model_name):
    queries = []
    for _, row in tqdm(df_collection.iterrows(), total=df_collection.shape[0], desc="Generating queries"):
        passage_text = row['content']
        pid = row['pid']
        generated_questions = generate_queries(passage_text, model_name)
        queries.append({
            "pid": pid,
            "query_text": generated_questions
        })
        time.sleep(3.5)
    return queries

if __name__ == "__main__":
    df_collection = pd.read_csv("final_collection.tsv", sep="\t", index_col=0)
    genai.configure(api_key="AIzaSyD7AzLERO-19fgspE348JCDQVdqoLkowks")
    first_queries = create_queries(df_collection[:1000], 'gemini-1.5-flash-8b')
    second_queries = create_queries(df_collection[1000:2000], 'gemini-1.5-flash')
    queries_2k = first_queries + second_queries
    pd.DataFrame(queries_2k).to_csv('queries.tsv', sep='\t')

Generating queries:  49%|████▉     | 489/1000 [37:49<39:44,  4.67s/it]  

Error generating queries: 429 Resource has been exhausted (e.g. check quota).


Generating queries:  53%|█████▎    | 528/1000 [40:50<35:56,  4.57s/it]

Error generating queries: 429 Resource has been exhausted (e.g. check quota).


Generating queries: 100%|██████████| 1000/1000 [1:17:06<00:00,  4.63s/it]
Generating queries: 100%|██████████| 1000/1000 [1:26:06<00:00,  5.17s/it]


In [None]:
quest_df = pd.DataFrame(queries_2k)

'[\n  {\n    "question": "How did advances in hardware influence the resurgence of ANNs in the 2000s?"\n  },\n  {\n    "question": "What is the significance of the \'AI winter\' period in the development of ANNs?"\n  },\n  {\n    "question": "What early mathematical models inspired the development of ANNs?"\n  },\n  {\n    "question": "How did the perceptron, a fundamental ANN, differ from later ANN architectures?"\n  },\n  {\n    "question": "What is the relationship between biological neural networks and artificial neural networks?"\n  }\n]\n'

In [53]:
import json

def extractQuest(file_path):
    quest_df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
    list_quest = []
    for sample in quest_df['query_text']:
        try:
            list_quest = list_quest + json.loads(sample)
        except Exception as e:
            continue
    return pd.DataFrame(list_quest)

In [56]:
list_quest = extractQuest('queries.tsv')
list_quest
list_quest.to_csv('final_queries.tsv', sep='\t', encoding='utf-8')

In [67]:
df_queries = pd.read_csv("final_collection.tsv", sep="\t", names=['pid', 'title', 'content'], header=0)
df_queries

Unnamed: 0,pid,title,content
0,61541925,History of artificial neural networks,Artificial neural networks (ANNs) are models c...
1,61547718,Mathematics of artificial neural networks,An artificial neural network (ANN) combines bi...
2,21523,Neural network (machine learning),"In machine learning, a neural network (also ar..."
3,78113392,A logical calculus of the ideas immanent in ne...,A logical calculus of the ideas immanent to ne...
4,65434605,Ablation (artificial intelligence),"In artificial intelligence (AI), particularly ..."
...,...,...,...
4193,1634778,Rough set,"In computer science, a rough set, first descri..."
4194,71867310,Safety and liveness properties,Properties of an execution of a computer progr...
4195,2058995,Scientific community metaphor,"In computer science, the scientific community ..."
4196,77060756,Selman's theorem,"In computability theory, Selman's theorem is a..."


In [70]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.5 kB ? eta -:--:--
     ----------------- -------------------- 20.5/43.5 kB 682.7 kB/s eta 0:00:01
     ----------------------------------- -- 41.0/43.5 kB 495.5 kB/s eta 0:00:01
     -------------------------------------- 43.5/43.5 kB 426.9 kB/s eta 0:00:00
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.5.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting sympy==1.13.1 (from torch>=1.11.0->sentence-transformers)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.26.5-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5

In [75]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from tqdm import tqdm

model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_embeddings(texts):
    return model.encode(texts, convert_to_numpy=True)

def create_qrels_fast(df_queries, df_collection, top_k=5):
    query_embeddings = calculate_embeddings(df_queries['query_text'].tolist())
    print('Done q_e')
    passage_embeddings = calculate_embeddings(df_collection['passage_text'].tolist())
    print('Done p_e')
    similarities = cosine_similarity(query_embeddings, passage_embeddings)

    qrels = []
    for qid, row in tqdm(df_queries.iterrows(), total=df_queries.shape[0], desc="Creating qrels"):
        top_indices = np.argsort(-similarities[qid])[:top_k]
        for pid in top_indices:
            qrels.append({
                "qid": df_queries.iloc[qid]['qid'],
                "pid": df_collection.iloc[pid]['pid']
            })
    return pd.DataFrame(qrels)

if __name__ == "__main__":
    # Đọc dữ liệu
    df_queries = pd.read_csv("final_queries.tsv", sep="\t", header=0, names=["qid", "query_text"])
    df_collection = pd.read_csv("final_collection.tsv", sep="\t", header=0, names=["pid", "title", "passage_text"])

    # Tạo qrels nhanh hơn
    df_qrels = create_qrels_fast(df_queries, df_collection, top_k=5)
    df_qrels.to_csv("qrels.tsv", sep="\t", index=False, header=False)
    print("Đã lưu qrels.tsv")



Done q_e
Done p_e


Creating qrels: 100%|██████████| 9975/9975 [00:06<00:00, 1596.76it/s]


Đã lưu qrels.tsv


In [89]:
df_qrels

Unnamed: 0,qid,pid
0,0,61541925
1,0,55569888
2,0,40218456
3,0,32692970
4,0,39773873
...,...,...
49870,9974,72624020
49871,9974,69759774
49872,9974,66104473
49873,9974,75731161


In [118]:
df_queries[df_queries['qid'] == df_qrels['qid'][5]]

Unnamed: 0,qid,query_text
1,1,What is the significance of the 'AI winter' pe...


In [119]:
print(
    df_queries[df_queries['qid'] == df_qrels['qid'][3]]['query_text'][0]
)
print(
    df_collection[df_collection['pid'] == df_qrels['pid'][3]]['title'],
)

How did advances in hardware influence the resurgence of ANNs in the 2000s?
Series([], Name: title, dtype: object)


In [122]:
import random

def create_triples(df_qrels, df_collection, num_negatives=1):
    triples = []
    pid_set = set(df_collection['pid'].tolist())
    for _, row in tqdm(df_qrels.iterrows(), total=df_qrels.shape[0], desc="Creating triples"):
        qid = row['qid']
        positive_pid = row['pid']
        negative_pid = random.choice(list(pid_set - set([positive_pid])))
        triples.append({
            "qid": qid,
            "positive_pid": positive_pid,
            "negative_pid": negative_pid
        })
    return pd.DataFrame(triples)

if __name__ == "__main__":
    df_qrels = pd.read_csv("final_qrels.tsv", sep="\t", header=0, names=["qid", "pid"])
    df_collection = pd.read_csv("final_collection.tsv", sep="\t", header=0, names=["pid", "title", "passage_text"])
    df_triples = create_triples(df_qrels, df_collection, num_negatives=1)
    df_triples.to_csv("qidpidtriples.tsv", sep="\t", index=False, header=False)
    print("Đã lưu qidpidtriples.tsv")


Creating triples: 100%|██████████| 49874/49874 [00:05<00:00, 9210.00it/s]


Đã lưu qidpidtriples.tsv


In [123]:
df_collection

Unnamed: 0,pid,title,passage_text
0,61541925,History of artificial neural networks,Artificial neural networks (ANNs) are models c...
1,61547718,Mathematics of artificial neural networks,An artificial neural network (ANN) combines bi...
2,21523,Neural network (machine learning),"In machine learning, a neural network (also ar..."
3,78113392,A logical calculus of the ideas immanent in ne...,A logical calculus of the ideas immanent to ne...
4,65434605,Ablation (artificial intelligence),"In artificial intelligence (AI), particularly ..."
...,...,...,...
4193,1634778,Rough set,"In computer science, a rough set, first descri..."
4194,71867310,Safety and liveness properties,Properties of an execution of a computer progr...
4195,2058995,Scientific community metaphor,"In computer science, the scientific community ..."
4196,77060756,Selman's theorem,"In computability theory, Selman's theorem is a..."


In [124]:
df_triples.iloc[3,:]

qid                    0
positive_pid    39773873
negative_pid    50218929
Name: 3, dtype: int64

In [125]:
df_collection[df_collection['pid'] == 32692970]

Unnamed: 0,pid,title,passage_text
408,32692970,Recon Instruments,Recon Instruments was a Canadian technology co...


In [132]:
print(
    df_queries[df_queries['qid'] == df_triples['qid'][5]]['query_text']
)
print(
    df_collection[df_collection['pid'] == df_triples['positive_pid'][5]]['title'],
)
print(
    df_collection[df_collection['pid'] == df_triples['negative_pid'][5]]['title'],
)

1    What is the significance of the 'AI winter' pe...
Name: query_text, dtype: object
1404    Elements of AI
Name: title, dtype: object
3058    Chinese kinship
Name: title, dtype: object
