In [3]:
import requests
import numpy as np

In [4]:
def save_documents(url, categories, medical):
    """Download the member pages of each category and save them as text files.

    For every category title in ``categories`` one ``list=categorymembers``
    query is sent to ``url``; each returned page id is then passed to
    ``get_content`` and the resulting text is written to
    ``Corpora/Medical/<pageid>.txt`` when ``medical`` is truthy, otherwise to
    ``Corpora/NonMedical/<pageid>.txt``.

    Parameters
    ----------
    url : str
        Endpoint of the query API.
    categories : iterable of str
        Category titles, passed verbatim as ``cmtitle``.
    medical : bool
        Selects the destination directory (see above).

    Raises
    ------
    requests.HTTPError
        If the category listing request returns an error status.

    Notes
    -----
    Only one request with ``cmlimit=100`` is made per category, so at most
    that first batch of members is saved.
    NOTE(review): if the API offers a continuation token for larger
    categories, it is not followed here.
    """
    # Local import keeps this cell runnable on its own; the notebook only
    # imports `os` in a later cell.
    import os

    target_dir = "Corpora/Medical" if medical else "Corpora/NonMedical"
    # Create the destination up front so a fresh checkout does not fail on the
    # first write.
    os.makedirs(target_dir, exist_ok=True)

    for c in categories:
        print(c)
        params = {
                'action': 'query',
                'format': 'json',
                'cmtitle': c,
                'cmlimit': '100',
                'cmtype': 'page',
                'list': 'categorymembers',
        }

        req = requests.get(url=url, params=params)
        # Surface HTTP failures directly instead of as a confusing JSON/KeyError.
        req.raise_for_status()
        pages = req.json()["query"]["categorymembers"]

        page_ids = [page["pageid"] for page in pages]
        print(len(page_ids))

        for page_id in page_ids:
            print(f"Scraping page: {page_id}")
            content = get_content(url, page_id)
            # Explicit UTF-8: article text routinely contains non-ASCII
            # characters that the platform default encoding may not handle.
            with open(f"{target_dir}/{page_id}.txt", "w", encoding="utf-8") as file:
                file.write(content)

def get_content(url, id):
    """Return the plain-text extract of a single page.

    Sends a ``prop=extracts`` query with ``explaintext`` enabled to ``url``
    for the page identified by ``id`` and returns the ``extract`` field of
    that page's entry in the response.

    Parameters
    ----------
    url : str
        Endpoint of the query API.
    id : int or str
        Page id, sent as ``pageids`` and used (as a string) to look up the
        page in the response.

    Returns
    -------
    The value stored under ``extract`` for the requested page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    KeyError
        If the response has no ``extract`` for the requested page.
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "pageids": id,
        "explaintext" : "1",
    }

    req = requests.get(url=url, params=params)
    # Fail fast on HTTP errors rather than tripping over an unexpected body.
    req.raise_for_status()

    # The response's "pages" mapping is indexed by the page id as a string.
    return req.json()["query"]["pages"][str(id)]["extract"]
    

In [12]:
def retrieve_documents():
    """Scrape the medical and non-medical corpora from English Wikipedia.

    Calls ``save_documents`` once for the medical category list
    (``medical=True``) and once for the non-medical list (``medical=False``),
    both against the English Wikipedia API endpoint.
    """
    url = 'https://en.wikipedia.org/w/api.php'

    medical_categories = [
        "Category:Bacteriology",
        "Category:Virology",
        "Category:Cancer",
        "Category:Anatomy",
        "Category:Genetics",
        "Category:Pediatrics",
    ]

    non_medical_categories = [
        "Category:Geometry",
        "Category:Literature",
        "Category:Hunting",
        "Category:Politics",
        "Category:Education",
        "Category:Fashion",
    ]

    save_documents(url, medical_categories, medical=True)
    # The non-medical list was previously built but never scraped, leaving the
    # second class of the corpus empty on a fresh run.
    save_documents(url, non_medical_categories, medical=False)

In [13]:
# Run the scrape: performs network requests and writes the pages under Corpora/.
retrieve_documents()

Category:Bacteriology
100
Scraping page: 42766846
Scraping page: 58475
Scraping page: 2644968
Scraping page: 10323278
Scraping page: 6148256
Scraping page: 33821697
Scraping page: 28066105
Scraping page: 8868378
Scraping page: 39653832
Scraping page: 9028799
Scraping page: 53190588
Scraping page: 2580748
Scraping page: 1368466
Scraping page: 4209093
Scraping page: 4460
Scraping page: 40138
Scraping page: 35547268
Scraping page: 17575156
Scraping page: 22938418
Scraping page: 1110607
Scraping page: 16615241
Scraping page: 33821608
Scraping page: 33821528
Scraping page: 67479291
Scraping page: 33821580
Scraping page: 33821481
Scraping page: 33821340
Scraping page: 52791986
Scraping page: 34067938
Scraping page: 47852064
Scraping page: 20815112
Scraping page: 3690837
Scraping page: 43946
Scraping page: 42689156
Scraping page: 61641456
Scraping page: 56209785
Scraping page: 45523355
Scraping page: 42326835
Scraping page: 25162968
Scraping page: 31882943
Scraping page: 4527097
Scraping page

In [9]:
# Populates the test set by moving 20% of the training-set documents into it.
import os
import shutil

def populate_test_set(test_fraction=0.2):
    """Move a random share of the corpus into the test set and record labels.

    Lists ``Corpora/Medical`` and ``Corpora/NonMedical``, draws
    ``int(total * test_fraction)`` distinct documents uniformly at random from
    both directories combined, moves each one into ``Test/TestSet`` and appends
    its label to ``Test/test_labels.txt`` (``1`` for medical, ``0`` for
    non-medical), one line per moved document in move order.

    Parameters
    ----------
    test_fraction : float, optional
        Share of the current corpus to move, expected in ``[0, 1]``.
        Defaults to 0.2.

    Notes
    -----
    * Not idempotent: every call moves a further share of whatever is left in
      the corpus and appends more lines to the labels file.
    * Uses NumPy's global random state; seed it beforehand for a reproducible
      split.
    * NOTE(review): the labels file stores no file names, so the pairing of
      labels to documents relies on the order in which they were moved.
    """
    medical_documents = os.listdir('Corpora/Medical')
    non_medical_documents = os.listdir('Corpora/NonMedical')

    # One flat pool of (source directory, file name, label) entries. Drawing
    # from the pool without replacement can never pick from an exhausted
    # class, unlike choosing a class first with fixed probabilities.
    candidates = [("Corpora/NonMedical", name, "0") for name in non_medical_documents]
    candidates += [("Corpora/Medical", name, "1") for name in medical_documents]

    test_size = int(len(candidates) * test_fraction)
    if test_size == 0:
        # Nothing to move (empty or very small corpus).
        return

    # Make sure the destination exists before the first move.
    os.makedirs("Test/TestSet", exist_ok=True)

    chosen = np.random.choice(len(candidates), size=test_size, replace=False)

    # Open the labels file once; labels are still written right after each move.
    with open("Test/test_labels.txt", "a") as f:
        for index in chosen:
            source_dir, name, label = candidates[index]
            shutil.move(f"{source_dir}/{name}", f"Test/TestSet/{name}")
            f.write(f"{label}\n")

In [14]:
# Build the test split. Not idempotent: each run moves more files out of
# Corpora/ and appends more lines to Test/test_labels.txt.
populate_test_set()