In [10]:
import requests
import numpy as np

In [2]:
def save_documents(url, categories, medical): 
    for c in categories:
        print(c)
        params = {
                'action': 'query',
                'format': 'json',
                'cmtitle': c,
                'cmlimit': '100',
                'cmtype': 'page',
                'list': 'categorymembers',
        }
        
        req = requests.get(url=url, params=params)
        pages = req.json()["query"]["categorymembers"]
        
        page_ids = [page["pageid"] for page in pages]
        print(len(page_ids))
        
        for id in page_ids:
            print(f"Scraping page: {id}")
            content = get_content(url, id)
            filename = f"Corpora/Medical/{id}.txt" if medical else f"Corpora/NonMedical/{id}.txt"
            with open(filename, "w") as file:
                file.write(content)

def get_content(url, id):
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "pageids": id,
        "explaintext" : "1",
    }
            
    req = requests.get(url=url, params=params)
    content = req.json()["query"]["pages"][str(id)]["extract"]
    
    return content
    

In [5]:
def retrieve_documents():
    url = 'https://en.wikipedia.org/w/api.php'
    
    medical_categories = [
        "Category:Bacteriology",
        "Category:Virology",
        "Category:Cancer",
        "Category:Anatomy",
        "Category:Genetics",
        "Category:Pediatrics",
    ]
    
    non_medical_categories = [
        "Category:Geometry",
        "Category:Literature",
        "Category:Hunting",
        "Category:Politics",
        "Category:Education",
        "Category:Fashion",
    ]
    
    save_documents(url, non_medical_categories, medical=False)

In [6]:
retrieve_documents()

Category:Geometry
100
Scraping page: 18973446
Scraping page: 14201348
Scraping page: 350906
Scraping page: 6473626
Scraping page: 19374248
Scraping page: 11953
Scraping page: 1359832
Scraping page: 15584482
Scraping page: 501447
Scraping page: 62891333
Scraping page: 55866076
Scraping page: 46587298
Scraping page: 2854628
Scraping page: 40564571
Scraping page: 28177884
Scraping page: 47434936
Scraping page: 41452559
Scraping page: 40054744
Scraping page: 19697484
Scraping page: 52777879
Scraping page: 52657328
Scraping page: 49342572
Scraping page: 50529465
Scraping page: 59777041
Scraping page: 63934091
Scraping page: 18583225
Scraping page: 2850640
Scraping page: 491964
Scraping page: 62338906
Scraping page: 273329
Scraping page: 21697672
Scraping page: 297947
Scraping page: 542587
Scraping page: 36617985
Scraping page: 7975294
Scraping page: 5381096
Scraping page: 48520204
Scraping page: 23429522
Scraping page: 3407326
Scraping page: 52080279
Scraping page: 45197932
Scraping page: 1

In [26]:
import random
import math
import os
import shutil

def populate_test_set():
    # count how many documents are in the medical and non-medical training sets
    medical_documents = os.listdir('Corpora/Medical')
    non_medical_documents = os.listdir('Corpora/NonMedical')
    
    count_medical = len(os.listdir('Corpora/Medical'))   # n_documents_non_medical
    count_non_medical = len(os.listdir('Corpora/NonMedical'))      # n_documents_medical
    
    n_documents = count_medical + count_non_medical # n_total_documents
    
    # compute the probabilities of having a medical or a non-medical document by n_documents_category / n_total_documents
    prob_medical = count_medical / n_documents
    prob_non_medical = count_non_medical / n_documents
    
    n_documents_training = math.ceil(n_documents * 0.2)     # number of documents in the training set
    n_documents_medical = int(n_documents_training * prob_medical)  # number of medical documents in the training set
    n_documents_non_medical = int(n_documents_training * prob_non_medical)  # number of non-medical documents in the training set
    
    # sample medical and non-medical documents, each with the correct number
    extracted_medical_documents = random.sample(medical_documents, n_documents_medical)
    print(f"number of medical documents: {len(extracted_medical_documents)}")
    extracted_non_medical_documents = random.sample(non_medical_documents, n_documents_non_medical)
    print(f"number of non medical documents: {len(extracted_non_medical_documents)}")

    # merge the two populations and shuffle them
    extracted_documents = extracted_medical_documents + extracted_non_medical_documents
    random.shuffle(extracted_documents)
        
    # move the documents in the Test Set
    for d in extracted_documents:    
        if d in extracted_medical_documents:
            shutil.move(f"Corpora/Medical/{d}", f"Test/TestSet/{d}")
            with open("Test/test_labels.txt", "a") as f:
                    f.write("1\n")
        else:
            shutil.move(f"Corpora/NonMedical/{d}", f"Test/TestSet/{d}")
            with open("Test/test_labels.txt", "a") as f:
                    f.write("0\n")   

In [27]:
populate_test_set()

number of medical documents: 117
number of non medical documents: 88
