In [71]:
import requests
import numpy as np
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import glob

In [72]:
def save_documents(url, categories, medical): 
    for c in categories:
        print(c)
        params = {
                'action': 'query',
                'format': 'json',
                'cmtitle': c,
                'cmlimit': '100',
                'cmtype': 'page',
                'list': 'categorymembers',
        }
        
        req = requests.get(url=url, params=params)
        pages = req.json()["query"]["categorymembers"]
        
        page_ids = [page["pageid"] for page in pages]
        
        for id in page_ids:
            print(f"Scraping page: {id}")
            content = get_content(url, id)
            filename = f"Corpora/Medical/{id}.txt" if medical else f"Corpora/NonMedical/{id}.txt"
            with open(filename, "w") as file:
                file.write(content)

def get_content(url, id):
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "pageids": id,
        "explaintext" : "1",
    }
            
    req = requests.get(url=url, params=params)
    content = req.json()["query"]["pages"][str(id)]["extract"]
    
    return content
    

In [96]:
def retrieve_documents():
    url = 'https://en.wikipedia.org/w/api.php'
    
    medical_categories = [
        "Category:Bacteriology",
        "Category:Virology",
        "Category:Cancer",
        "Category:Anatomy",
        "Category:Genetics",
        "Category:Pediatrics",
    ]
    
    non_medical_categories = [
        "Category:Culture",
        "Category:Literature",
        "Category:Hunting",
        "Category:Politics",
        "Category:Fashion"
        "Category:Architecture"
    ]
    
    save_documents(url, non_medical_categories, medical=False)

In [109]:
def bag_of_words(path):
    BoW = {}
    
    files = sorted(glob.glob(path))
    
    for file in files: 
        with open(file, "r") as f: 
            data = f.read()
            
            current_bag_of_words = normalize(data)
            
            # merge this bag of word into the category bag_of_word
            for word, count in current_bag_of_words.items():
                if word not in BoW.keys():
                    BoW.update({word:count})
                else:
                    BoW.update({word:(BoW.get(word) + count)})
    
    return BoW

In [110]:
def normalize(data):
    """
    Normalizes a file by means of tokenization, stemming, stopwords elimination, returning its representation as a Bag of Words
    
    :param data: the file we want to normalize represented as a string
    :return: the bag of words representation of the input file  
    """
    
    stemmer = PorterStemmer()
    stopwords_list = stopwords.words('english')
    tokenizer = WordPunctTokenizer()
    
    file_bag_of_words = {}
    
    tokens = tokenizer.tokenize(data)   # tokenization
    
    for token in tokens:    # for each token check that it is significant (not a stopword and longer than 3)
        if token not in stopwords_list and len(token) > 3:
            stem = stemmer.stem(word=token, to_lowercase=True)  # stemming
            
            # the string '== Section Name ==' is used to divide sections, don't want to include this tokens
            if '=' not in stem:
                if stem not in file_bag_of_words:
                    file_bag_of_words.update({stem: 1})
                else:
                    value = file_bag_of_words.get(stem)
                    file_bag_of_words.update({stem: value+1})
    
    return file_bag_of_words

In [111]:
def classify(path, vocab: dict, medical_bag: dict, non_medical_bag: dict, medical_prob, non_medical_prob):
    files = glob.glob(path)
    
    labels = []
    
    for file in files:
        likelihoods = [non_medical_prob, medical_prob]
        with open(file, "r") as f: 
            data = f.read()
             
            file_BoW = normalize(data)
            
            # actual classification

            for word in file_BoW:
                if word in medical_bag:
                    likelihoods[1] += np.log(medical_bag.get(word) / vocab.get(word))
                if word in non_medical_bag:
                    likelihoods[0] += np.log(non_medical_bag.get(word) / vocab.get(word))
                
            
        labels.append(np.argmax(likelihoods))
    
    return labels       

In [112]:
def vocabulary(medical_BoW: dict, non_medical_BoW: dict):
    vocab = {}
    for word, count in medical_BoW.items():
        vocab.update({word:count})
    
    for word, count in non_medical_BoW.items():
        if word not in vocab.keys():
            vocab.update({word:count})
        else:
            vocab.update({word:vocab.get(word) + count})
        
    return vocab

In [97]:
retrieve_documents()

Category:Culture
Scraping page: 19159508
Scraping page: 24723521
Scraping page: 72135653
Scraping page: 74649989
Scraping page: 18290472
Scraping page: 29560452
Scraping page: 53169305
Scraping page: 505730
Scraping page: 43569192
Scraping page: 6258
Scraping page: 590768
Scraping page: 25147220
Scraping page: 18964621
Scraping page: 30963584
Scraping page: 54004404
Scraping page: 50693529
Scraping page: 67233436
Scraping page: 164660
Scraping page: 69411572
Scraping page: 4543340
Scraping page: 66428540
Scraping page: 2036118
Scraping page: 57165694
Scraping page: 9057549
Scraping page: 5903
Scraping page: 30487581
Scraping page: 14690776
Scraping page: 9020225
Scraping page: 13144407
Scraping page: 33301100
Scraping page: 9216811
Scraping page: 7745490
Scraping page: 12593785
Scraping page: 143364
Scraping page: 1654632
Scraping page: 42730418
Scraping page: 32962014
Scraping page: 60852572
Scraping page: 12401182
Scraping page: 62379378
Scraping page: 323912
Scraping page: 13775689


In [113]:
medical_bag_of_words = bag_of_words('Corpora/Medical/*.txt')
non_medical_bag_of_words = bag_of_words('Corpora/NonMedical/*.txt')

medical_prior = 584 / 997
non_medical_prior = 413 / 997

vocabulary = vocabulary(medical_bag_of_words, non_medical_bag_of_words)

In [115]:
predicted_labels = classify(path='Test/TestSet/*.txt', medical_bag=medical_bag_of_words, 
                            non_medical_bag=non_medical_bag_of_words, vocab=vocabulary,
                            medical_prob=medical_prior, non_medical_prob=non_medical_prior)

true_labels = []
with open('Test/test_labels.txt', 'r') as f:
    lines = f.readlines()
    
    for line in lines:
        true_labels.append(eval(line))

correct = 0
for i in range(len(predicted_labels)):
    if predicted_labels[i] == true_labels[i]:
        correct += 1

print(f"The total number of correct labels is: {correct}, which yields an accuracy of {correct/len(predicted_labels)}")

The total number of correct labels is: 191, which yields an accuracy of 0.9597989949748744


In [107]:
# popoulates the test set subtrating the 20% of training set elements
import os
import shutil

counts = [0, 0]
medical_documents = os.listdir('Corpora/Medical')
non_medical_documents = os.listdir('Corpora/NonMedical')

counts[0] = len(os.listdir('Corpora/NonMedical'))
counts[1] = len(os.listdir('Corpora/Medical'))

numer_of_documents = np.sum(counts)
probabilities = [counts[0]/numer_of_documents, counts[1]/numer_of_documents]

for _ in range(int(numer_of_documents*0.2)):
    medical = np.random.choice([False, True], p=probabilities)
    
    document_index = None
    
    if medical:
        document_index = np.random.randint(0, counts[1])
    else:
        document_index = np.random.randint(0, counts[0])
        
    if medical:
        shutil.move(f"Corpora/Medical/{medical_documents[document_index]}", f"Test/TestSet/{medical_documents[document_index]}")
        with open("Test/test_labels.txt", "a") as f:
            f.write("1\n")
        del medical_documents[document_index]
        counts[1] -= 1
    else:
        shutil.move(f"Corpora/NonMedical/{non_medical_documents[document_index]}", f"Test/TestSet/{non_medical_documents[document_index]}")
        with open("Test/test_labels.txt", "a") as f:
            f.write("0\n")
        del non_medical_documents[document_index]
        counts[0] -= 1