In [3]:
%matplotlib inline
import os
import tarfile as tar
import re
import numpy as np
import math

data_dir = "./data/"
file = tar.open(data_dir + "20news-18828.tar.gz")
members = file.getmembers()

root = members[0].name
groups = ["alt.atheism", "comp.graphics", "sci.space", "talk.religion.misc"]
extract = []
for m in members:
    match = re.search("(?<=/)[a-z.]+(?=/)", m.name)
    if match is not None:
        if match.group(0) in groups:
            extract.append(m)
            
file.extractall(data_dir, extract)

In [4]:
base_dir = data_dir + root + "/"
dirs = os.listdir(base_dir)
labeled_docs = []
for d in dirs:
    files = os.listdir(base_dir + d)
    labeled_doc = []
    for doc in files:
        f = open(base_dir + d + "/" + doc, encoding="latin-1")
        labeled_doc = [f.read(), d]
        labeled_docs.append(labeled_doc)

permuted = np.random.permutation(labeled_docs)

labels = []
docs = []
for ld in permuted:
    labels.append(ld[1])
    docs.append(ld[0])

def strip_header(text):
    _before , _blankline , after = text.partition("\n\n")
    return after

docs = [strip_header(text) for text in docs]
len(docs)

3387

In [5]:
def union(l1, l2):
    return list(set(l1) | set(l2))

def tokenize_docs(docs):
    tokenized_docs = []
    for i, txtline in enumerate(docs):
        l = re.compile(r"(?u)\b\w\w+\b").findall(txtline.lower())
        tokenized_docs.append(l)
    return tokenized_docs

def get_vocabulary(tokenized_docs):
    vocabulary = []
    for td in tokenized_docs:
        vocabulary = union(vocabulary, td)
    return vocabulary

tokenized_docs = tokenize_docs(docs)
vocabulary = get_vocabulary(tokenized_docs)
len(vocabulary)

41777

In [6]:
def get_tokens(tokenized_doc):
    tokens = [0] * len(vocabulary)
    for t in tokenized_doc:
        tokens[vocabulary.index(t)] -= -1
    return tokens

tokens_doc0 = get_tokens(tokenized_docs[0])
len(tokens_doc0)

41777

In [7]:
def add_tokens(t1, t2):
    added = []
    for i, t in enumerate(t1):
        added.append(t + t2[i])
    return added

tokens_doc1 = get_tokens(tokenized_docs[1])
added = add_tokens(tokens_doc0, tokens_doc1)
print("Tokens doc 0 index 288: " + str(tokens_doc0[288]) + "\nTokens doc 1 index 288: " + str(tokens_doc1[288]))
print("Sum: " + str(added[288]))

Tokens doc 0 index 288: 0
Tokens doc 1 index 288: 0
Sum: 0


In [8]:
def get_class_tokens(c, labels, tokenized_docs, vocabulary):
    class_tokens = [0] * len(vocabulary)
    indices = [i for i, x in enumerate(labels) if x == c]
    for i in indices:
        class_tokens = add_tokens(class_tokens, get_tokens(tokenized_docs[i]))
    return class_tokens

def docs_in_class(c):
    return labels.count(c)

def get_docs_of_class(c, labels, docs):
    c_docs = []
    indices = [i for i, x in enumerate(labels) if x == c]
    for i in indices:
        c_docs.append(docs[i])
    return c_docs

def train_MNNB(classes, class_tokens, cnt_docs):
    prior = []
    cond_prob = []
    for i, c in enumerate(classes):
        prior.append(docs_in_class(c) / cnt_docs)
        class_cond_prob = []
        for t in class_tokens[i]:
            class_cond_prob.append((t + 1) / (sum(class_tokens[i]) + len(vocabulary)))
        cond_prob.append(class_cond_prob)
    return prior, cond_prob

cut = int(len(docs) * 0.6)
train_labels = labels[:cut]
train_tokenized_docs = tokenized_docs[:cut]
class_tokens = []
for c in groups:
    class_tokens.append(get_class_tokens(c, train_labels, train_tokenized_docs, vocabulary))

prior, cond_prob = train_MNNB(groups, class_tokens, len(train_tokenized_docs))

In [9]:
prior

[0.39320866141732286,
 0.47883858267716534,
 0.48572834645669294,
 0.3090551181102362]

In [10]:
def apply_MNNB(classes, doc, prior, cond_prob):
    scores = []
    score = 0
    for i, c in enumerate(classes):
        score = math.log(prior[i])
        for t in doc:
            score += math.log(cond_prob[i][vocabulary.index(t)])
        scores.append(score)
    return scores

In [11]:
train_correct = 0
for i, doc in enumerate(train_tokenized_docs):
    scores = apply_MNNB(groups, doc, prior, cond_prob)
    if groups[scores.index(max(scores))] == train_labels[i]:
        train_correct -= -1

train_correct / len(train_tokenized_docs)

0.9758858267716536

In [12]:
test_labels = labels[cut:]
test_tokenized_docs = tokenized_docs[cut:]
test_correct = 0
for i, doc in enumerate(test_tokenized_docs):
    scores = apply_MNNB(groups, doc, prior, cond_prob)
    print("Classified: " + groups[scores.index(max(scores))] + " / Labeled: " + test_labels[i])
    if groups[scores.index(max(scores))] == test_labels[i]:
        test_correct -= -1

test_correct / len(test_tokenized_docs)

Classified: comp.graphics / Labeled: comp.graphics
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: comp.graphics / Labeled: comp.graphics
Classified: comp.graphics / Labeled: comp.graphics
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: comp.graphics
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: alt.atheism / Labeled: alt.atheism
Classified: comp.graphics / Labeled: comp.graphics
Classified: comp.graphics / Labeled: comp.graphics
Classified: alt.atheism / Labeled: comp.graphics
Classified: alt.atheism / Labeled: alt.atheism
Classified: talk.religion.misc / Labeled: alt.atheism
Classified: alt.atheism / Labeled: comp.graphics
Classified: talk.religion.misc / Labeled: alt.atheism
Classified: alt.atheism / Labeled: alt.atheism
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.spac

Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: talk.religion.misc / Labeled: alt.atheism
Classified: comp.graphics / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: comp.graphics / Labeled: comp.graphics
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labeled: alt.atheism
Classified: comp.graphics / Labeled: comp.graphics
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: alt.atheism / Labeled: talk.religion.misc
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: alt.athe

Classified: comp.graphics / Labeled: comp.graphics
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labeled: alt.atheism
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: talk.religion.misc
Classified: sci.space / Labeled: sci.space
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: comp.graphics
Classified: alt.atheism / Labeled: alt.atheism
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: sci.space / Labeled: comp.graphics
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: comp.graphics
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: alt.atheism / Labeled: alt.atheism
Classified: sci.space / Labeled: sc

Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: talk.religion.misc
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: alt.atheism / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: comp.graphics / Labeled: comp.graphics
Classified: alt.atheism / Labeled: alt.atheism
Classified: comp.graphics / Labeled: comp.graphics
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified

Classified: comp.graphics / Labeled: comp.graphics
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labeled: alt.atheism
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: sci.space / Labeled: sci.space
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: alt.atheism / Labeled: alt.atheism
Classified: comp.graphics / Labeled: comp.graphics
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: alt.atheism / Labeled: alt.atheism
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: alt.atheism / Labeled: alt.atheism
Classified: comp.graphics / Labeled: comp.graphics
Classified: alt.atheism / Labeled: talk.religion.misc
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labe

Classified: comp.graphics / Labeled: comp.graphics
Classified: alt.atheism / Labeled: alt.atheism
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: sci.space / Labeled: sci.space
Classified: comp.graphics / Labeled: comp.graphics
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labeled: alt.atheism
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labeled: alt.atheism
Classified: talk.religion.misc / Labeled: comp.graphics
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Cl

Classified: alt.atheism / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: comp.graphics
Classified: comp.graphics / Labeled: comp.graphics
Classified: alt.atheism / Labeled: talk.religion.misc
Classified: alt.atheism / Labeled: alt.atheism
Classified: sci.space / Labeled: sci.space
Classified: comp.graphics / Labeled: comp.graphics
Classified: alt.atheism / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: comp.graphics
Classified: comp.graphics / Labeled: comp.graphics
Cla

Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labeled: talk.religion.misc
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labeled: alt.atheism
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: comp.graphics / Labeled: comp.graphics
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: comp.graphics / Labeled: comp.graphics
Classified: alt.atheism /

Classified: sci.space / Labeled: alt.atheism
Classified: alt.atheism / Labeled: talk.religion.misc
Classified: alt.atheism / Labeled: alt.atheism
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: alt.atheism / Labeled: alt.atheism
Classified: alt.atheism / Labeled: alt.atheism
Classified: comp.graphics / Labeled: comp.graphics
Classified: alt.atheism / Labeled: alt.atheism
Classified: comp.graphics / Labeled: comp.graphics
Classified: sci.space / Labeled: sci.space
Classified: alt.atheism / Labeled: alt.atheism
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: sci.space / Labeled: sci.space
Classified: talk.religion.misc / Labeled: talk.religion.misc
Classified: alt.atheism / Labeled: alt.atheism
Classified: sci.space

0.9136531365313653